/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * LDoms virtual disk client (vdc) device driver
 *
 * This driver runs on a guest logical domain and communicates with the virtual
 * disk server (vds) driver running on the service domain which is exporting
 * virtualized "disks" to the guest logical domain.
 *
 * The driver can be divided into four sections:
 *
 * 1) generic device driver housekeeping
 *	_init, _fini, attach, detach, ops structures, etc.
 *
 * 2) communication channel setup
 *	Setup the communications link over the LDC channel that vdc uses to
 *	talk to the vDisk server. Initialise the descriptor ring which
 *	allows the LDC clients to transfer data via memory mappings.
 *
 * 3) Support exported to upper layers (filesystems, etc)
 *	The upper layers call into vdc via strategy(9E) and DKIO(7I)
 *	ioctl calls. vdc copies the data to be written to the descriptor
 *	ring or maps the buffer to store the data read by the vDisk
 *	server into the descriptor ring. It then sends a message to the
 *	vDisk server requesting it to complete the operation.
 *
 * 4) Handling responses from the vDisk server.
 *	The vDisk server will ACK some or all of the messages vdc sends to it
 *	(this is configured during the handshake). Upon receipt of an ACK
 *	vdc will check the descriptor ring and signal to the upper layer
 *	code waiting on the IO.
 */
#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kstat.h>
#include <sys/mach_descrip.h>
#include <sys/modctl.h>
#include <sys/mdeg.h>
#include <sys/note.h>
#include <sys/open.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/vtoc.h>
#include <sys/archsystm.h>
#include <sys/sysmacros.h>

#include <sys/cdio.h>
#include <sys/dktp/fdisk.h>
#include <sys/dktp/dadkio.h>
#include <sys/mhd.h>
#include <sys/scsi/generic/sense.h>
#include <sys/scsi/impl/uscsi.h>
#include <sys/scsi/impl/services.h>
#include <sys/scsi/targets/sddef.h>

#include <sys/ldoms.h>
#include <sys/ldc.h>
#include <sys/vio_common.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_util.h>
#include <sys/vdsk_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdc.h>

#define	VD_OLDVTOC_LIMIT	0x7fffffff

/*
 * function prototypes
 */

/* standard driver functions */
static int	vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred);
static int	vdc_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int	vdc_strategy(struct buf *buf);
static int	vdc_print(dev_t dev, char *str);
static int	vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk);
static int	vdc_read(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_write(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
		    cred_t *credp, int *rvalp);
static int	vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred);
static int	vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred);

static int	vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
		    void *arg, void **resultp);
static int	vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int	vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int	vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
		    int mod_flags, char *name, caddr_t valuep, int *lengthp);

/* setup */
static void	vdc_min(struct buf *bufp);
static int	vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen);
static int	vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr);
static int	vdc_start_ldc_connection(vdc_t *vdc);
static int	vdc_create_device_nodes(vdc_t *vdc);
static int	vdc_create_device_nodes_efi(vdc_t *vdc);
static int	vdc_create_device_nodes_vtoc(vdc_t *vdc);
static void	vdc_create_io_kstats(vdc_t *vdc);
static void	vdc_create_err_kstats(vdc_t *vdc);
static void	vdc_set_err_kstats(vdc_t *vdc);
static int	vdc_get_md_node(dev_info_t *dip, md_t **mdpp,
		    mde_cookie_t *vd_nodep);
static int	vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep);
static void	vdc_fini_ports(vdc_t *vdc);
static void	vdc_switch_server(vdc_t *vdcp);
static int	vdc_do_ldc_up(vdc_t *vdc);
static void	vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr);
static int	vdc_init_descriptor_ring(vdc_t *vdc);
static void	vdc_destroy_descriptor_ring(vdc_t *vdc);
static int	vdc_setup_devid(vdc_t *vdc);
static void	vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *);
static void	vdc_store_label_vtoc(vdc_t *, struct dk_geom *,
		    struct extvtoc *);
static void	vdc_store_label_unk(vdc_t *vdc);
static boolean_t vdc_is_opened(vdc_t *vdc);
static void	vdc_update_size(vdc_t *vdc, size_t, size_t, size_t);

/* handshake with vds */
static int	vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver);
static int	vdc_ver_negotiation(vdc_t *vdcp);
static int	vdc_init_attr_negotiation(vdc_t *vdc);
static int	vdc_attr_negotiation(vdc_t *vdcp);
static int	vdc_init_dring_negotiate(vdc_t *vdc);
static int	vdc_dring_negotiation(vdc_t *vdcp);
static int	vdc_send_rdx(vdc_t *vdcp);
static int	vdc_rdx_exchange(vdc_t *vdcp);
static boolean_t vdc_is_supported_version(vio_ver_msg_t *ver_msg);

/* processing incoming messages from vDisk server */
static void	vdc_process_msg_thread(vdc_t *vdc);
static int	vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp);

static uint_t	vdc_handle_cb(uint64_t event, caddr_t arg);
static int	vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg);
static int	vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg);
static int	vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg);
static int	vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg);
static int	vdc_send_request(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_map_to_shared_dring(vdc_t *vdcp, int idx);
static int	vdc_populate_descriptor(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr,
		    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
		    void *cb_arg, vio_desc_direction_t dir, boolean_t);

static int	vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp);
static int	vdc_drain_response(vdc_t *vdcp, struct buf *buf);
static int	vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
static int	vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep);
static int	vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg);

/* dkio */
static int	vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode,
		    int *rvalp);
static int	vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg);
static void	vdc_create_fake_geometry(vdc_t *vdc);
static int	vdc_validate_geometry(vdc_t *vdc);
static void	vdc_validate(vdc_t *vdc);
static void	vdc_validate_task(void *arg);
static int	vdc_null_copy_func(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_extvtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_extvtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static void	vdc_ownership_update(vdc_t *vdc, int ownership_flags);
static int	vdc_access_set(vdc_t *vdc, uint64_t flags, int mode);
static vdc_io_t	*vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf);
static int	vdc_failfast_check_resv(vdc_t *vdc);

/*
 * Module variables
 */

/*
 * Tunable variables to control how long vdc waits before timing out on
 * various operations
 */
static int	vdc_hshake_retries = 3;

static int	vdc_timeout = 0;	/* units: seconds */
static int	vdc_ldcup_timeout = 1;	/* units: seconds */

static uint64_t	vdc_hz_min_ldc_delay;
static uint64_t	vdc_min_timeout_ldc = 1 * MILLISEC;
static uint64_t	vdc_hz_max_ldc_delay;
static uint64_t	vdc_max_timeout_ldc = 100 * MILLISEC;

static uint64_t	vdc_ldc_read_init_delay = 1 * MILLISEC;
static uint64_t	vdc_ldc_read_max_delay = 100 * MILLISEC;

/* values for dumping - need to run in a tighter loop */
static uint64_t	vdc_usec_timeout_dump = 100 * MILLISEC;	/* 0.1s units: ns */
static int	vdc_dump_retries = 100;

static uint16_t	vdc_scsi_timeout = 60;	/* 60s units: seconds */

static uint64_t	vdc_ownership_delay = 6 * MICROSEC;	/* 6s units: usec */

/* Count of the number of vdc instances attached */
static volatile uint32_t	vdc_instance_count = 0;

/* Tunable to log all SCSI errors */
static boolean_t vdc_scsi_log_error = B_FALSE;

/* Soft state pointer */
static void	*vdc_state;

/*
 * Controlling the verbosity of the error/debug messages
 *
 * vdc_msglevel - controls level of messages
 * vdc_matchinst - 64-bit variable where each bit corresponds
 *                 to the vdc instance the vdc_msglevel applies.
 */
int		vdc_msglevel = 0x0;
uint64_t	vdc_matchinst = 0ull;

/*
 * Supported vDisk protocol version pairs.
 *
 * The first array entry is the latest and preferred version.
 */
static const vio_ver_t	vdc_version[] = {{1, 1}};

static struct cb_ops vdc_cb_ops = {
	vdc_open,	/* cb_open */
	vdc_close,	/* cb_close */
	vdc_strategy,	/* cb_strategy */
	vdc_print,	/* cb_print */
	vdc_dump,	/* cb_dump */
	vdc_read,	/* cb_read */
	vdc_write,	/* cb_write */
	vdc_ioctl,	/* cb_ioctl */
	nodev,		/* cb_devmap */
	nodev,		/* cb_mmap */
	nodev,		/* cb_segmap */
	nochpoll,	/* cb_chpoll */
	vdc_prop_op,	/* cb_prop_op */
	NULL,		/* cb_str */
	D_MP | D_64BIT,	/* cb_flag */
	CB_REV,		/* cb_rev */
	vdc_aread,	/* cb_aread */
	vdc_awrite	/* cb_awrite */
};

static struct dev_ops vdc_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	vdc_getinfo,	/* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	vdc_attach,	/* devo_attach */
	vdc_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	&vdc_cb_ops,	/* devo_cb_ops */
	NULL,		/* devo_bus_ops */
	nulldev		/* devo_power */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"virtual disk client",
	&vdc_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

/* -------------------------------------------------------------------------- */

/*
 * Device Driver housekeeping and setup
 */

int
_init(void)
{
	int	status;

	if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0)
		return (status);
	if ((status = mod_install(&modlinkage)) != 0)
		ddi_soft_state_fini(&vdc_state);
	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int	status;

	if ((status = mod_remove(&modlinkage)) != 0)
		return (status);
	ddi_soft_state_fini(&vdc_state);
	return (0);
}

static int
vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
{
	_NOTE(ARGUNUSED(dip))

	int	instance = VDCUNIT((dev_t)arg);
	vdc_t	*vdc = NULL;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
			*resultp = NULL;
			return (DDI_FAILURE);
		}
		*resultp = vdc->dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*resultp = (void *)(uintptr_t)instance;
		return (DDI_SUCCESS);
	default:
		*resultp = NULL;
		return (DDI_FAILURE);
	}
}
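
/*
 * Function:
 *	vdc_detach()
 *
 * Description:
 *	detach(9E) entry point. Fails if any slice of the virtual disk is
 *	still open or if DKIO flushes or validate requests are outstanding;
 *	otherwise it releases disk ownership, stops the message processing
 *	thread and tears down the LDC state, kstats, minor nodes and locks
 *	set up by attach.
 */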
static int
vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	kt_did_t	failfast_tid, ownership_tid;
	int		instance;
	int		rv;
	vdc_server_t	*srvr;
	vdc_t		*vdc = NULL;

	switch (cmd) {
	case DDI_DETACH:
		/* the real work happens below */
		break;
	case DDI_SUSPEND:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	ASSERT(cmd == DDI_DETACH);
	instance = ddi_get_instance(dip);
	DMSGX(1, "[%d] Entered\n", instance);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * This function is called when vdc is detached or if it has failed to
	 * attach. In that case, the attach may have failed before the vdisk
	 * type has been set so we can't call vdc_is_opened(). However, as the
	 * attach has failed, we know that the vdisk is not opened and we can
	 * safely detach.
	 */
	if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) {
		DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
		return (DDI_FAILURE);
	}

	if (vdc->dkio_flush_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding DKIO flushes\n",
		    instance, vdc->dkio_flush_pending);
		return (DDI_FAILURE);
	}

	if (vdc->validate_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding validate requests\n",
		    instance, vdc->validate_pending);
		return (DDI_FAILURE);
	}

	DMSG(vdc, 0, "[%d] proceeding...\n", instance);

	/* If we took ownership, release ownership */
	mutex_enter(&vdc->ownership_lock);
	if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
	}
	mutex_exit(&vdc->ownership_lock);

	/* mark instance as detaching */
	vdc->lifecycle	= VDC_LC_DETACHING;

	/*
	 * Try and disable callbacks to prevent another handshake. We have to
	 * disable callbacks for all servers.
	 */
	for (srvr = vdc->server_list; srvr != NULL; srvr = srvr->next) {
		rv = ldc_set_cb_mode(srvr->ldc_handle, LDC_CB_DISABLE);
		DMSG(vdc, 0, "callback disabled (ldc=%lu, rv=%d)\n",
		    srvr->ldc_id, rv);
	}

	if (vdc->initialized & VDC_THREAD) {
		mutex_enter(&vdc->read_lock);
		if ((vdc->read_state == VDC_READ_WAITING) ||
		    (vdc->read_state == VDC_READ_RESET)) {
			vdc->read_state = VDC_READ_RESET;
			cv_signal(&vdc->read_cv);
		}

		mutex_exit(&vdc->read_lock);

		/* wake up any thread waiting for connection to come online */
		mutex_enter(&vdc->lock);
		if (vdc->state == VDC_STATE_INIT_WAITING) {
			DMSG(vdc, 0,
			    "[%d] write reset - move to resetting state...\n",
			    instance);
			vdc->state = VDC_STATE_RESETTING;
			cv_signal(&vdc->initwait_cv);
		}
		mutex_exit(&vdc->lock);

		/* now wait until state transitions to VDC_STATE_DETACH */
		thread_join(vdc->msg_proc_thr->t_did);
		ASSERT(vdc->state == VDC_STATE_DETACH);
		DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n",
		    vdc->instance);
	}

	mutex_enter(&vdc->lock);

	if (vdc->initialized & VDC_DRING)
		vdc_destroy_descriptor_ring(vdc);

	vdc_fini_ports(vdc);

	if (vdc->failfast_thread) {
		failfast_tid = vdc->failfast_thread->t_did;
		vdc->failfast_interval = 0;
		cv_signal(&vdc->failfast_cv);
	} else {
		failfast_tid = 0;
	}

	if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
		ownership_tid = vdc->ownership_thread->t_did;
		vdc->ownership = VDC_OWNERSHIP_NONE;
		cv_signal(&vdc->ownership_cv);
	} else {
		ownership_tid = 0;
	}

	mutex_exit(&vdc->lock);

	if (failfast_tid != 0)
		thread_join(failfast_tid);

	if (ownership_tid != 0)
		thread_join(ownership_tid);

	if (vdc->initialized & VDC_MINOR)
		ddi_remove_minor_node(dip, NULL);

	if (vdc->io_stats) {
		kstat_delete(vdc->io_stats);
		vdc->io_stats = NULL;
	}

	if (vdc->err_stats) {
		kstat_delete(vdc->err_stats);
		vdc->err_stats = NULL;
	}

	if (vdc->initialized & VDC_LOCKS) {
		mutex_destroy(&vdc->lock);
		mutex_destroy(&vdc->read_lock);
		mutex_destroy(&vdc->ownership_lock);
		cv_destroy(&vdc->initwait_cv);
		cv_destroy(&vdc->dring_free_cv);
		cv_destroy(&vdc->membind_cv);
		cv_destroy(&vdc->sync_pending_cv);
		cv_destroy(&vdc->sync_blocked_cv);
		cv_destroy(&vdc->read_cv);
		cv_destroy(&vdc->running_cv);
		cv_destroy(&vdc->ownership_cv);
		cv_destroy(&vdc->failfast_cv);
		cv_destroy(&vdc->failfast_io_cv);
	}

	if (vdc->minfo)
		kmem_free(vdc->minfo, sizeof (struct dk_minfo));

	if (vdc->cinfo)
		kmem_free(vdc->cinfo, sizeof (struct dk_cinfo));

	if (vdc->vtoc)
		kmem_free(vdc->vtoc, sizeof (struct extvtoc));

	if (vdc->geom)
		kmem_free(vdc->geom, sizeof (struct dk_geom));

	if (vdc->devid) {
		ddi_devid_unregister(dip);
		ddi_devid_free(vdc->devid);
	}

	if (vdc->initialized & VDC_SOFT_STATE)
		ddi_soft_state_free(vdc_state, instance);

	DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc);

	return (DDI_SUCCESS);
}
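
/*
 * Function:
 *	vdc_do_attach()
 *
 * Description:
 *	Performs the real work of attach(9E): allocates and initialises the
 *	soft state, locks and condition variables, looks up the MD node and
 *	initialises the server ports, starts the message processing thread,
 *	and then triggers the handshake (via vdc_validate_geometry()) so
 *	that the device nodes can be created.
 */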
static int
vdc_do_attach(dev_info_t *dip)
{
	int		instance;
	vdc_t		*vdc = NULL;
	int		status;
	md_t		*mdp;
	mde_cookie_t	vd_node;

	ASSERT(dip != NULL);

	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure",
		    instance);
		return (DDI_FAILURE);
	}

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * We assign the value to initialized in this case to zero out the
	 * variable and then set bits in it to indicate what has been done
	 */
	vdc->initialized = VDC_SOFT_STATE;

	vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc);
	vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc);

	vdc->dip	= dip;
	vdc->instance	= instance;
	vdc->vdisk_type	= VD_DISK_TYPE_UNK;
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->state	= VDC_STATE_INIT;
	vdc->lifecycle	= VDC_LC_ATTACHING;
	vdc->session_id = 0;
	vdc->block_size = DEV_BSIZE;
	vdc->max_xfer_sz = maxphys / DEV_BSIZE;

	/*
	 * We assume, for now, that the vDisk server will export 'read'
	 * operations to us at a minimum (this is needed because of checks
	 * in vdc for supported operations early in the handshake process).
	 * The vDisk server will return ENOTSUP if this is not the case.
	 * The value will be overwritten during the attribute exchange with
	 * the bitmask of operations exported by the server.
	 */
	vdc->operations = VD_OP_MASK_READ;

	vdc->vtoc = NULL;
	vdc->geom = NULL;
	vdc->cinfo = NULL;
	vdc->minfo = NULL;

	mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);

	vdc->threads_pending = 0;
	vdc->sync_op_pending = B_FALSE;
	vdc->sync_op_blocked = B_FALSE;
	cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);

	mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL);

	/* init blocking msg read functionality */
	mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL);
	vdc->read_state = VDC_READ_IDLE;

	vdc->initialized |= VDC_LOCKS;

	/* get device and port MD node for this disk instance */
	if (vdc_get_md_node(dip, &mdp, &vd_node) != 0) {
		cmn_err(CE_NOTE, "[%d] Could not get machine description node",
		    instance);
		return (DDI_FAILURE);
	}

	if (vdc_init_ports(vdc, mdp, vd_node) != 0) {
		cmn_err(CE_NOTE, "[%d] Error initialising ports", instance);
		return (DDI_FAILURE);
	}

	(void) md_fini_handle(mdp);

	/* Create the kstats for saving the I/O statistics used by iostat(1M) */
	vdc_create_io_kstats(vdc);
	vdc_create_err_kstats(vdc);

	/* Initialize remaining structures before starting the msg thread */
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->vtoc = kmem_zalloc(sizeof (struct extvtoc), KM_SLEEP);
	vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP);
	vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

	/* initialize the thread responsible for managing state with server */
	vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread,
	    vdc, 0, &p0, TS_RUN, minclsyspri);
	if (vdc->msg_proc_thr == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread",
		    instance);
		return (DDI_FAILURE);
	}

	vdc->initialized |= VDC_THREAD;

	atomic_inc_32(&vdc_instance_count);

	/*
	 * Check the disk label. This will send requests and do the handshake.
	 * We don't really care about the disk label now. What we really need
	 * is the handshake to be done so that we know the type of the disk
	 * (slice or full disk) and the appropriate device nodes can be
	 * created.
	 */

	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	mutex_exit(&vdc->lock);

	/*
	 * Now that we have the device info we can create the device nodes
	 */
	status = vdc_create_device_nodes(vdc);
	if (status) {
		DMSG(vdc, 0, "[%d] Failed to create device nodes",
		    instance);
		goto return_status;
	}

	/*
	 * Setup devid
	 */
	if (vdc_setup_devid(vdc)) {
		DMSG(vdc, 0, "[%d] No device id available\n", instance);
	}

	/*
	 * Fill in the fields of the error statistics kstat that were not
	 * available when creating the kstat
	 */
	vdc_set_err_kstats(vdc);

	ddi_report_dev(dip);
	vdc->lifecycle	= VDC_LC_ONLINE;
	DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance);

return_status:
	DMSG(vdc, 0, "[%d] Attach completed\n", instance);
	return (status);
}
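
/*
 * Function:
 *	vdc_attach()
 *
 * Description:
 *	attach(9E) entry point. Dispatches DDI_ATTACH to vdc_do_attach()
 *	and undoes a failed attach with vdc_detach().
 */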
static int
vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	status;

	switch (cmd) {
	case DDI_ATTACH:
		if ((status = vdc_do_attach(dip)) != 0)
			(void) vdc_detach(dip, DDI_DETACH);
		return (status);
	case DDI_RESUME:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
}
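
/*
 * Function:
 *	vdc_do_ldc_init()
 *
 * Description:
 *	Initialise the LDC channel to the given server: ldc_init() the
 *	channel, register the vdc_handle_cb() callback and open the
 *	connection, tearing the channel down again on failure.
 */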
static int
vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr)
{
	int		status = 0;
	ldc_status_t	ldc_state;
	ldc_attr_t	ldc_attr;

	ASSERT(vdc != NULL);
	ASSERT(srvr != NULL);

	ldc_attr.devclass = LDC_DEV_BLK;
	ldc_attr.instance = vdc->instance;
	ldc_attr.mode = LDC_MODE_UNRELIABLE;	/* unreliable transport */
	ldc_attr.mtu = VD_LDC_MTU;

	if ((srvr->state & VDC_LDC_INIT) == 0) {
		status = ldc_init(srvr->ldc_id, &ldc_attr,
		    &srvr->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d",
			    vdc->instance, srvr->ldc_id, status);
			return (status);
		}
		srvr->state |= VDC_LDC_INIT;
	}
	status = ldc_status(srvr->ldc_handle, &ldc_state);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]",
		    vdc->instance, status);
		goto init_exit;
	}
	srvr->ldc_state = ldc_state;

	if ((srvr->state & VDC_LDC_CB) == 0) {
		status = ldc_reg_callback(srvr->ldc_handle, vdc_handle_cb,
		    (caddr_t)srvr);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)",
			    vdc->instance, status);
			goto init_exit;
		}
		srvr->state |= VDC_LDC_CB;
	}

	/*
	 * At this stage we have initialised LDC, we will now try and open
	 * the connection.
	 */
	if (srvr->ldc_state == LDC_INIT) {
		status = ldc_open(srvr->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d",
			    vdc->instance, srvr->ldc_id, status);
			goto init_exit;
		}
		srvr->state |= VDC_LDC_OPEN;
	}

init_exit:
	if (status) {
		vdc_terminate_ldc(vdc, srvr);
	}

	return (status);
}

static int
vdc_start_ldc_connection(vdc_t *vdc)
{
	int	status = 0;

	ASSERT(vdc != NULL);

	ASSERT(MUTEX_HELD(&vdc->lock));

	status = vdc_do_ldc_up(vdc);

	DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance);

	return (status);
}

static int
vdc_stop_ldc_connection(vdc_t *vdcp)
{
	int	status;

	ASSERT(vdcp != NULL);

	ASSERT(MUTEX_HELD(&vdcp->lock));

	DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n",
	    vdcp->state);

	status = ldc_down(vdcp->curr_server->ldc_handle);
	DMSG(vdcp, 0, "ldc_down() = %d\n", status);

	vdcp->initialized &= ~VDC_HANDSHAKE;
	DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized);

	return (status);
}

static void
vdc_create_io_kstats(vdc_t *vdc)
{
	if (vdc->io_stats != NULL) {
		DMSG(vdc, 0, "[%d] I/O kstat already exists\n", vdc->instance);
		return;
	}

	vdc->io_stats = kstat_create(VDC_DRIVER_NAME, vdc->instance, NULL,
	    "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (vdc->io_stats != NULL) {
		vdc->io_stats->ks_lock = &vdc->lock;
		kstat_install(vdc->io_stats);
	} else {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: I/O statistics"
		    " will not be gathered", vdc->instance);
	}
}

static void
vdc_create_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;
	char	kstatmodule_err[KSTAT_STRLEN];
	char	kstatname[KSTAT_STRLEN];
	int	ndata = (sizeof (vd_err_stats_t) / sizeof (kstat_named_t));
	int	instance = vdc->instance;

	if (vdc->err_stats != NULL) {
		DMSG(vdc, 0, "[%d] ERR kstat already exists\n", vdc->instance);
		return;
	}

	(void) snprintf(kstatmodule_err, sizeof (kstatmodule_err),
	    "%serr", VDC_DRIVER_NAME);
	(void) snprintf(kstatname, sizeof (kstatname),
	    "%s%d,err", VDC_DRIVER_NAME, instance);

	vdc->err_stats = kstat_create(kstatmodule_err, instance, kstatname,
	    "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);

	if (vdc->err_stats == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: Error statistics"
		    " will not be gathered", instance);
		return;
	}

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	kstat_named_init(&stp->vd_softerrs, "Soft Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_transerrs, "Transport Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_protoerrs, "Protocol Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_vid, "Vendor",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_pid, "Product",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_capacity, "Size",
	    KSTAT_DATA_ULONGLONG);

	vdc->err_stats->ks_update  = nulldev;

	kstat_install(vdc->err_stats);
}

static void
vdc_set_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;

	if (vdc->err_stats == NULL)
		return;

	mutex_enter(&vdc->lock);

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	ASSERT(stp != NULL);

	stp->vd_capacity.value.ui64 = vdc->vdisk_size * vdc->block_size;
	(void) strcpy(stp->vd_vid.value.c, "SUN");
	(void) strcpy(stp->vd_pid.value.c, "VDSK");

	mutex_exit(&vdc->lock);
}

static int
vdc_create_device_nodes_efi(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "h");
	ddi_remove_minor_node(vdc->dip, "h,raw");

	if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

static int
vdc_create_device_nodes_vtoc(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "wd");
	ddi_remove_minor_node(vdc->dip, "wd,raw");

	if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

/*
 * Function:
 *	vdc_create_device_nodes
 *
 * Description:
 *	This function creates the block and character device nodes under
 *	/devices. It is called as part of the attach(9E) of the instance
 *	during the handshake with vds after vds has sent the attributes
 *	to vdc.
 *
 *	If the device is of type VD_DISK_TYPE_SLICE then the minor node
 *	of 2 is used in keeping with the Solaris convention that slice 2
 *	refers to a whole disk. Slices start at 'a'.
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	0		- Success
 *	EIO		- Failed to create node
 *	EINVAL		- Unknown type of disk exported
 */
static int
vdc_create_device_nodes(vdc_t *vdc)
{
	char		name[sizeof ("s,raw")];
	dev_info_t	*dip = NULL;
	int		instance, status;
	int		num_slices = 1;
	int		i;

	ASSERT(vdc != NULL);

	instance = vdc->instance;
	dip = vdc->dip;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		num_slices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		num_slices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		return (EINVAL);
	}

	/*
	 * Minor nodes are different for EFI disks: EFI disks do not have
	 * a minor node 'g' for the minor number corresponding to slice
	 * VD_EFI_WD_SLICE (slice 7); instead they have a minor node 'wd'
	 * representing the whole disk.
	 */
	for (i = 0; i < num_slices; i++) {

		if (i == VD_EFI_WD_SLICE) {
			if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
				status = vdc_create_device_nodes_efi(vdc);
			else
				status = vdc_create_device_nodes_vtoc(vdc);
			if (status != 0)
				return (status);
			continue;
		}

		(void) snprintf(name, sizeof (name), "%c", 'a' + i);
		if (ddi_create_minor_node(dip, name, S_IFBLK,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'",
			    instance, name);
			return (EIO);
		}

		/* if any device node is created we set this flag */
		vdc->initialized |= VDC_MINOR;

		(void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw");

		if (ddi_create_minor_node(dip, name, S_IFCHR,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'",
			    instance, name);
			return (EIO);
		}
	}

	return (0);
}

/*
 * Driver prop_op(9e) entry point function. Return the number of blocks for
 * the partition in question or forward the request to the property facilities.
 */
static int
vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
    char *name, caddr_t valuep, int *lengthp)
{
	int		instance = ddi_get_instance(dip);
	vdc_t		*vdc;
	uint64_t	nblocks;
	uint_t		blksize;

	vdc = ddi_get_soft_state(vdc_state, instance);

	if (dev == DDI_DEV_T_ANY || vdc == NULL) {
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}

	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	if (vdc->vdisk_label == VD_DISK_LABEL_UNK) {
		mutex_exit(&vdc->lock);
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}
	nblocks = vdc->slice[VDCPART(dev)].nblocks;
	blksize = vdc->block_size;
	mutex_exit(&vdc->lock);

	return (ddi_prop_op_nblocks_blksize(dev, dip, prop_op, mod_flags,
	    name, valuep, lengthp, nblocks, blksize));
}

/*
 * Function:
 *	vdc_is_opened
 *
 * Description:
 *	This function checks if any slice of a given virtual disk is
 *	currently opened.
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	B_TRUE		- at least one slice is opened.
 *	B_FALSE		- no slice is opened.
 */
static boolean_t
vdc_is_opened(vdc_t *vdc)
{
	int i, nslices;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		nslices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		nslices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		ASSERT(0);
	}

	/* check if there's any layered open */
	for (i = 0; i < nslices; i++) {
		if (vdc->open_lyr[i] > 0)
			return (B_TRUE);
	}

	/* check if there is any other kind of open */
	for (i = 0; i < OTYPCNT; i++) {
		if (vdc->open[i] != 0)
			return (B_TRUE);
	}

	return (B_FALSE);
}

static int
vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;
	int i;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	/* check if slice is already exclusively opened */
	if (vdc->open_excl & slicemask)
		return (EBUSY);

	/* if open exclusive, check if slice is already opened */
	if (flag & FEXCL) {
		if (vdc->open_lyr[slice] > 0)
			return (EBUSY);
		for (i = 0; i < OTYPCNT; i++) {
			if (vdc->open[i] & slicemask)
				return (EBUSY);
		}
		vdc->open_excl |= slicemask;
	}

	/* mark slice as opened */
	if (otyp == OTYP_LYR) {
		vdc->open_lyr[slice]++;
	} else {
		vdc->open[otyp] |= slicemask;
	}

	return (0);
}

static void
vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	if (otyp == OTYP_LYR) {
		ASSERT(vdc->open_lyr[slice] > 0);
		vdc->open_lyr[slice]--;
	} else {
		vdc->open[otyp] &= ~slicemask;
	}

	if (flag & FEXCL)
		vdc->open_excl &= ~slicemask;
}
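
/*
 * Function:
 *	vdc_open()
 *
 * Description:
 *	open(9E) entry point. Marks the slice as opened and validates the
 *	disk label: synchronously for a regular open, or via a taskq in
 *	the (FNDELAY | FNONBLOCK) case so that the open does not block.
 */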
static int
vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance, nodelay;
	int	slice, status = 0;
	vdc_t	*vdc;

	ASSERT(dev != NULL);
	instance = VDCUNIT(*dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n",
	    getminor(*dev), flag, otyp);

	slice = VDCPART(*dev);

	nodelay = flag & (FNDELAY | FNONBLOCK);

	if ((flag & FWRITE) && (!nodelay) &&
	    !(VD_OP_SUPPORTED(vdc->operations, VD_OP_BWRITE))) {
		return (EROFS);
	}

	mutex_enter(&vdc->lock);

	status = vdc_mark_opened(vdc, slice, flag, otyp);

	if (status != 0) {
		mutex_exit(&vdc->lock);
		return (status);
	}

	if (nodelay) {

		/* don't resubmit a validate request if there's already one */
		if (vdc->validate_pending > 0) {
			mutex_exit(&vdc->lock);
			return (0);
		}

		/* call vdc_validate() asynchronously to avoid blocking */
		if (taskq_dispatch(system_taskq, vdc_validate_task,
		    (void *)vdc, TQ_NOSLEEP) == NULL) {
			vdc_mark_closed(vdc, slice, flag, otyp);
			mutex_exit(&vdc->lock);
			return (ENXIO);
		}

		vdc->validate_pending++;
		mutex_exit(&vdc->lock);
		return (0);
	}

	mutex_exit(&vdc->lock);

	vdc_validate(vdc);

	mutex_enter(&vdc->lock);

	if (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
	    vdc->slice[slice].nblocks == 0) {
		vdc_mark_closed(vdc, slice, flag, otyp);
		status = EIO;
	}

	mutex_exit(&vdc->lock);

	return (status);
}

static int
vdc_close(dev_t dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance;
	int	slice;
	int	rv, rval;
	vdc_t	*vdc;

	instance = VDCUNIT(dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp);

	slice = VDCPART(dev);

	/*
	 * Attempt to flush the W$ on a close operation. If this is
	 * not a supported IOCTL command or the backing device is read-only
	 * do not fail the close operation.
	 */
	rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval);

	if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) {
		DMSG(vdc, 0, "[%d] flush failed with error %d on close\n",
		    instance, rv);
		return (EIO);
	}

	mutex_enter(&vdc->lock);
	vdc_mark_closed(vdc, slice, flag, otyp);
	mutex_exit(&vdc->lock);

	return (0);
}

static int
vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	_NOTE(ARGUNUSED(credp))

	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp));
}

static int
vdc_print(dev_t dev, char *str)
{
	cmn_err(CE_NOTE, "vdc%d:  %s", VDCUNIT(dev), str);
	return (0);
}

static int
vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	int	rv;
	size_t	nbytes = nblk * DEV_BSIZE;
	int	instance = VDCUNIT(dev);
	vdc_t	*vdc = NULL;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n",
	    instance, nbytes, blkno, (void *)addr);
	rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes,
	    VDCPART(dev), blkno, CB_STRATEGY, 0, VIO_write_dir);
	if (rv) {
		DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
		return (rv);
	}

	if (ddi_in_panic())
		(void) vdc_drain_response(vdc, NULL);

	DMSG(vdc, 0, "[%d] End\n", instance);

	return (0);
}

/* -------------------------------------------------------------------------- */

/*
 * Disk access routines
 *
 */

/*
 * vdc_strategy()
 *
 * Return Value:
 *	0:	As per strategy(9E), the strategy() function must return 0
 *		[ bioerror(9f) sets b_flags to the proper error code ]
 */
static int
vdc_strategy(struct buf *buf)
{
	int		rv = -1;
	vdc_t		*vdc = NULL;
	int		instance = VDCUNIT(buf->b_edev);
	int		op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE;
	int		slice;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		bioerror(buf, ENXIO);
		biodone(buf);
		return (0);
	}

	DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n",
	    instance, (buf->b_flags & B_READ) ? "Read" : "Write",
	    buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr);

	bp_mapin(buf);

	if ((long)buf->b_private == VD_SLICE_NONE) {
		/* I/O using an absolute disk offset */
		slice = VD_SLICE_NONE;
	} else {
		slice = VDCPART(buf->b_edev);
	}

	rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr,
	    buf->b_bcount, slice, buf->b_lblkno,
	    CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir :
	    VIO_write_dir);

	/*
	 * If the request was successfully sent, the strategy call returns and
	 * the ACK handler calls the bioxxx functions when the vDisk server is
	 * done; otherwise we handle the error here.
	 */
	if (rv) {
		DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv);
		bioerror(buf, rv);
		biodone(buf);
	} else if (ddi_in_panic()) {
		(void) vdc_drain_response(vdc, buf);
	}

	return (0);
}

/*
 * Function:
 *	vdc_min
 *
 * Description:
 *	Routine to limit the size of a data transfer. Used in
 *	conjunction with physio(9F).
 *
 * Arguments:
 *	bufp	- pointer to the indicated buf(9S) struct.
 *
 */
static void
vdc_min(struct buf *bufp)
{
	vdc_t	*vdc = NULL;
	int	instance = VDCUNIT(bufp->b_edev);

	vdc = ddi_get_soft_state(vdc_state, instance);
	VERIFY(vdc != NULL);

	if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->block_size)) {
		bufp->b_bcount = vdc->max_xfer_sz * vdc->block_size;
	}
}

static int
vdc_read(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio));
}

static int
vdc_write(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio));
}

static int
vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio));
}

static int
vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio));
}


/* -------------------------------------------------------------------------- */

/*
 * Handshake support
 */
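
/*
 * In outline, the handshake with the vDisk server proceeds through the
 * following exchanges, each of which must complete before the next:
 *
 *	1. version negotiation		(vdc_ver_negotiation)
 *	2. attribute exchange		(vdc_attr_negotiation)
 *	3. descriptor ring registration	(vdc_dring_negotiation)
 *	4. RDX exchange			(vdc_rdx_exchange)
 *
 * Each vdc_*_negotiation routine sends its INFO message, drops vdc->lock
 * while waiting for the server's response, and then processes the reply.
 */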
/*
 * Function:
 *	vdc_init_ver_negotiation()
 *
 * Description:
 *	Build and send a version negotiation (VIO_VER_INFO) message to the
 *	vDisk server, tagging it with a freshly generated session ID.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver)
{
	vio_ver_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status = -1;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance);

	/*
	 * set the Session ID to a unique value
	 * (the lower 32 bits of the clock tick)
	 */
	vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
	DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id);

	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_VER_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	pkt.dev_class = VDEV_DISK;
	pkt.ver_major = ver.major;
	pkt.ver_minor = ver.minor;

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n",
	    vdc->instance, status);
	if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
		    vdc->curr_server->ldc_handle, status, msglen);
		if (msglen != sizeof (vio_ver_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_ver_negotiation()
 *
 * Description:
 *	Send the version negotiation message, wait for the server's
 *	response and process it.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_ver_negotiation(vdc_t *vdcp)
{
	vio_msg_t vio_msg;
	int status;

	if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0]))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Ver negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg));
}
/*
 * Function:
 *	vdc_init_attr_negotiation()
 *
 * Description:
 *	Build and send an attribute negotiation (VIO_ATTR_INFO) message
 *	describing this client's transfer parameters.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_attr_negotiation(vdc_t *vdc)
{
	vd_attr_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] entered\n", vdc->instance);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.max_xfer_sz = vdc->max_xfer_sz;
	pkt.vdisk_block_size = vdc->block_size;
	pkt.xfer_mode = VIO_DRING_MODE_V1_0;
	pkt.operations = 0;	/* server will set bits of valid operations */
	pkt.vdisk_type = 0;	/* server will set to valid device type */
	pkt.vdisk_media = 0;	/* server will set to valid media type */
	pkt.vdisk_size = 0;	/* server will set to valid size */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "Attr info sent (status = %d)\n", status);

	if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
		    vdc->curr_server->ldc_handle, status, msglen);
		if (msglen != sizeof (vd_attr_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_attr_negotiation()
 *
 * Description:
 *	Send the attribute negotiation message, wait for the server's
 *	response and process it.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_attr_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_attr_negotiation(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Attr negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg));
}
/*
 * Function:
 *	vdc_init_dring_negotiate()
 *
 * Description:
 *	Initialise the descriptor ring and send a registration
 *	(VIO_DRING_REG) message for it to the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_dring_negotiate(vdc_t *vdc)
{
	vio_dring_reg_msg_t	pkt;
	size_t			msglen = sizeof (pkt);
	int			status = -1;
	int			retry;
	int			nretries = 10;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	for (retry = 0; retry < nretries; retry++) {
		status = vdc_init_descriptor_ring(vdc);
		if (status != EAGAIN)
			break;
		drv_usecwait(vdc_min_timeout_ldc);
	}

	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n",
		    vdc->instance, status);
		return (status);
	}

	DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n",
	    vdc->instance, status);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_DRING_REG;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.dring_ident = 0;
	pkt.num_descriptors = vdc->dring_len;
	pkt.descriptor_size = vdc->dring_entry_size;
	pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
	pkt.ncookies = vdc->dring_cookie_count;
	pkt.cookie[0] = vdc->dring_cookie[0];	/* for now just one cookie */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)",
		    vdc->instance, status);
	}

	return (status);
}


/*
 * Function:
 *	vdc_dring_negotiation()
 *
 * Description:
 *	Send the descriptor ring registration message, wait for the
 *	server's response and process it.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_dring_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_dring_negotiate(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Dring negotiation response,"
		    " rv(%d)", vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_dring_reg_msg(vdcp,
	    (vio_dring_reg_msg_t *)&vio_msg));
}
/*
 * Function:
 *	vdc_send_rdx()
 *
 * Description:
 *	Send an RDX message to the vDisk server to indicate that the
 *	handshake is complete and vdc is ready to transfer data.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_send_rdx(vdc_t *vdcp)
{
	vio_msg_t	msg;
	size_t		msglen = sizeof (vio_msg_t);
	int		status;

	/*
	 * Send an RDX message to vds to indicate we are ready
	 * to send data
	 */
	msg.tag.vio_msgtype = VIO_TYPE_CTRL;
	msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
	msg.tag.vio_subtype_env = VIO_RDX;
	msg.tag.vio_sid = vdcp->session_id;
	status = vdc_send(vdcp, (caddr_t)&msg, &msglen);
	if (status != 0) {
		DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)",
		    vdcp->instance, status);
	}

	return (status);
}

/*
 * Function:
 *	vdc_handle_rdx()
 *
 * Description:
 *	Process the RDX ACK received from the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	msgp	- received msg
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp)
{
	_NOTE(ARGUNUSED(vdcp))
	_NOTE(ARGUNUSED(msgp))

	ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL);
	ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK);
	ASSERT(msgp->tag.vio_subtype_env == VIO_RDX);

	DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance);

	return (0);
}

/*
 * Function:
 *	vdc_rdx_exchange()
 *
 * Description:
 *	Send the RDX message and wait for the server's ACK, completing
 *	the handshake.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_rdx_exchange(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_send_rdx(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) {
		DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg));
}


/* -------------------------------------------------------------------------- */

/*
 * LDC helper routines
 */
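
/*
 * Function:
 *	vdc_recv()
 *
 * Description:
 *	Receive a single message over LDC. The routine waits until a
 *	message is pending (or the connection is reset) and then polls
 *	ldc_read() with an exponentially increasing delay until the
 *	complete message has arrived.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	msgp	- buffer in which the received message is returned.
 *	nbytesp	- on entry, the size of the buffer; on return, the number
 *		  of bytes read.
 *
 * Return Code:
 *	0		- Success
 *	ECONNRESET	- The connection was reset.
 */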
static int
vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp)
{
	int		status;
	boolean_t	q_has_pkts = B_FALSE;
	uint64_t	delay_time;
	size_t		len;

	mutex_enter(&vdc->read_lock);

	if (vdc->read_state == VDC_READ_IDLE)
		vdc->read_state = VDC_READ_WAITING;

	while (vdc->read_state != VDC_READ_PENDING) {

		/* detect if the connection has been reset */
		if (vdc->read_state == VDC_READ_RESET) {
			status = ECONNRESET;
			goto done;
		}

		cv_wait(&vdc->read_cv, &vdc->read_lock);
	}

	/*
	 * Until we get a blocking ldc read we have to retry
	 * until the entire LDC message has arrived before
	 * ldc_read() will succeed. Note we also bail out if
	 * the channel is reset or goes away.
	 */
	delay_time = vdc_ldc_read_init_delay;
loop:
	len = *nbytesp;
	status = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)msgp, &len);
	switch (status) {
	case EAGAIN:
		delay_time *= 2;
		if (delay_time >= vdc_ldc_read_max_delay)
			delay_time = vdc_ldc_read_max_delay;
		delay(delay_time);
		goto loop;

	case 0:
		if (len == 0) {
			DMSG(vdc, 1, "[%d] ldc_read returned 0 bytes with "
			    "no error!\n", vdc->instance);
			goto loop;
		}

		*nbytesp = len;

		/*
		 * If there are pending messages, leave the
		 * read state as pending. Otherwise, set the state
		 * back to idle.
		 */
		status = ldc_chkq(vdc->curr_server->ldc_handle, &q_has_pkts);
		if (status == 0 && !q_has_pkts)
			vdc->read_state = VDC_READ_IDLE;

		break;
	default:
		DMSG(vdc, 0, "ldc_read returned %d\n", status);
		break;
	}

done:
	mutex_exit(&vdc->read_lock);

	return (status);
}



#ifdef DEBUG
void
vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg)
{
	char *ms, *ss, *ses;
	switch (msg->tag.vio_msgtype) {
#define	Q(_s)	case _s : ms = #_s; break;
	Q(VIO_TYPE_CTRL)
	Q(VIO_TYPE_DATA)
	Q(VIO_TYPE_ERR)
#undef Q
	default: ms = "unknown"; break;
	}

	switch (msg->tag.vio_subtype) {
#define	Q(_s)	case _s : ss = #_s; break;
	Q(VIO_SUBTYPE_INFO)
	Q(VIO_SUBTYPE_ACK)
	Q(VIO_SUBTYPE_NACK)
#undef Q
	default: ss = "unknown"; break;
	}

	switch (msg->tag.vio_subtype_env) {
#define	Q(_s)	case _s : ses = #_s; break;
	Q(VIO_VER_INFO)
	Q(VIO_ATTR_INFO)
	Q(VIO_DRING_REG)
	Q(VIO_DRING_UNREG)
	Q(VIO_RDX)
	Q(VIO_PKT_DATA)
	Q(VIO_DESC_DATA)
	Q(VIO_DRING_DATA)
#undef Q
	default: ses = "unknown"; break;
	}

	DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n",
	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
	    msg->tag.vio_subtype_env, ms, ss, ses);
}
#endif

/*
 * Function:
 *	vdc_send()
 *
 * Description:
 *	The function encapsulates the call to write a message using LDC.
 *	If LDC indicates that the call failed due to the queue being full,
 *	we retry the ldc_write(); otherwise we return the error returned
 *	by LDC.
 *
 * Arguments:
 *	ldc_handle	- LDC handle for the channel this instance of vdc uses
 *	pkt		- address of LDC message to be sent
 *	msglen		- the size of the message being sent. When the function
 *			  returns, this contains the number of bytes written.
 *
 * Return Code:
 *	0		- Success.
 *	EINVAL		- pkt or msglen were NULL
 *	ECONNRESET	- The connection was not up.
 *	EWOULDBLOCK	- LDC queue is full
 *	xxx		- other error codes returned by ldc_write
 */
static int
vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen)
{
	size_t	size = 0;
	int	status = 0;
	clock_t delay_ticks;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));
	ASSERT(msglen != NULL);
	ASSERT(*msglen != 0);

#ifdef DEBUG
	vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt);
#endif
	/*
	 * Wait indefinitely to send if channel
	 * is busy, but bail out if we succeed or
	 * if the channel closes or is reset.
	 */
2110 */
2111 delay_ticks = vdc_hz_min_ldc_delay;
2112 do {
2113 size = *msglen;
2114 status = ldc_write(vdc->curr_server->ldc_handle, pkt, &size);
2115 if (status == EWOULDBLOCK) {
2116 delay(delay_ticks);
2117 /* geometric backoff */
2118 delay_ticks *= 2;
2119 if (delay_ticks > vdc_hz_max_ldc_delay)
2120 delay_ticks = vdc_hz_max_ldc_delay;
2121 }
2122 } while (status == EWOULDBLOCK);
2123
2124 /* if LDC had serious issues --- reset vdc state */
2125 if (status == EIO || status == ECONNRESET) {
2127 mutex_enter(&vdc->read_lock);
2128 if ((vdc->read_state == VDC_READ_WAITING) ||
2129 (vdc->read_state == VDC_READ_RESET))
2130 cv_signal(&vdc->read_cv);
2131 vdc->read_state = VDC_READ_RESET;
2132 mutex_exit(&vdc->read_lock);
2133
2134 /* wake up any waiters in the reset thread */
2135 if (vdc->state == VDC_STATE_INIT_WAITING) {
2136 DMSG(vdc, 0, "[%d] write reset - "
2137 "vdc is resetting ..\n", vdc->instance);
2138 vdc->state = VDC_STATE_RESETTING;
2139 cv_signal(&vdc->initwait_cv);
2140 }
2141
2142 return (ECONNRESET);
2143 }
2144
2145 /* return the last size written */
2146 *msglen = size;
2147
2148 return (status);
2149 }
2150
2151 /*
2152 * Function:
2153 * vdc_get_md_node
2154 *
2155 * Description:
2156 * Get the MD handle and the device node for the given disk
2157 * instance. The caller is responsible for cleaning up the
2158 * reference to the returned MD (mdpp) by calling md_fini_handle().
2159 *
2160 * Arguments:
2161 * dip - dev info pointer for this instance of the device driver.
2162 * mdpp - the returned MD.
2163 * vd_nodep - the returned device node.
2164 *
2165 * Return Code:
2166 * 0 - Success.
2167 * ENOENT - Expected node or property did not exist.
2168 * ENXIO - Unexpected error communicating with MD framework
2169 */
2170 static int
2171 vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep)
2172 {
2173 int status = ENOENT;
2174 char *node_name = NULL;
2175 md_t *mdp = NULL;
2176 int num_nodes;
2177 int num_vdevs;
2178 mde_cookie_t rootnode;
2179 mde_cookie_t *listp = NULL;
2180 boolean_t found_inst = B_FALSE;
2181 int listsz;
2182 int idx;
2183 uint64_t md_inst;
2184 int obp_inst;
2185 int instance = ddi_get_instance(dip);
2186
2187 /*
2188 * Get the OBP instance number for comparison with the MD instance
2189 *
2190 * The "cfg-handle" property of a vdc node in an MD contains the MD's
2191 * notion of "instance", or unique identifier, for that node; OBP
2192 * stores the value of the "cfg-handle" MD property as the value of
2193 * the "reg" property on the node in the device tree it builds from
2194 * the MD and passes to Solaris. Thus, we look up the devinfo node's
2195 * "reg" property value to uniquely identify this device instance.
2196 * If the "reg" property cannot be found, the device tree state is
2197 * presumably so broken that there is no point in continuing.
2198 */
2199 if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) {
2200 cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG);
2201 return (ENOENT);
2202 }
2203 obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2204 OBP_REG, -1);
2205 DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst);
2206
2207 /*
2208 * We now walk the MD nodes to find the node for this vdisk.
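 *
 * (Caller sketch, illustrative: the handle found here is returned via
 * mdpp and must be released by the caller, e.g.
 *
 *	md_t *mdp;
 *	mde_cookie_t vd_node;
 *
 *	if (vdc_get_md_node(dip, &mdp, &vd_node) == 0) {
 *		... use mdp and vd_node ...
 *		(void) md_fini_handle(mdp);
 *	}
 * )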
2209 */ 2210 if ((mdp = md_get_handle()) == NULL) { 2211 cmn_err(CE_WARN, "unable to init machine description"); 2212 return (ENXIO); 2213 } 2214 2215 num_nodes = md_node_count(mdp); 2216 ASSERT(num_nodes > 0); 2217 2218 listsz = num_nodes * sizeof (mde_cookie_t); 2219 2220 /* allocate memory for nodes */ 2221 listp = kmem_zalloc(listsz, KM_SLEEP); 2222 2223 rootnode = md_root_node(mdp); 2224 ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); 2225 2226 /* 2227 * Search for all the virtual devices, we will then check to see which 2228 * ones are disk nodes. 2229 */ 2230 num_vdevs = md_scan_dag(mdp, rootnode, 2231 md_find_name(mdp, VDC_MD_VDEV_NAME), 2232 md_find_name(mdp, "fwd"), listp); 2233 2234 if (num_vdevs <= 0) { 2235 cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME); 2236 status = ENOENT; 2237 goto done; 2238 } 2239 2240 DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs); 2241 for (idx = 0; idx < num_vdevs; idx++) { 2242 status = md_get_prop_str(mdp, listp[idx], "name", &node_name); 2243 if ((status != 0) || (node_name == NULL)) { 2244 cmn_err(CE_NOTE, "Unable to get name of node type '%s'" 2245 ": err %d", VDC_MD_VDEV_NAME, status); 2246 continue; 2247 } 2248 2249 DMSGX(1, "[%d] Found node '%s'\n", instance, node_name); 2250 if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) { 2251 status = md_get_prop_val(mdp, listp[idx], 2252 VDC_MD_CFG_HDL, &md_inst); 2253 DMSGX(1, "[%d] vdc inst in MD=%lx\n", 2254 instance, md_inst); 2255 if ((status == 0) && (md_inst == obp_inst)) { 2256 found_inst = B_TRUE; 2257 break; 2258 } 2259 } 2260 } 2261 2262 if (!found_inst) { 2263 DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME); 2264 status = ENOENT; 2265 goto done; 2266 } 2267 DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst); 2268 2269 *vd_nodep = listp[idx]; 2270 *mdpp = mdp; 2271 done: 2272 kmem_free(listp, listsz); 2273 return (status); 2274 } 2275 2276 /* 2277 * Function: 2278 * vdc_init_ports 2279 * 2280 * Description: 2281 * Initialize all the ports for this vdisk instance. 2282 * 2283 * Arguments: 2284 * vdc - soft state pointer for this instance of the device driver. 2285 * mdp - md pointer 2286 * vd_nodep - device md node. 2287 * 2288 * Return Code: 2289 * 0 - Success. 2290 * ENOENT - Expected node or property did not exist. 2291 */ 2292 static int 2293 vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep) 2294 { 2295 int status = 0; 2296 int idx; 2297 int num_nodes; 2298 int num_vports; 2299 int num_chans; 2300 int listsz; 2301 mde_cookie_t vd_port; 2302 mde_cookie_t *chanp = NULL; 2303 mde_cookie_t *portp = NULL; 2304 vdc_server_t *srvr; 2305 vdc_server_t *prev_srvr = NULL; 2306 2307 /* 2308 * We now walk the MD nodes to find the port nodes for this vdisk. 
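 *
 * (Result sketch: each port found below is wrapped in a vdc_server_t
 * and linked onto vdc->server_list, which consumers later walk as in
 *
 *	vdc_server_t *srvr;
 *
 *	for (srvr = vdc->server_list; srvr != NULL; srvr = srvr->next)
 *		... operate on srvr ...
 * )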
2309 */ 2310 num_nodes = md_node_count(mdp); 2311 ASSERT(num_nodes > 0); 2312 2313 listsz = num_nodes * sizeof (mde_cookie_t); 2314 2315 /* allocate memory for nodes */ 2316 portp = kmem_zalloc(listsz, KM_SLEEP); 2317 chanp = kmem_zalloc(listsz, KM_SLEEP); 2318 2319 num_vports = md_scan_dag(mdp, vd_nodep, 2320 md_find_name(mdp, VDC_MD_PORT_NAME), 2321 md_find_name(mdp, "fwd"), portp); 2322 if (num_vports == 0) { 2323 DMSGX(0, "Found no '%s' node for '%s' port\n", 2324 VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2325 status = ENOENT; 2326 goto done; 2327 } 2328 2329 DMSGX(1, "Found %d '%s' node(s) for '%s' port\n", 2330 num_vports, VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2331 2332 vdc->num_servers = 0; 2333 for (idx = 0; idx < num_vports; idx++) { 2334 2335 /* initialize this port */ 2336 vd_port = portp[idx]; 2337 srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP); 2338 srvr->vdcp = vdc; 2339 2340 /* get port id */ 2341 if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) { 2342 cmn_err(CE_NOTE, "vDisk port '%s' property not found", 2343 VDC_MD_ID); 2344 kmem_free(srvr, sizeof (vdc_server_t)); 2345 continue; 2346 } 2347 2348 /* set the connection timeout */ 2349 if (md_get_prop_val(mdp, vd_port, VDC_MD_TIMEOUT, 2350 &srvr->ctimeout) != 0) { 2351 srvr->ctimeout = 0; 2352 } 2353 2354 /* get the ldc id */ 2355 num_chans = md_scan_dag(mdp, vd_port, 2356 md_find_name(mdp, VDC_MD_CHAN_NAME), 2357 md_find_name(mdp, "fwd"), chanp); 2358 2359 /* expecting at least one channel */ 2360 if (num_chans <= 0) { 2361 cmn_err(CE_NOTE, "No '%s' node for '%s' port", 2362 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME); 2363 kmem_free(srvr, sizeof (vdc_server_t)); 2364 continue; 2365 } else if (num_chans != 1) { 2366 DMSGX(0, "Expected 1 '%s' node for '%s' port, " 2367 "found %d\n", VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, 2368 num_chans); 2369 } 2370 2371 /* 2372 * We use the first channel found (index 0), irrespective of how 2373 * many are there in total. 2374 */ 2375 if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, 2376 &srvr->ldc_id) != 0) { 2377 cmn_err(CE_NOTE, "Channel '%s' property not found", 2378 VDC_MD_ID); 2379 kmem_free(srvr, sizeof (vdc_server_t)); 2380 continue; 2381 } 2382 2383 /* 2384 * now initialise LDC channel which will be used to 2385 * communicate with this server 2386 */ 2387 if (vdc_do_ldc_init(vdc, srvr) != 0) { 2388 kmem_free(srvr, sizeof (vdc_server_t)); 2389 continue; 2390 } 2391 2392 /* add server to list */ 2393 if (prev_srvr) 2394 prev_srvr->next = srvr; 2395 else 2396 vdc->server_list = srvr; 2397 2398 prev_srvr = srvr; 2399 2400 /* inc numbers of servers */ 2401 vdc->num_servers++; 2402 } 2403 2404 /* 2405 * Adjust the max number of handshake retries to match 2406 * the number of vdisk servers. 2407 */ 2408 if (vdc_hshake_retries < vdc->num_servers) 2409 vdc_hshake_retries = vdc->num_servers; 2410 2411 /* pick first server as current server */ 2412 if (vdc->server_list != NULL) { 2413 vdc->curr_server = vdc->server_list; 2414 status = 0; 2415 } else { 2416 status = ENOENT; 2417 } 2418 2419 done: 2420 kmem_free(chanp, listsz); 2421 kmem_free(portp, listsz); 2422 return (status); 2423 } 2424 2425 2426 /* 2427 * Function: 2428 * vdc_do_ldc_up 2429 * 2430 * Description: 2431 * Bring the channel for the current server up. 2432 * 2433 * Arguments: 2434 * vdc - soft state pointer for this instance of the device driver. 2435 * 2436 * Return Code: 2437 * 0 - Success. 
2438 * EINVAL - Driver is detaching / LDC error
2439 * ECONNREFUSED - Other end is not listening
2440 */
2441 static int
2442 vdc_do_ldc_up(vdc_t *vdc)
2443 {
2444 int status;
2445 ldc_status_t ldc_state;
2446
2447 ASSERT(MUTEX_HELD(&vdc->lock));
2448
2449 DMSG(vdc, 0, "[%d] Bringing up channel %lx\n",
2450 vdc->instance, vdc->curr_server->ldc_id);
2451
2452 if (vdc->lifecycle == VDC_LC_DETACHING)
2453 return (EINVAL);
2454
2455 if ((status = ldc_up(vdc->curr_server->ldc_handle)) != 0) {
2456 switch (status) {
2457 case ECONNREFUSED: /* listener not ready at other end */
2458 DMSG(vdc, 0, "[%d] ldc_up(%lx,...) return %d\n",
2459 vdc->instance, vdc->curr_server->ldc_id, status);
2460 status = 0;
2461 break;
2462 default:
2463 DMSG(vdc, 0, "[%d] Failed to bring up LDC: "
2464 "channel=%ld, err=%d", vdc->instance,
2465 vdc->curr_server->ldc_id, status);
2466 break;
2467 }
2468 }
2469
2470 if (ldc_status(vdc->curr_server->ldc_handle, &ldc_state) == 0) {
2471 vdc->curr_server->ldc_state = ldc_state;
2472 if (ldc_state == LDC_UP) {
2473 DMSG(vdc, 0, "[%d] LDC channel already up\n",
2474 vdc->instance);
2475 vdc->seq_num = 1;
2476 vdc->seq_num_reply = 0;
2477 }
2478 }
2479
2480 return (status);
2481 }
2482
2483 /*
2484 * Function:
2485 * vdc_terminate_ldc()
2486 *
2487 * Description:
2488 * Tear down the LDC connection to the given server: close the
 * channel, unregister the callback and free the LDC handle,
 * depending on how far initialization got.
 *
2489 * Arguments:
2490 * vdc - soft state pointer for this instance of the device driver.
2491 * srvr - vdc per-server info structure
2492 *
2493 * Return Code:
2494 * None
2495 */
2496 static void
2497 vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr)
2498 {
2499 int instance = ddi_get_instance(vdc->dip);
2500
2501 if (srvr->state & VDC_LDC_OPEN) {
2502 DMSG(vdc, 0, "[%d] ldc_close()\n", instance);
2503 (void) ldc_close(srvr->ldc_handle);
2504 }
2505 if (srvr->state & VDC_LDC_CB) {
2506 DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance);
2507 (void) ldc_unreg_callback(srvr->ldc_handle);
2508 }
2509 if (srvr->state & VDC_LDC_INIT) {
2510 DMSG(vdc, 0, "[%d] ldc_fini()\n", instance);
2511 (void) ldc_fini(srvr->ldc_handle);
2512 srvr->ldc_handle = NULL;
2513 }
2514
2515 srvr->state &= ~(VDC_LDC_INIT | VDC_LDC_CB | VDC_LDC_OPEN);
2516 }
2517
2518 /*
2519 * Function:
2520 * vdc_fini_ports()
2521 *
2522 * Description:
2523 * Finalize all ports by closing the channel associated with each
2524 * port and also freeing the server structure.
2525 *
2526 * Arguments:
2527 * vdc - soft state pointer for this instance of the device driver.
2528 *
2529 * Return Code:
2530 * None
2531 */
2532 static void
2533 vdc_fini_ports(vdc_t *vdc)
2534 {
2535 int instance = ddi_get_instance(vdc->dip);
2536 vdc_server_t *srvr, *prev_srvr;
2537
2538 ASSERT(vdc != NULL);
2539 ASSERT(mutex_owned(&vdc->lock));
2540
2541 DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized);
2542
2543 srvr = vdc->server_list;
2544
2545 while (srvr) {
2546
2547 vdc_terminate_ldc(vdc, srvr);
2548
2549 /* next server */
2550 prev_srvr = srvr;
2551 srvr = srvr->next;
2552
2553 /* free server */
2554 kmem_free(prev_srvr, sizeof (vdc_server_t));
2555 }
2556
2557 vdc->server_list = NULL;
2558 }
2559
2560 /* -------------------------------------------------------------------------- */
2561
2562 /*
2563 * Descriptor Ring helper routines
2564 */
2565
2566 /*
2567 * Function:
2568 * vdc_init_descriptor_ring()
2569 *
2570 * Description:
2571 * Create the descriptor ring shared with the vDisk server, bind
 * it to the channel of the current server and allocate the local
 * copy of the ring.
 *
2572 * Arguments:
2573 * vdc - soft state pointer for this instance of the device driver.
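 *
 * Sizing note (worked example with illustrative values): if
 * max_xfer_sz * block_size came to 1 MB with an 8 KB PAGESIZE,
 * dring_max_cookies would be 1048576 / 8192 = 128 and each ring
 * entry would be sized as
 * sizeof (vd_dring_entry_t) + 127 * sizeof (ldc_mem_cookie_t).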
2574 * 2575 * Return Code: 2576 * 0 - Success 2577 */ 2578 static int 2579 vdc_init_descriptor_ring(vdc_t *vdc) 2580 { 2581 vd_dring_entry_t *dep = NULL; /* DRing Entry pointer */ 2582 int status = 0; 2583 int i; 2584 2585 DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized); 2586 2587 ASSERT(vdc != NULL); 2588 ASSERT(mutex_owned(&vdc->lock)); 2589 2590 /* ensure we have enough room to store max sized block */ 2591 ASSERT(maxphys <= VD_MAX_BLOCK_SIZE); 2592 2593 if ((vdc->initialized & VDC_DRING_INIT) == 0) { 2594 DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance); 2595 /* 2596 * Calculate the maximum block size we can transmit using one 2597 * Descriptor Ring entry from the attributes returned by the 2598 * vDisk server. This is subject to a minimum of 'maxphys' 2599 * as we do not have the capability to split requests over 2600 * multiple DRing entries. 2601 */ 2602 if ((vdc->max_xfer_sz * vdc->block_size) < maxphys) { 2603 DMSG(vdc, 0, "[%d] using minimum DRing size\n", 2604 vdc->instance); 2605 vdc->dring_max_cookies = maxphys / PAGESIZE; 2606 } else { 2607 vdc->dring_max_cookies = 2608 (vdc->max_xfer_sz * vdc->block_size) / PAGESIZE; 2609 } 2610 vdc->dring_entry_size = (sizeof (vd_dring_entry_t) + 2611 (sizeof (ldc_mem_cookie_t) * 2612 (vdc->dring_max_cookies - 1))); 2613 vdc->dring_len = VD_DRING_LEN; 2614 2615 status = ldc_mem_dring_create(vdc->dring_len, 2616 vdc->dring_entry_size, &vdc->dring_hdl); 2617 if ((vdc->dring_hdl == NULL) || (status != 0)) { 2618 DMSG(vdc, 0, "[%d] Descriptor ring creation failed", 2619 vdc->instance); 2620 return (status); 2621 } 2622 vdc->initialized |= VDC_DRING_INIT; 2623 } 2624 2625 if ((vdc->initialized & VDC_DRING_BOUND) == 0) { 2626 DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance); 2627 vdc->dring_cookie = 2628 kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); 2629 2630 status = ldc_mem_dring_bind(vdc->curr_server->ldc_handle, 2631 vdc->dring_hdl, 2632 LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW, 2633 &vdc->dring_cookie[0], 2634 &vdc->dring_cookie_count); 2635 if (status != 0) { 2636 DMSG(vdc, 0, "[%d] Failed to bind descriptor ring " 2637 "(%lx) to channel (%lx) status=%d\n", 2638 vdc->instance, vdc->dring_hdl, 2639 vdc->curr_server->ldc_handle, status); 2640 return (status); 2641 } 2642 ASSERT(vdc->dring_cookie_count == 1); 2643 vdc->initialized |= VDC_DRING_BOUND; 2644 } 2645 2646 status = ldc_mem_dring_info(vdc->dring_hdl, &vdc->dring_mem_info); 2647 if (status != 0) { 2648 DMSG(vdc, 0, 2649 "[%d] Failed to get info for descriptor ring (%lx)\n", 2650 vdc->instance, vdc->dring_hdl); 2651 return (status); 2652 } 2653 2654 if ((vdc->initialized & VDC_DRING_LOCAL) == 0) { 2655 DMSG(vdc, 0, "[%d] local dring\n", vdc->instance); 2656 2657 /* Allocate the local copy of this dring */ 2658 vdc->local_dring = 2659 kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t), 2660 KM_SLEEP); 2661 vdc->initialized |= VDC_DRING_LOCAL; 2662 } 2663 2664 /* 2665 * Mark all DRing entries as free and initialize the private 2666 * descriptor's memory handles. If any entry is initialized, 2667 * we need to free it later so we set the bit in 'initialized' 2668 * at the start. 
2669 */
2670 vdc->initialized |= VDC_DRING_ENTRY;
2671 for (i = 0; i < vdc->dring_len; i++) {
2672 dep = VDC_GET_DRING_ENTRY_PTR(vdc, i);
2673 dep->hdr.dstate = VIO_DESC_FREE;
2674
2675 status = ldc_mem_alloc_handle(vdc->curr_server->ldc_handle,
2676 &vdc->local_dring[i].desc_mhdl);
2677 if (status != 0) {
2678 DMSG(vdc, 0, "![%d] Failed to alloc mem handle for"
2679 " descriptor %d", vdc->instance, i);
2680 return (status);
2681 }
2682 vdc->local_dring[i].is_free = B_TRUE;
2683 vdc->local_dring[i].dep = dep;
2684 }
2685
2686 /* Initialize the starting index */
2687 vdc->dring_curr_idx = 0;
2688
2689 return (status);
2690 }
2691
2692 /*
2693 * Function:
2694 * vdc_destroy_descriptor_ring()
2695 *
2696 * Description:
2697 * Undo vdc_init_descriptor_ring(): free the memory handle of
 * each local ring entry, free the local copy of the ring, then
 * unbind and destroy the ring shared with the vDisk server.
 *
2698 * Arguments:
2699 * vdc - soft state pointer for this instance of the device driver.
2700 *
2701 * Return Code:
2702 * None
2703 */
2704 static void
2705 vdc_destroy_descriptor_ring(vdc_t *vdc)
2706 {
2707 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */
2708 ldc_mem_handle_t mhdl = NULL;
2709 ldc_mem_info_t minfo;
2710 int status = -1;
2711 int i; /* loop */
2712
2713 ASSERT(vdc != NULL);
2714 ASSERT(mutex_owned(&vdc->lock));
2715
2716 DMSG(vdc, 0, "[%d] Entered\n", vdc->instance);
2717
2718 if (vdc->initialized & VDC_DRING_ENTRY) {
2719 DMSG(vdc, 0,
2720 "[%d] Removing Local DRing entries\n", vdc->instance);
2721 for (i = 0; i < vdc->dring_len; i++) {
2722 ldep = &vdc->local_dring[i];
2723 mhdl = ldep->desc_mhdl;
2724
2725 if (mhdl == NULL)
2726 continue;
2727
2728 if ((status = ldc_mem_info(mhdl, &minfo)) != 0) {
2729 DMSG(vdc, 0,
2730 "ldc_mem_info returned an error: %d\n",
2731 status);
2732
2733 /*
2734 * This must mean that the mem handle
2735 * is not valid. Clear it out so that
2736 * no one tries to use it.
2737 */
2738 ldep->desc_mhdl = NULL;
2739 continue;
2740 }
2741
2742 if (minfo.status == LDC_BOUND) {
2743 (void) ldc_mem_unbind_handle(mhdl);
2744 }
2745
2746 (void) ldc_mem_free_handle(mhdl);
2747
2748 ldep->desc_mhdl = NULL;
2749 }
2750 vdc->initialized &= ~VDC_DRING_ENTRY;
2751 }
2752
2753 if (vdc->initialized & VDC_DRING_LOCAL) {
2754 DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance);
2755 kmem_free(vdc->local_dring,
2756 vdc->dring_len * sizeof (vdc_local_desc_t));
2757 vdc->initialized &= ~VDC_DRING_LOCAL;
2758 }
2759
2760 if (vdc->initialized & VDC_DRING_BOUND) {
2761 DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance);
2762 status = ldc_mem_dring_unbind(vdc->dring_hdl);
2763 if (status == 0) {
2764 vdc->initialized &= ~VDC_DRING_BOUND;
2765 } else {
2766 DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx",
2767 vdc->instance, status, vdc->dring_hdl);
2768 }
2769 kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t));
2770 }
2771
2772 if (vdc->initialized & VDC_DRING_INIT) {
2773 DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance);
2774 status = ldc_mem_dring_destroy(vdc->dring_hdl);
2775 if (status == 0) {
2776 vdc->dring_hdl = NULL;
2777 bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t));
2778 vdc->initialized &= ~VDC_DRING_INIT;
2779 } else {
2780 DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)",
2781 vdc->instance, status, vdc->dring_hdl);
2782 }
2783 }
2784 }
2785
2786 /*
2787 * Function:
2788 * vdc_map_to_shared_dring()
2789 *
2790 * Description:
2791 * Copy contents of the local descriptor to the shared
2792 * memory descriptor.
2793 *
2794 * Arguments:
2795 * vdcp - soft state pointer for this instance of the device driver.
2796 * idx - descriptor ring index
2797 *
2798 * Return Code:
2799 * 0 - Success
 * EAGAIN - The buffer could not be bound (from vdc_populate_mem_hdl())
2800 */
2801 static int
2802 vdc_map_to_shared_dring(vdc_t *vdcp, int idx)
2803 {
2804 vdc_local_desc_t *ldep;
2805 vd_dring_entry_t *dep;
2806 int rv;
2807
2808 ldep = &(vdcp->local_dring[idx]);
2809
2810 /* for now leave in the old pop_mem_hdl stuff */
2811 if (ldep->nbytes > 0) {
2812 rv = vdc_populate_mem_hdl(vdcp, ldep);
2813 if (rv) {
2814 DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n",
2815 vdcp->instance);
2816 return (rv);
2817 }
2818 }
2819
2820 /*
2821 * fill in the data details into the DRing
2822 */
2823 dep = ldep->dep;
2824 ASSERT(dep != NULL);
2825
2826 dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp);
2827 dep->payload.operation = ldep->operation;
2828 dep->payload.addr = ldep->offset;
2829 dep->payload.nbytes = ldep->nbytes;
2830 dep->payload.status = (uint32_t)-1; /* vds will set valid value */
2831 dep->payload.slice = ldep->slice;
2832 dep->hdr.dstate = VIO_DESC_READY;
2833 dep->hdr.ack = 1; /* request an ACK for every message */
2834
2835 return (0);
2836 }
2837
2838 /*
2839 * Function:
2840 * vdc_send_request
2841 *
2842 * Description:
2843 * This routine writes the data to be transmitted to vds into the
2844 * descriptor, notifies vds that the ring has been updated and
2845 * then waits for the request to be processed.
2846 *
2847 * Arguments:
2848 * vdcp - the soft state pointer
2849 * operation - operation we want vds to perform (VD_OP_XXX)
2850 * addr - address of data buf to be read/written.
2851 * nbytes - number of bytes to read/write
2852 * slice - the disk slice this request is for
2853 * offset - relative disk offset
2854 * cb_type - type of call - STRATEGY or SYNC
2855 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type)
2856 * . mode for ioctl(9e)
2857 * . LP64 diskaddr_t (block I/O)
2858 * dir - direction of operation (READ/WRITE/BOTH)
2859 *
2860 * Return Codes:
2861 * 0
2862 * ENXIO
 * EIO - connection timeout reached or panic in progress
2863 */
2864 static int
2865 vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr,
2866 size_t nbytes, int slice, diskaddr_t offset, int cb_type,
2867 void *cb_arg, vio_desc_direction_t dir)
2868 {
2869 int rv = 0;
2870
2871 ASSERT(vdcp != NULL);
2872 ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR);
2873
2874 mutex_enter(&vdcp->lock);
2875
2876 /*
2877 * If this is a block read/write operation we update the I/O statistics
2878 * to indicate that the request is being put on the waitq to be
2879 * serviced.
2880 *
2881 * We do it here (a common routine for both synchronous and strategy
2882 * calls) for performance reasons - we are already holding vdc->lock
2883 * so there is no extra locking overhead. We would have to explicitly
2884 * grab the 'lock' mutex to update the stats if we were to do this
2885 * higher up the stack in vdc_strategy() et. al.
2886 */
2887 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) {
2888 DTRACE_IO1(start, buf_t *, cb_arg);
2889 VD_KSTAT_WAITQ_ENTER(vdcp);
2890 }
2891
2892 do {
2893 while (vdcp->state != VDC_STATE_RUNNING) {
2894
2895 /* return error if detaching */
2896 if (vdcp->state == VDC_STATE_DETACH) {
2897 rv = ENXIO;
2898 goto done;
2899 }
2900
2901 /* fail request if connection timeout is reached */
2902 if (vdcp->ctimeout_reached) {
2903 rv = EIO;
2904 goto done;
2905 }
2906
2907 /*
2908 * If we are panicking and the disk is not ready then
2909 * we can't send any request because we can't complete
2910 * the handshake now.
2911 */ 2912 if (ddi_in_panic()) { 2913 rv = EIO; 2914 goto done; 2915 } 2916 2917 cv_wait(&vdcp->running_cv, &vdcp->lock); 2918 } 2919 2920 } while (vdc_populate_descriptor(vdcp, operation, addr, 2921 nbytes, slice, offset, cb_type, cb_arg, dir)); 2922 2923 done: 2924 /* 2925 * If this is a block read/write we update the I/O statistics kstat 2926 * to indicate that this request has been placed on the queue for 2927 * processing (i.e sent to the vDisk server) - iostat(1M) will 2928 * report the time waiting for the vDisk server under the %b column 2929 * In the case of an error we simply take it off the wait queue. 2930 */ 2931 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2932 if (rv == 0) { 2933 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 2934 DTRACE_PROBE1(send, buf_t *, cb_arg); 2935 } else { 2936 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 2937 VD_KSTAT_WAITQ_EXIT(vdcp); 2938 DTRACE_IO1(done, buf_t *, cb_arg); 2939 } 2940 } 2941 2942 mutex_exit(&vdcp->lock); 2943 2944 return (rv); 2945 } 2946 2947 2948 /* 2949 * Function: 2950 * vdc_populate_descriptor 2951 * 2952 * Description: 2953 * This routine writes the data to be transmitted to vds into the 2954 * descriptor, notifies vds that the ring has been updated and 2955 * then waits for the request to be processed. 2956 * 2957 * Arguments: 2958 * vdcp - the soft state pointer 2959 * operation - operation we want vds to perform (VD_OP_XXX) 2960 * addr - address of data buf to be read/written. 2961 * nbytes - number of bytes to read/write 2962 * slice - the disk slice this request is for 2963 * offset - relative disk offset 2964 * cb_type - type of call - STRATEGY or SYNC 2965 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2966 * . mode for ioctl(9e) 2967 * . LP64 diskaddr_t (block I/O) 2968 * dir - direction of operation (READ/WRITE/BOTH) 2969 * 2970 * Return Codes: 2971 * 0 2972 * EAGAIN 2973 * ECONNRESET 2974 * ENXIO 2975 */ 2976 static int 2977 vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, 2978 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2979 void *cb_arg, vio_desc_direction_t dir) 2980 { 2981 vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */ 2982 int idx; /* Index of DRing entry used */ 2983 int next_idx; 2984 vio_dring_msg_t dmsg; 2985 size_t msglen; 2986 int rv; 2987 2988 ASSERT(MUTEX_HELD(&vdcp->lock)); 2989 vdcp->threads_pending++; 2990 loop: 2991 DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx); 2992 2993 /* Get next available D-Ring entry */ 2994 idx = vdcp->dring_curr_idx; 2995 local_dep = &(vdcp->local_dring[idx]); 2996 2997 if (!local_dep->is_free) { 2998 DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n", 2999 vdcp->instance); 3000 cv_wait(&vdcp->dring_free_cv, &vdcp->lock); 3001 if (vdcp->state == VDC_STATE_RUNNING || 3002 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3003 goto loop; 3004 } 3005 vdcp->threads_pending--; 3006 return (ECONNRESET); 3007 } 3008 3009 next_idx = idx + 1; 3010 if (next_idx >= vdcp->dring_len) 3011 next_idx = 0; 3012 vdcp->dring_curr_idx = next_idx; 3013 3014 ASSERT(local_dep->is_free); 3015 3016 local_dep->operation = operation; 3017 local_dep->addr = addr; 3018 local_dep->nbytes = nbytes; 3019 local_dep->slice = slice; 3020 local_dep->offset = offset; 3021 local_dep->cb_type = cb_type; 3022 local_dep->cb_arg = cb_arg; 3023 local_dep->dir = dir; 3024 3025 local_dep->is_free = B_FALSE; 3026 3027 rv = vdc_map_to_shared_dring(vdcp, idx); 3028 if (rv) { 3029 DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n", 
3030 vdcp->instance); 3031 /* free the descriptor */ 3032 local_dep->is_free = B_TRUE; 3033 vdcp->dring_curr_idx = idx; 3034 cv_wait(&vdcp->membind_cv, &vdcp->lock); 3035 if (vdcp->state == VDC_STATE_RUNNING || 3036 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3037 goto loop; 3038 } 3039 vdcp->threads_pending--; 3040 return (ECONNRESET); 3041 } 3042 3043 /* 3044 * Send a msg with the DRing details to vds 3045 */ 3046 VIO_INIT_DRING_DATA_TAG(dmsg); 3047 VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp); 3048 dmsg.dring_ident = vdcp->dring_ident; 3049 dmsg.start_idx = idx; 3050 dmsg.end_idx = idx; 3051 vdcp->seq_num++; 3052 3053 DTRACE_PROBE2(populate, int, vdcp->instance, 3054 vdc_local_desc_t *, local_dep); 3055 DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n", 3056 vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num); 3057 3058 /* 3059 * note we're still holding the lock here to 3060 * make sure the message goes out in order !!!... 3061 */ 3062 msglen = sizeof (dmsg); 3063 rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen); 3064 switch (rv) { 3065 case ECONNRESET: 3066 /* 3067 * vdc_send initiates the reset on failure. 3068 * Since the transaction has already been put 3069 * on the local dring, it will automatically get 3070 * retried when the channel is reset. Given that, 3071 * it is ok to just return success even though the 3072 * send failed. 3073 */ 3074 rv = 0; 3075 break; 3076 3077 case 0: /* EOK */ 3078 DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv); 3079 break; 3080 3081 default: 3082 goto cleanup_and_exit; 3083 } 3084 3085 vdcp->threads_pending--; 3086 return (rv); 3087 3088 cleanup_and_exit: 3089 DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv); 3090 return (ENXIO); 3091 } 3092 3093 /* 3094 * Function: 3095 * vdc_do_sync_op 3096 * 3097 * Description: 3098 * Wrapper around vdc_populate_descriptor that blocks until the 3099 * response to the message is available. 3100 * 3101 * Arguments: 3102 * vdcp - the soft state pointer 3103 * operation - operation we want vds to perform (VD_OP_XXX) 3104 * addr - address of data buf to be read/written. 3105 * nbytes - number of bytes to read/write 3106 * slice - the disk slice this request is for 3107 * offset - relative disk offset 3108 * cb_type - type of call - STRATEGY or SYNC 3109 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 3110 * . mode for ioctl(9e) 3111 * . LP64 diskaddr_t (block I/O) 3112 * dir - direction of operation (READ/WRITE/BOTH) 3113 * rconflict - check for reservation conflict in case of failure 3114 * 3115 * rconflict should be set to B_TRUE by most callers. Callers invoking the 3116 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the 3117 * result of a successful operation with vd_scsi_status(). 3118 * 3119 * Return Codes: 3120 * 0 3121 * EAGAIN 3122 * EFAULT 3123 * ENXIO 3124 * EIO 3125 */ 3126 static int 3127 vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, 3128 int slice, diskaddr_t offset, int cb_type, void *cb_arg, 3129 vio_desc_direction_t dir, boolean_t rconflict) 3130 { 3131 int status; 3132 vdc_io_t *vio; 3133 boolean_t check_resv_conflict = B_FALSE; 3134 3135 ASSERT(cb_type == CB_SYNC); 3136 3137 /* 3138 * Grab the lock, if blocked wait until the server 3139 * response causes us to wake up again. 
3140 */
3141 mutex_enter(&vdcp->lock);
3142 vdcp->sync_op_cnt++;
3143 while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH)
3144 cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock);
3145
3146 if (vdcp->state == VDC_STATE_DETACH) {
3147 cv_broadcast(&vdcp->sync_blocked_cv);
3148 vdcp->sync_op_cnt--;
3149 mutex_exit(&vdcp->lock);
3150 return (ENXIO);
3151 }
3152
3153 /* now block any other thread entering after us */
3154 vdcp->sync_op_blocked = B_TRUE;
3155 vdcp->sync_op_pending = B_TRUE;
3156 mutex_exit(&vdcp->lock);
3157
3158 status = vdc_send_request(vdcp, operation, addr,
3159 nbytes, slice, offset, cb_type, cb_arg, dir);
3160
3161 mutex_enter(&vdcp->lock);
3162
3163 if (status != 0) {
3164 vdcp->sync_op_pending = B_FALSE;
3165 } else {
3166 /*
3167 * Block until our transaction completes; anyone else
3168 * waiting then gets to go next.
3169 */
3170 while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH)
3171 cv_wait(&vdcp->sync_pending_cv, &vdcp->lock);
3172
3173 DMSG(vdcp, 2, ": operation returned %d\n",
3174 vdcp->sync_op_status);
3175 if (vdcp->state == VDC_STATE_DETACH) {
3176 vdcp->sync_op_pending = B_FALSE;
3177 status = ENXIO;
3178 } else {
3179 status = vdcp->sync_op_status;
3180 if (status != 0 && vdcp->failfast_interval != 0) {
3181 /*
3182 * Operation has failed and failfast is enabled.
3183 * We need to check if the failure is due to a
3184 * reservation conflict if this was requested.
3185 */
3186 check_resv_conflict = rconflict;
3187 }
3188
3189 }
3190 }
3191
3192 vdcp->sync_op_status = 0;
3193 vdcp->sync_op_blocked = B_FALSE;
3194 vdcp->sync_op_cnt--;
3195
3196 /* signal the next waiting thread */
3197 cv_signal(&vdcp->sync_blocked_cv);
3198
3199 /*
3200 * We have to check for reservation conflict after unblocking sync
3201 * operations because some sync operations will be used to do this
3202 * check.
3203 */
3204 if (check_resv_conflict) {
3205 vio = vdc_failfast_io_queue(vdcp, NULL);
3206 while (vio->vio_qtime != 0)
3207 cv_wait(&vdcp->failfast_io_cv, &vdcp->lock);
3208 kmem_free(vio, sizeof (vdc_io_t));
3209 }
3210
3211 mutex_exit(&vdcp->lock);
3212
3213 return (status);
3214 }
3215
3216
3217 /*
3218 * Function:
3219 * vdc_drain_response()
3220 *
3221 * Description:
3222 * When a guest is panicking, the completion of requests needs to be
3223 * handled differently because interrupts are disabled and vdc
3224 * will not get messages. We have to poll for the messages instead.
3225 *
3226 * Note: since we are panicking we don't implement the io:::done
3227 * DTrace probe or update the I/O statistics kstats.
3228 *
3229 * Arguments:
3230 * vdc - soft state pointer for this instance of the device driver.
3231 * buf - if buf is NULL then we drain all responses, otherwise we
3232 * poll until we receive an ACK/NACK for the specific I/O
3233 * described by buf.
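 *
 * (Usage sketch, illustrative: a panic-time dump path could drain
 * every outstanding response with
 *
 *	(void) vdc_drain_response(vdc, NULL);
 *
 * while a panic-time strategy call would pass its own buf and return
 * once that I/O is acknowledged.)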
3234 *
3235 * Return Code:
3236 * 0 - Success
3237 */
3238 static int
3239 vdc_drain_response(vdc_t *vdc, struct buf *buf)
3240 {
3241 int rv, idx, retries;
3242 size_t msglen;
3243 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */
3244 vio_dring_msg_t dmsg;
3245 struct buf *mbuf;
3246
3247 mutex_enter(&vdc->lock);
3248
3249 retries = 0;
3250 for (;;) {
3251 msglen = sizeof (dmsg);
3252 rv = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)&dmsg,
3253 &msglen);
3254 if (rv) {
3255 rv = EINVAL;
3256 break;
3257 }
3258
3259 /*
3260 * if there are no packets wait and check again
3261 */
3262 if ((rv == 0) && (msglen == 0)) {
3263 if (retries++ > vdc_dump_retries) {
3264 rv = EAGAIN;
3265 break;
3266 }
3267
3268 drv_usecwait(vdc_usec_timeout_dump);
3269 continue;
3270 }
3271
3272 /*
3273 * Ignore all messages that are not ACKs/NACKs to
3274 * DRing requests.
3275 */
3276 if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) ||
3277 (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) {
3278 DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n",
3279 dmsg.tag.vio_msgtype,
3280 dmsg.tag.vio_subtype,
3281 dmsg.tag.vio_subtype_env);
3282 continue;
3283 }
3284
3285 /*
3286 * set the appropriate return value for the current request.
3287 */
3288 switch (dmsg.tag.vio_subtype) {
3289 case VIO_SUBTYPE_ACK:
3290 rv = 0;
3291 break;
3292 case VIO_SUBTYPE_NACK:
3293 rv = EAGAIN;
3294 break;
3295 default:
3296 continue;
3297 }
3298
3299 idx = dmsg.start_idx;
3300 if (idx >= vdc->dring_len) {
3301 DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n",
3302 vdc->instance, idx);
3303 continue;
3304 }
3305 ldep = &vdc->local_dring[idx];
3306 if (ldep->dep->hdr.dstate != VIO_DESC_DONE) {
3307 DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n",
3308 vdc->instance, idx, ldep->dep->hdr.dstate);
3309 continue;
3310 }
3311
3312 if (buf != NULL && ldep->cb_type == CB_STRATEGY) {
3313 mbuf = ldep->cb_arg;
3314 mbuf->b_resid = mbuf->b_bcount -
3315 ldep->dep->payload.nbytes;
3316 bioerror(mbuf, (rv == EAGAIN)? EIO:
3317 ldep->dep->payload.status);
3318 biodone(mbuf);
3319 } else {
3320 mbuf = NULL;
3321 }
3322
3323 DMSG(vdc, 1, "[%d] Depopulating idx=%d state=%d\n",
3324 vdc->instance, idx, ldep->dep->hdr.dstate);
3325
3326 rv = vdc_depopulate_descriptor(vdc, idx);
3327 if (rv) {
3328 DMSG(vdc, 0,
3329 "[%d] Entry @ %d - depopulate failed ..\n",
3330 vdc->instance, idx);
3331 }
3332
3333 /* we have received an ACK/NACK for the specified buffer */
3334 if (buf != NULL && buf == mbuf) {
3335 rv = 0;
3336 break;
3337 }
3338
3339 /* if this is the last descriptor - break out of loop */
3340 if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx) {
3341 if (buf != NULL) {
3342 /*
3343 * We never got a response for the specified
3344 * buffer so we fail the I/O.
3345 */
3346 bioerror(buf, EIO);
3347 biodone(buf);
3348 }
3349 break;
3350 }
3351 }
3352
3353 mutex_exit(&vdc->lock);
3354 DMSG(vdc, 0, "End idx=%d\n", idx);
3355
3356 return (rv);
3357 }
3358
3359
3360 /*
3361 * Function:
3362 * vdc_depopulate_descriptor()
3363 *
3364 * Description:
3365 * Release the resources tied to the given descriptor ring entry
 * once the vDisk server is done with it: copy back any bounce
 * buffer, unbind the memory handle and mark the entry free.
 *
3366 * Arguments:
3367 * vdc - soft state pointer for this instance of the device driver.
3368 * idx - Index of the Descriptor Ring entry being modified
3369 *
3370 * Return Code:
3371 * 0 - Success
3372 */
3373 static int
3374 vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx)
3375 {
3376 vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */
3377 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */
3378 int status = ENXIO;
3379 int rv = 0;
3380
3381 ASSERT(vdc != NULL);
3382 ASSERT(idx < vdc->dring_len);
3383 ldep = &vdc->local_dring[idx];
3384 ASSERT(ldep != NULL);
3385 ASSERT(MUTEX_HELD(&vdc->lock));
3386
3387 DTRACE_PROBE2(depopulate, int, vdc->instance, vdc_local_desc_t *, ldep);
3388 DMSG(vdc, 2, ": idx = %d\n", idx);
3389
3390 dep = ldep->dep;
3391 ASSERT(dep != NULL);
3392 ASSERT((dep->hdr.dstate == VIO_DESC_DONE) ||
3393 (dep->payload.status == ECANCELED));
3394
3395 VDC_MARK_DRING_ENTRY_FREE(vdc, idx);
3396
3397 ldep->is_free = B_TRUE;
3398 status = dep->payload.status;
3399 DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status);
3400
3401 /*
3402 * If no buffers were used to transfer information to the server when
3403 * populating the descriptor then no memory handles need to be unbound
3404 * and we can return now.
3405 */
3406 if (ldep->nbytes == 0) {
3407 cv_signal(&vdc->dring_free_cv);
3408 return (status);
3409 }
3410
3411 /*
3412 * If the upper layer passed in a misaligned address we copied the
3413 * data into an aligned buffer before sending it to LDC - we now
3414 * copy it back to the original buffer.
3415 */
3416 if (ldep->align_addr) {
3417 ASSERT(ldep->addr != NULL);
3418
3419 if (dep->payload.nbytes > 0)
3420 bcopy(ldep->align_addr, ldep->addr,
3421 dep->payload.nbytes);
3422 kmem_free(ldep->align_addr,
3423 sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8));
3424 ldep->align_addr = NULL;
3425 }
3426
3427 rv = ldc_mem_unbind_handle(ldep->desc_mhdl);
3428 if (rv != 0) {
3429 DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)",
3430 vdc->instance, ldep->desc_mhdl, idx, rv);
3431 /*
3432 * The error returned by the vDisk server is more informative
3433 * and thus has a higher priority but if it isn't set we ensure
3434 * that this function returns an error.
3435 */
3436 if (status == 0)
3437 status = EINVAL;
3438 }
3439
3440 cv_signal(&vdc->membind_cv);
3441 cv_signal(&vdc->dring_free_cv);
3442
3443 return (status);
3444 }
3445
3446 /*
3447 * Function:
3448 * vdc_populate_mem_hdl()
3449 *
3450 * Description:
3451 * Bind the buffer of the given local descriptor ring entry to an
 * LDC memory handle so that the vDisk server can access the data,
 * bouncing misaligned buffers through an aligned copy.
 *
3452 * Arguments:
3453 * vdcp - soft state pointer for this instance of the device driver.
3454 * ldep - the local descriptor ring entry to populate.
3458 *
3459 * Return Code:
3460 * 0 - Success
 * EAGAIN - The buffer could not be bound
3461 */
3462 static int
3463 vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep)
3464 {
3465 vd_dring_entry_t *dep = NULL;
3466 ldc_mem_handle_t mhdl;
3467 caddr_t vaddr;
3468 size_t nbytes;
3469 uint8_t perm = LDC_MEM_RW;
3470 uint8_t maptype;
3471 int rv = 0;
3472 int i;
3473
3474 ASSERT(vdcp != NULL);
3475
3476 dep = ldep->dep;
3477 mhdl = ldep->desc_mhdl;
3478
3479 switch (ldep->dir) {
3480 case VIO_read_dir:
3481 perm = LDC_MEM_W;
3482 break;
3483
3484 case VIO_write_dir:
3485 perm = LDC_MEM_R;
3486 break;
3487
3488 case VIO_both_dir:
3489 perm = LDC_MEM_RW;
3490 break;
3491
3492 default:
3493 ASSERT(0); /* catch bad programming in vdc */
3494 }
3495
3496 /*
3497 * LDC expects any addresses passed in to be 8-byte aligned.
We need
3498 * to copy the contents of any misaligned buffers to a newly allocated
3499 * buffer and bind it instead (and copy the contents back to the
3500 * original buffer passed in when depopulating the descriptor)
3501 */
3502 vaddr = ldep->addr;
3503 nbytes = ldep->nbytes;
3504 if (((uint64_t)vaddr & 0x7) != 0) {
3505 ASSERT(ldep->align_addr == NULL);
3506 ldep->align_addr =
3507 kmem_alloc(sizeof (caddr_t) *
3508 P2ROUNDUP(nbytes, 8), KM_SLEEP);
3509 DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating "
3510 "(buf=%p nb=%ld op=%d)\n",
3511 vdcp->instance, (void *)vaddr, (void *)ldep->align_addr,
3512 nbytes, ldep->operation);
3513 if (perm != LDC_MEM_W)
3514 bcopy(vaddr, ldep->align_addr, nbytes);
3515 vaddr = ldep->align_addr;
3516 }
3517
3518 maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP;
3519 rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8),
3520 maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies);
3521 DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n",
3522 vdcp->instance, dep->payload.ncookies);
3523 if (rv != 0) {
3524 DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle "
3525 "(mhdl=%p, buf=%p, err=%d)\n",
3526 vdcp->instance, (void *)mhdl, (void *)vaddr, rv);
3527 if (ldep->align_addr) {
3528 kmem_free(ldep->align_addr,
3529 sizeof (caddr_t) * P2ROUNDUP(nbytes, 8));
3530 ldep->align_addr = NULL;
3531 }
3532 return (EAGAIN);
3533 }
3534
3535 /*
3536 * Get the other cookies (if any).
3537 */
3538 for (i = 1; i < dep->payload.ncookies; i++) {
3539 rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]);
3540 if (rv != 0) {
3541 (void) ldc_mem_unbind_handle(mhdl);
3542 DMSG(vdcp, 0, "?[%d] Failed to get next cookie "
3543 "(mhdl=%lx cnum=%d), err=%d",
3544 vdcp->instance, mhdl, i, rv);
3545 if (ldep->align_addr) {
3546 kmem_free(ldep->align_addr,
3547 sizeof (caddr_t) * ldep->nbytes);
3548 ldep->align_addr = NULL;
3549 }
3550 return (EAGAIN);
3551 }
3552 }
3553
3554 return (rv);
3555 }
3556
3557 /*
3558 * Interrupt handlers for messages from LDC
3559 */
3560
3561 /*
3562 * Function:
3563 * vdc_handle_cb()
3564 *
3565 * Description:
3566 * LDC callback, invoked when the channel comes up, when data
 * arrives and when the channel is reset or goes down.
 *
3567 * Arguments:
3568 * event - Type of event (LDC_EVT_xxx) that triggered the callback
3569 * arg - the vdc per-server info structure (vdc_server_t) for
 * the channel on which the event occurred.
3570 *
3571 * Return Code:
3572 * 0 - Success
3573 */
3574 static uint_t
3575 vdc_handle_cb(uint64_t event, caddr_t arg)
3576 {
3577 ldc_status_t ldc_state;
3578 int rv = 0;
3579 vdc_server_t *srvr = (vdc_server_t *)(void *)arg;
3580 vdc_t *vdc = srvr->vdcp;
3581
3582 ASSERT(vdc != NULL);
3583
3584 DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num);
3585
3586 /* If callback is not for the current server, ignore it */
3587 mutex_enter(&vdc->lock);
3588
3589 if (vdc->curr_server != srvr) {
3590 DMSG(vdc, 0, "[%d] Ignoring event 0x%lx for port@%ld\n",
3591 vdc->instance, event, srvr->id);
3592 mutex_exit(&vdc->lock);
3593 return (LDC_SUCCESS);
3594 }
3595
3596 /*
3597 * Depending on the type of event that triggered this callback,
3598 * we modify the handshake state or read the data.
3599 *
3600 * NOTE: not done as a switch() as event could be triggered by
3601 * a state change and a read request. Also the ordering of the
3602 * check for the event types is deliberate.
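 *
 * (For example, a single callback may carry a combined event word
 * such as (LDC_EVT_UP | LDC_EVT_READ), so each bit is tested in turn
 * below rather than switch()ed on.)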
3603 */
3604 if (event & LDC_EVT_UP) {
3605 DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance);
3606
3607 /* get LDC state */
3608 rv = ldc_status(srvr->ldc_handle, &ldc_state);
3609 if (rv != 0) {
3610 DMSG(vdc, 0, "[%d] Couldn't get LDC status %d",
3611 vdc->instance, rv);
3612 mutex_exit(&vdc->lock);
3613 return (LDC_SUCCESS);
3614 }
3615 if (srvr->ldc_state != LDC_UP &&
3616 ldc_state == LDC_UP) {
3617 /*
3618 * Reset the transaction sequence numbers when
3619 * LDC comes up. We then kick off the handshake
3620 * negotiation with the vDisk server.
3621 */
3622 vdc->seq_num = 1;
3623 vdc->seq_num_reply = 0;
3624 srvr->ldc_state = ldc_state;
3625 cv_signal(&vdc->initwait_cv);
3626 }
3627 }
3628
3629 if (event & LDC_EVT_READ) {
3630 DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance);
3631 mutex_enter(&vdc->read_lock);
3632 cv_signal(&vdc->read_cv);
3633 vdc->read_state = VDC_READ_PENDING;
3634 mutex_exit(&vdc->read_lock);
3635 mutex_exit(&vdc->lock);
3636
3637 /* that's all we have to do - no need to handle DOWN/RESET */
3638 return (LDC_SUCCESS);
3639 }
3640
3641 if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) {
3642
3643 DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance);
3644
3645 /*
3646 * Need to wake up any readers so they will
3647 * detect that a reset has occurred.
3648 */
3649 mutex_enter(&vdc->read_lock);
3650 if ((vdc->read_state == VDC_READ_WAITING) ||
3651 (vdc->read_state == VDC_READ_RESET))
3652 cv_signal(&vdc->read_cv);
3653 vdc->read_state = VDC_READ_RESET;
3654 mutex_exit(&vdc->read_lock);
3655
3656 /* wake up any threads waiting for connection to come up */
3657 if (vdc->state == VDC_STATE_INIT_WAITING) {
3658 vdc->state = VDC_STATE_RESETTING;
3659 cv_signal(&vdc->initwait_cv);
3660 }
3661
3662 }
3663
3664 mutex_exit(&vdc->lock);
3665
3666 if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ))
3667 DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received",
3668 vdc->instance, event);
3669
3670 return (LDC_SUCCESS);
3671 }
3672
3673 /*
3674 * Function:
3675 * vdc_wait_for_response()
3676 *
3677 * Description:
3678 * Block waiting for a response from the server. If there is
3679 * no data, the thread blocks on the read_cv, which is signalled
3680 * by the callback when an LDC_EVT_READ occurs.
3681 *
3682 * Arguments:
3683 * vdcp - soft state pointer for this instance of the device driver.
 * msgp - buffer in which the received message is returned.
3684 *
3685 * Return Code:
3686 * 0 - Success
 * ENOMSG - The message received was too short or had a bad session ID
3687 */
3688 static int
3689 vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp)
3690 {
3691 size_t nbytes = sizeof (*msgp);
3692 int status;
3693
3694 ASSERT(vdcp != NULL);
3695
3696 DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance);
3697
3698 status = vdc_recv(vdcp, msgp, &nbytes);
3699 DMSG(vdcp, 3, "vdc_read() done.. status=0x%x size=0x%x\n",
3700 status, (int)nbytes);
3701 if (status) {
3702 DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n",
3703 vdcp->instance, status);
3704 return (status);
3705 }
3706
3707 if (nbytes < sizeof (vio_msg_tag_t)) {
3708 DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n",
3709 vdcp->instance, sizeof (vio_msg_tag_t), nbytes);
3710 return (ENOMSG);
3711 }
3712
3713 DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance,
3714 msgp->tag.vio_msgtype,
3715 msgp->tag.vio_subtype,
3716 msgp->tag.vio_subtype_env);
3717
3718 /*
3719 * Verify the Session ID of the message
3720 *
3721 * Every message after the Version has been negotiated should
3722 * have the correct session ID set.
3723 */ 3724 if ((msgp->tag.vio_sid != vdcp->session_id) && 3725 (msgp->tag.vio_subtype_env != VIO_VER_INFO)) { 3726 DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, " 3727 "expected 0x%lx [seq num %lx @ %d]", 3728 vdcp->instance, msgp->tag.vio_sid, 3729 vdcp->session_id, 3730 ((vio_dring_msg_t *)msgp)->seq_num, 3731 ((vio_dring_msg_t *)msgp)->start_idx); 3732 return (ENOMSG); 3733 } 3734 return (0); 3735 } 3736 3737 3738 /* 3739 * Function: 3740 * vdc_resubmit_backup_dring() 3741 * 3742 * Description: 3743 * Resubmit each descriptor in the backed up dring to 3744 * vDisk server. The Dring was backed up during connection 3745 * reset. 3746 * 3747 * Arguments: 3748 * vdcp - soft state pointer for this instance of the device driver. 3749 * 3750 * Return Code: 3751 * 0 - Success 3752 */ 3753 static int 3754 vdc_resubmit_backup_dring(vdc_t *vdcp) 3755 { 3756 int processed = 0; 3757 int count; 3758 int b_idx; 3759 int rv = 0; 3760 int dring_size; 3761 int op; 3762 vio_msg_t vio_msg; 3763 vdc_local_desc_t *curr_ldep; 3764 3765 ASSERT(MUTEX_NOT_HELD(&vdcp->lock)); 3766 ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING); 3767 3768 if (vdcp->local_dring_backup == NULL) { 3769 /* the pending requests have already been processed */ 3770 return (0); 3771 } 3772 3773 DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n", 3774 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3775 3776 /* 3777 * Walk the backup copy of the local descriptor ring and 3778 * resubmit all the outstanding transactions. 3779 */ 3780 b_idx = vdcp->local_dring_backup_tail; 3781 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3782 3783 curr_ldep = &(vdcp->local_dring_backup[b_idx]); 3784 3785 /* only resubmit outstanding transactions */ 3786 if (!curr_ldep->is_free) { 3787 /* 3788 * If we are retrying a block read/write operation we 3789 * need to update the I/O statistics to indicate that 3790 * the request is being put back on the waitq to be 3791 * serviced (it will have been taken off after the 3792 * error was reported). 3793 */ 3794 mutex_enter(&vdcp->lock); 3795 op = curr_ldep->operation; 3796 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3797 DTRACE_IO1(start, buf_t *, curr_ldep->cb_arg); 3798 VD_KSTAT_WAITQ_ENTER(vdcp); 3799 } 3800 3801 DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx); 3802 rv = vdc_populate_descriptor(vdcp, op, 3803 curr_ldep->addr, curr_ldep->nbytes, 3804 curr_ldep->slice, curr_ldep->offset, 3805 curr_ldep->cb_type, curr_ldep->cb_arg, 3806 curr_ldep->dir); 3807 3808 if (rv) { 3809 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3810 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3811 VD_KSTAT_WAITQ_EXIT(vdcp); 3812 DTRACE_IO1(done, buf_t *, 3813 curr_ldep->cb_arg); 3814 } 3815 DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n", 3816 vdcp->instance, b_idx); 3817 mutex_exit(&vdcp->lock); 3818 goto done; 3819 } 3820 3821 /* 3822 * If this is a block read/write we update the I/O 3823 * statistics kstat to indicate that the request 3824 * has been sent back to the vDisk server and should 3825 * now be put on the run queue. 3826 */ 3827 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3828 DTRACE_PROBE1(send, buf_t *, curr_ldep->cb_arg); 3829 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 3830 } 3831 mutex_exit(&vdcp->lock); 3832 3833 /* Wait for the response message. 
*/
3834 DMSG(vdcp, 1, "waiting for response to idx=%x\n",
3835 b_idx);
3836 rv = vdc_wait_for_response(vdcp, &vio_msg);
3837 if (rv) {
3838 /*
3839 * If this is a block read/write we update
3840 * the I/O statistics kstat to take it
3841 * off the run queue.
3842 */
3843 mutex_enter(&vdcp->lock);
3844 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) {
3845 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs);
3846 VD_KSTAT_RUNQ_EXIT(vdcp);
3847 DTRACE_IO1(done, buf_t *,
3848 curr_ldep->cb_arg);
3849 }
3850 DMSG(vdcp, 1, "[%d] wait_for_response "
3851 "returned err=%d\n", vdcp->instance,
3852 rv);
3853 mutex_exit(&vdcp->lock);
3854 goto done;
3855 }
3856
3857 DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx);
3858 rv = vdc_process_data_msg(vdcp, &vio_msg);
3859 if (rv) {
3860 DMSG(vdcp, 1, "[%d] process_data_msg "
3861 "returned err=%d\n", vdcp->instance,
3862 rv);
3863 goto done;
3864 }
3865 /*
3866 * Mark this entry as free so that we will not resubmit
3867 * this "done" request again, if we were to use the same
3868 * backup_dring again in future. This could happen when
3869 * a reset happens while processing the backup_dring.
3870 */
3871 curr_ldep->is_free = B_TRUE;
3872 processed++;
3873 }
3874
3875 /* get the next element to submit */
3876 if (++b_idx >= vdcp->local_dring_backup_len)
3877 b_idx = 0;
3878 }
3879
3880 /* all done - now clear up pending dring copy */
3881 dring_size = vdcp->local_dring_backup_len *
3882 sizeof (vdcp->local_dring_backup[0]);
3883
3884 (void) kmem_free(vdcp->local_dring_backup, dring_size);
3885
3886 vdcp->local_dring_backup = NULL;
3887
3888 done:
3889 DTRACE_PROBE2(processed, int, processed, vdc_t *, vdcp);
3890
3891 return (rv);
3892 }
3893
3894 /*
3895 * Function:
3896 * vdc_cancel_backup_dring
3897 *
3898 * Description:
3899 * Cancel each outstanding descriptor in the backed-up dring and
3900 * fail the waiting request. The dring was backed up during
 * connection reset.
3901 *
3902 * Arguments:
3903 * vdcp - soft state pointer for this instance of the device driver.
3904 *
3905 * Return Code:
3906 * None
3907 */
3908 void
3909 vdc_cancel_backup_dring(vdc_t *vdcp)
3910 {
3911 vdc_local_desc_t *ldep;
3912 struct buf *bufp;
3913 int count;
3914 int b_idx;
3915 int dring_size;
3916 int cancelled = 0;
3917
3918 ASSERT(MUTEX_HELD(&vdcp->lock));
3919 ASSERT(vdcp->state == VDC_STATE_INIT ||
3920 vdcp->state == VDC_STATE_INIT_WAITING ||
3921 vdcp->state == VDC_STATE_NEGOTIATE ||
3922 vdcp->state == VDC_STATE_RESETTING);
3923
3924 if (vdcp->local_dring_backup == NULL) {
3925 /* the pending requests have already been processed */
3926 return;
3927 }
3928
3929 DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n",
3930 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail);
3931
3932 /*
3933 * Walk the backup copy of the local descriptor ring and
3934 * cancel all the outstanding transactions.
3935 */
3936 b_idx = vdcp->local_dring_backup_tail;
3937 for (count = 0; count < vdcp->local_dring_backup_len; count++) {
3938
3939 ldep = &(vdcp->local_dring_backup[b_idx]);
3940
3941 /* only cancel outstanding transactions */
3942 if (!ldep->is_free) {
3943
3944 DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx);
3945 cancelled++;
3946
3947 /*
3948 * All requests have already been cleared from the
3949 * local descriptor ring and the LDC channel has been
3950 * reset so we will never get any reply for these
3951 * requests. Now we just have to notify threads waiting
3952 * for replies that the request has failed.
3953 */
3954 switch (ldep->cb_type) {
3955 case CB_SYNC:
3956 ASSERT(vdcp->sync_op_pending);
3957 vdcp->sync_op_status = EIO;
3958 vdcp->sync_op_pending = B_FALSE;
3959 cv_signal(&vdcp->sync_pending_cv);
3960 break;
3961
3962 case CB_STRATEGY:
3963 bufp = ldep->cb_arg;
3964 ASSERT(bufp != NULL);
3965 bufp->b_resid = bufp->b_bcount;
3966 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
3967 VD_KSTAT_RUNQ_EXIT(vdcp);
3968 DTRACE_IO1(done, buf_t *, bufp);
3969 bioerror(bufp, EIO);
3970 biodone(bufp);
3971 break;
3972
3973 default:
3974 ASSERT(0);
3975 }
3976
3977 }
3978
3979 /* get the next element to cancel */
3980 if (++b_idx >= vdcp->local_dring_backup_len)
3981 b_idx = 0;
3982 }
3983
3984 /* all done - now clear up pending dring copy */
3985 dring_size = vdcp->local_dring_backup_len *
3986 sizeof (vdcp->local_dring_backup[0]);
3987
3988 (void) kmem_free(vdcp->local_dring_backup, dring_size);
3989
3990 vdcp->local_dring_backup = NULL;
3991
3992 DTRACE_PROBE2(cancelled, int, cancelled, vdc_t *, vdcp);
3993 }
3994
3995 /*
3996 * Function:
3997 * vdc_connection_timeout
3998 *
3999 * Description:
4000 * This function is invoked if the timeout set to establish the connection
4001 * with vds expires. This will happen if we spend too much time in the
4002 * VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. In that case we
4003 * cancel any pending requests and mark them as failed.
4004 *
4005 * If the timeout does not expire, it will be cancelled when we reach the
4006 * VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This function can
4007 * be invoked while we are in the VDC_STATE_HANDLE_PENDING or
4008 * VDC_STATE_RESETTING state in which case we do nothing because the
4009 * timeout is being cancelled.
4010 *
4011 * Arguments:
4012 * arg - argument of the timeout function, actually a soft state
4013 * pointer for the instance of the device driver.
4014 *
4015 * Return Code:
4016 * None
4017 */
4018 void
4019 vdc_connection_timeout(void *arg)
4020 {
4021 vdc_t *vdcp = (vdc_t *)arg;
4022
4023 mutex_enter(&vdcp->lock);
4024
4025 if (vdcp->state == VDC_STATE_HANDLE_PENDING ||
4026 vdcp->state == VDC_STATE_DETACH) {
4027 /*
4028 * The connection has just been re-established or
4029 * we are detaching.
4030 */
4031 vdcp->ctimeout_reached = B_FALSE;
4032 mutex_exit(&vdcp->lock);
4033 return;
4034 }
4035
4036 vdcp->ctimeout_reached = B_TRUE;
4037
4038 /* notify requests waiting for sending */
4039 cv_broadcast(&vdcp->running_cv);
4040
4041 /* cancel requests waiting for a result */
4042 vdc_cancel_backup_dring(vdcp);
4043
4044 mutex_exit(&vdcp->lock);
4045
4046 cmn_err(CE_NOTE, "[%d] connection to service domain timeout",
4047 vdcp->instance);
4048 }
4049
4050 /*
4051 * Function:
4052 * vdc_backup_local_dring()
4053 *
4054 * Description:
4055 * Backup the current dring in the event of a reset. The Dring
4056 * transactions will be resubmitted to the server when the
4057 * connection is restored.
4058 *
4059 * Arguments:
4060 * vdcp - soft state pointer for this instance of the device driver.
4061 *
4062 * Return Code:
4063 * NONE
4064 */
4065 static void
4066 vdc_backup_local_dring(vdc_t *vdcp)
4067 {
4068 int dring_size;
4069
4070 ASSERT(MUTEX_HELD(&vdcp->lock));
4071 ASSERT(vdcp->state == VDC_STATE_RESETTING);
4072
4073 /*
4074 * If the backup dring is still around, it means
4075 * that the last restore did not complete. However,
4076 * since we never got back into the running state,
4077 * the backup copy we have is still valid.
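 *
 * (Lifecycle sketch: the reset path calls vdc_backup_local_dring();
 * once the connection is re-established, vdc_resubmit_backup_dring()
 * replays the outstanding entries and frees the backup, while
 * vdc_cancel_backup_dring() fails and frees them if the connection
 * timed out instead.)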
4078 */
4079 if (vdcp->local_dring_backup != NULL) {
4080 DMSG(vdcp, 1, "reusing local descriptor ring backup "
4081 "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len,
4082 vdcp->local_dring_backup_tail);
4083 return;
4084 }
4085
4086 /*
4087 * The backup dring can be NULL and the local dring may not be
4088 * initialized. This can happen if we had a reset while establishing
4089 * a new connection but after the connection has timed out. In that
4090 * case the backup dring is NULL because the requests have been
4091 * cancelled and the reset occurred before the local dring was
4092 * initialized.
4093 */
4094 if (!(vdcp->initialized & VDC_DRING_LOCAL))
4095 return;
4096
4097 DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, "
4098 "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx);
4099
4100 dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]);
4101
4102 vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP);
4103 bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size);
4104
4105 vdcp->local_dring_backup_tail = vdcp->dring_curr_idx;
4106 vdcp->local_dring_backup_len = vdcp->dring_len;
4107 }
4108
4109 static void
4110 vdc_switch_server(vdc_t *vdcp)
4111 {
4112 int rv;
4113 vdc_server_t *curr_server, *new_server;
4114
4115 ASSERT(MUTEX_HELD(&vdcp->lock));
4116
4117 /* if there is only one server, return */
4118 if (vdcp->num_servers == 1) {
4119 return;
4120 }
4121
4122 /* Get current and next server */
4123 curr_server = vdcp->curr_server;
4124 new_server =
4125 (curr_server->next) ? curr_server->next : vdcp->server_list;
4126 ASSERT(curr_server != new_server);
4127
4128 /* bring current server's channel down */
4129 rv = ldc_down(curr_server->ldc_handle);
4130 if (rv) {
4131 DMSG(vdcp, 0, "[%d] Cannot bring channel down, port %ld\n",
4132 vdcp->instance, curr_server->id);
4133 return;
4134 }
4135
4136 /* switch the server */
4137 vdcp->curr_server = new_server;
4138
4139 DMSG(vdcp, 0, "[%d] Switched to next vdisk server, port@%ld, ldc@%ld\n",
4140 vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id);
4141 }
4142
4143 /* -------------------------------------------------------------------------- */
4144
4145 /*
4146 * The following functions process the incoming messages from vds
4147 */
4148
4149 /*
4150 * Function:
4151 * vdc_process_msg_thread()
4152 *
4153 * Description:
4154 * Main VDC message processing thread. Each vDisk instance has
4155 * its own copy of this thread. This thread triggers all the
4156 * handshakes and data exchange with the server. It also
4157 * handles all channel resets.
4158 *
4160 * Arguments:
4161 * vdc - soft state pointer for this instance of the device driver.
4162 *
4163 * Return Code:
4164 * None
4165 */
4166 static void
4167 vdc_process_msg_thread(vdc_t *vdcp)
4168 {
4169 int status;
4170 int ctimeout;
4171 timeout_id_t tmid = 0;
4172 clock_t ldcup_timeout = 0;
4173
4174 mutex_enter(&vdcp->lock);
4175
4176 for (;;) {
4177
4178 #define Q(_s) (vdcp->state == _s) ? #_s :
4179 DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state,
4180 Q(VDC_STATE_INIT)
4181 Q(VDC_STATE_INIT_WAITING)
4182 Q(VDC_STATE_NEGOTIATE)
4183 Q(VDC_STATE_HANDLE_PENDING)
4184 Q(VDC_STATE_RUNNING)
4185 Q(VDC_STATE_RESETTING)
4186 Q(VDC_STATE_DETACH)
4187 "UNKNOWN");
4188
4189 switch (vdcp->state) {
4190 case VDC_STATE_INIT:
4191
4192 /*
4193 * If requested, start a timeout to check if the
4194 * connection with vds is established in the
4195 * specified delay. If the timeout expires, we
4196 * will cancel any pending request.
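 *
 * (Worked example, illustrative: with a ctimeout of 60 seconds the
 * timeout() call below arms the timer for
 * 60 * drv_usectohz(MICROSEC) ticks, i.e. 60 seconds' worth of
 * clock ticks whatever the tick rate.)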
4197 *
4198 * If a reset has occurred while establishing
4199 * the connection, we already have a timeout armed
4200 * and in that case we don't need to arm a new one.
4201 *
4202 * The same rule applies when there are multiple vds servers.
4203 * If either a connection cannot be established or
4204 * the handshake times out, the connection thread will
4205 * try another server. The 'ctimeout' will report
4206 * back an error after it expires irrespective of
4207 * whether the vdisk is trying to connect to just
4208 * one or multiple servers.
4209 */
4210 ctimeout = (vdc_timeout != 0)?
4211 vdc_timeout : vdcp->curr_server->ctimeout;
4212
4213 if (ctimeout != 0 && tmid == 0) {
4214 tmid = timeout(vdc_connection_timeout, vdcp,
4215 ctimeout * drv_usectohz(MICROSEC));
4216 }
4217
4218 /* Check if we are re-initializing repeatedly */
4219 if (vdcp->hshake_cnt > vdc_hshake_retries &&
4220 vdcp->lifecycle != VDC_LC_ONLINE) {
4221
4222 DMSG(vdcp, 0, "[%d] too many handshakes, cnt=%d",
4223 vdcp->instance, vdcp->hshake_cnt);
4224 cmn_err(CE_NOTE, "[%d] disk access failed.\n",
4225 vdcp->instance);
4226 vdcp->state = VDC_STATE_DETACH;
4227 break;
4228 }
4229
4230 /* Switch to STATE_DETACH if drv is detaching */
4231 if (vdcp->lifecycle == VDC_LC_DETACHING) {
4232 vdcp->state = VDC_STATE_DETACH;
4233 break;
4234 }
4235
4236 /* Switch server */
4237 if (vdcp->hshake_cnt > 0)
4238 vdc_switch_server(vdcp);
4239 vdcp->hshake_cnt++;
4240
4241 /* Bring up connection with vds via LDC */
4242 status = vdc_start_ldc_connection(vdcp);
4243 if (status != EINVAL) {
4244 vdcp->state = VDC_STATE_INIT_WAITING;
4245 }
4246 break;
4247
4248 case VDC_STATE_INIT_WAITING:
4249
4250 /* if channel is UP, start negotiation */
4251 if (vdcp->curr_server->ldc_state == LDC_UP) {
4252 vdcp->state = VDC_STATE_NEGOTIATE;
4253 break;
4254 }
4255
4256 /* check if only one server exists */
4257 if (vdcp->num_servers == 1) {
4258 cv_wait(&vdcp->initwait_cv, &vdcp->lock);
4259 } else {
4260 /*
4261 * Wait for LDC_UP; if it times out, switch
4262 * to another server.
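 *
 * (The bounded wait below lasts vdc_ldcup_timeout seconds,
 * converted to lbolt ticks with drv_usectohz().)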
4263 */ 4264 ldcup_timeout = ddi_get_lbolt() + 4265 (vdc_ldcup_timeout * 4266 drv_usectohz(MICROSEC)); 4267 status = cv_timedwait(&vdcp->initwait_cv, 4268 &vdcp->lock, ldcup_timeout); 4269 if (status == -1 && 4270 vdcp->state == VDC_STATE_INIT_WAITING && 4271 vdcp->curr_server->ldc_state != LDC_UP) { 4272 /* timed out & still waiting */ 4273 vdcp->state = VDC_STATE_INIT; 4274 break; 4275 } 4276 } 4277 4278 if (vdcp->state != VDC_STATE_INIT_WAITING) { 4279 DMSG(vdcp, 0, 4280 "state moved to %d out from under us...\n", 4281 vdcp->state); 4282 } 4283 break; 4284 4285 case VDC_STATE_NEGOTIATE: 4286 switch (status = vdc_ver_negotiation(vdcp)) { 4287 case 0: 4288 break; 4289 default: 4290 DMSG(vdcp, 0, "ver negotiate failed (%d)..\n", 4291 status); 4292 goto reset; 4293 } 4294 4295 switch (status = vdc_attr_negotiation(vdcp)) { 4296 case 0: 4297 break; 4298 default: 4299 DMSG(vdcp, 0, "attr negotiate failed (%d)..\n", 4300 status); 4301 goto reset; 4302 } 4303 4304 switch (status = vdc_dring_negotiation(vdcp)) { 4305 case 0: 4306 break; 4307 default: 4308 DMSG(vdcp, 0, "dring negotiate failed (%d)..\n", 4309 status); 4310 goto reset; 4311 } 4312 4313 switch (status = vdc_rdx_exchange(vdcp)) { 4314 case 0: 4315 vdcp->state = VDC_STATE_HANDLE_PENDING; 4316 goto done; 4317 default: 4318 DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n", 4319 status); 4320 goto reset; 4321 } 4322 reset: 4323 DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n", 4324 status); 4325 vdcp->state = VDC_STATE_RESETTING; 4326 vdcp->self_reset = B_TRUE; 4327 done: 4328 DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n", 4329 vdcp->state); 4330 break; 4331 4332 case VDC_STATE_HANDLE_PENDING: 4333 4334 if (vdcp->ctimeout_reached) { 4335 /* 4336 * The connection timeout had been reached so 4337 * pending requests have been cancelled. Now 4338 * that the connection is back we can reset 4339 * the timeout. 4340 */ 4341 ASSERT(vdcp->local_dring_backup == NULL); 4342 ASSERT(tmid != 0); 4343 tmid = 0; 4344 vdcp->ctimeout_reached = B_FALSE; 4345 vdcp->state = VDC_STATE_RUNNING; 4346 DMSG(vdcp, 0, "[%d] connection to service " 4347 "domain is up", vdcp->instance); 4348 break; 4349 } 4350 4351 mutex_exit(&vdcp->lock); 4352 if (tmid != 0) { 4353 (void) untimeout(tmid); 4354 tmid = 0; 4355 } 4356 status = vdc_resubmit_backup_dring(vdcp); 4357 mutex_enter(&vdcp->lock); 4358 4359 if (status) 4360 vdcp->state = VDC_STATE_RESETTING; 4361 else 4362 vdcp->state = VDC_STATE_RUNNING; 4363 4364 break; 4365 4366 /* enter running state */ 4367 case VDC_STATE_RUNNING: 4368 /* 4369 * Signal anyone waiting for the connection 4370 * to come on line. 
4371 */
4372 vdcp->hshake_cnt = 0;
4373 cv_broadcast(&vdcp->running_cv);
4374
4375 /* failfast has to be checked after a reset */
4376 cv_signal(&vdcp->failfast_cv);
4377
4378 /* ownership is lost during reset */
4379 if (vdcp->ownership & VDC_OWNERSHIP_WANTED)
4380 vdcp->ownership |= VDC_OWNERSHIP_RESET;
4381 cv_signal(&vdcp->ownership_cv);
4382
4383 cmn_err(CE_CONT, "?vdisk@%d is online using "
4384 "ldc@%ld,%ld\n", vdcp->instance,
4385 vdcp->curr_server->ldc_id, vdcp->curr_server->id);
4386
4387 mutex_exit(&vdcp->lock);
4388
4389 for (;;) {
4390 vio_msg_t msg;
4391 status = vdc_wait_for_response(vdcp, &msg);
4392 if (status) break;
4393
4394 DMSG(vdcp, 1, "[%d] new pkt(s) available\n",
4395 vdcp->instance);
4396 status = vdc_process_data_msg(vdcp, &msg);
4397 if (status) {
4398 DMSG(vdcp, 1, "[%d] process_data_msg "
4399 "returned err=%d\n", vdcp->instance,
4400 status);
4401 break;
4402 }
4403
4404 }
4405
4406 mutex_enter(&vdcp->lock);
4407
4408 cmn_err(CE_CONT, "?vdisk@%d is offline\n",
4409 vdcp->instance);
4410
4411 vdcp->state = VDC_STATE_RESETTING;
4412 vdcp->self_reset = B_TRUE;
4413 break;
4414
4415 case VDC_STATE_RESETTING:
4416 /*
4417 * When we reach this state, we either come from the
4418 * VDC_STATE_RUNNING state and there may be pending
4419 * requests but no timeout armed; or we come from the
4420 * VDC_STATE_INIT_WAITING, VDC_STATE_NEGOTIATE or
4421 * VDC_STATE_HANDLE_PENDING state and there are no
4422 * pending requests, or they have already been copied
4423 * into the backup dring. So we can safely keep the
4424 * connection timeout armed while we are in this state.
4425 */
4426
4427 DMSG(vdcp, 0, "Initiating channel reset "
4428 "(pending = %d)\n", (int)vdcp->threads_pending);
4429
4430 if (vdcp->self_reset) {
4431 DMSG(vdcp, 0,
4432 "[%d] calling stop_ldc_connection.\n",
4433 vdcp->instance);
4434 status = vdc_stop_ldc_connection(vdcp);
4435 vdcp->self_reset = B_FALSE;
4436 }
4437
4438 /*
4439 * Wake up all threads currently waiting for a free
4440 * dring entry and wait for them to drain.
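 * We broadcast on membind_cv and dring_free_cv and then briefly
 * drop the lock so that the waiters have a chance to run and
 * notice the reset.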
4441 */
4442 while (vdcp->threads_pending) {
4443 cv_broadcast(&vdcp->membind_cv);
4444 cv_broadcast(&vdcp->dring_free_cv);
4445 mutex_exit(&vdcp->lock);
4446 /* give the waiters enough time to wake up */
4447 delay(vdc_hz_min_ldc_delay);
4448 mutex_enter(&vdcp->lock);
4449 }
4450
4451 ASSERT(vdcp->threads_pending == 0);
4452
4453 /* Sanity check that no thread is receiving */
4454 ASSERT(vdcp->read_state != VDC_READ_WAITING);
4455
4456 vdcp->read_state = VDC_READ_IDLE;
4457
4458 vdc_backup_local_dring(vdcp);
4459
4460 /* cleanup the old d-ring */
4461 vdc_destroy_descriptor_ring(vdcp);
4462
4463 /* go and start again */
4464 vdcp->state = VDC_STATE_INIT;
4465
4466 break;
4467
4468 case VDC_STATE_DETACH:
4469 DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n",
4470 vdcp->instance);
4471
4472 /* cancel any pending timeout */
4473 mutex_exit(&vdcp->lock);
4474 if (tmid != 0) {
4475 (void) untimeout(tmid);
4476 tmid = 0;
4477 }
4478 mutex_enter(&vdcp->lock);
4479
4480 /*
4481 * Signal anyone waiting for the connection
4482 * to come online.
4483 */
4484 cv_broadcast(&vdcp->running_cv);
4485
4486 while (vdcp->sync_op_pending) {
4487 cv_signal(&vdcp->sync_pending_cv);
4488 cv_signal(&vdcp->sync_blocked_cv);
4489 mutex_exit(&vdcp->lock);
4490 /* give the waiters enough time to wake up */
4491 delay(vdc_hz_min_ldc_delay);
4492 mutex_enter(&vdcp->lock);
4493 }
4494
4495 mutex_exit(&vdcp->lock);
4496
4497 DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n",
4498 vdcp->instance);
4499 thread_exit();
4500 break;
4501 }
4502 }
4503 }
4504
4505
4506 /*
4507 * Function:
4508 * vdc_process_data_msg()
4509 *
4510 * Description:
4511 * This function is called by the message processing thread each time
4512 * a message with a msgtype of VIO_TYPE_DATA is received. It will either
4513 * be an ACK or a NACK from vds[1], which vdc handles as follows:
4514 * ACK - wake up the waiting thread
4515 * NACK - resend any messages necessary
4516 *
4517 * [1] Although the message format allows it, vds should not send a
4518 * VIO_SUBTYPE_INFO message to vdc asking it to read data; if for
4519 * some bizarre reason it does, vdc will reset the connection.
4520 *
4521 * Arguments:
4522 * vdc - soft state pointer for this instance of the device driver.
4523 * msg - the LDC message sent by vds
4524 *
4525 * Return Code:
4526 * 0 - Success.
4527 * > 0 - error value returned by LDC
4528 */
4529 static int
4530 vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg)
4531 {
4532 int status = 0;
4533 vio_dring_msg_t *dring_msg;
4534 vdc_local_desc_t *ldep = NULL;
4535 int start, end;
4536 int idx;
4537 int op;
4538
4539 dring_msg = (vio_dring_msg_t *)msg;
4540
4541 ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA);
4542 ASSERT(vdcp != NULL);
4543
4544 mutex_enter(&vdcp->lock);
4545
4546 /*
4547 * Check to see if the message has bogus data.
4548 */
4549 idx = start = dring_msg->start_idx;
4550 end = dring_msg->end_idx;
4551 if ((start >= vdcp->dring_len) ||
4552 (end >= vdcp->dring_len) || (end < -1)) {
4553 /*
4554 * Update the I/O statistics to indicate that an error occurred.
4555 * No need to update the wait/run queues as no specific read or
4556 * write request is being completed in response to this 'msg'.
4557 */
4558 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
4559 DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n",
4560 vdcp->instance, start, end);
4561 mutex_exit(&vdcp->lock);
4562 return (EINVAL);
4563 }
4564
4565 /*
4566 * Verify that the sequence number is what vdc expects.
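 * A reply is valid only if its sequence number is greater than the
 * last one vds replied to and no greater than the last one vdc has
 * generated; for example, with seq_num_reply == 10 and seq_num == 12,
 * only replies 11 and 12 are acceptable (see vdc_verify_seq_num()).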
4567 */
4568 switch (vdc_verify_seq_num(vdcp, dring_msg)) {
4569 case VDC_SEQ_NUM_TODO:
4570 break; /* keep processing this message */
4571 case VDC_SEQ_NUM_SKIP:
4572 mutex_exit(&vdcp->lock);
4573 return (0);
4574 case VDC_SEQ_NUM_INVALID:
4575 /*
4576 * Update the I/O statistics to indicate that an error occurred.
4577 * No need to update the wait/run queues as no specific read or
4578 * write request is being completed in response to this 'msg'.
4579 */
4580 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
4581 DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance);
4582 mutex_exit(&vdcp->lock);
4583 return (ENXIO);
4584 }
4585
4586 if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) {
4587 /*
4588 * Update the I/O statistics to indicate that an error occurred.
4589 *
4590 * We need to update the run queue if a read or write request
4591 * is being NACKed - otherwise there will appear to be an
4592 * indefinite outstanding request and statistics reported by
4593 * iostat(1M) will be incorrect. The transaction will be
4594 * resubmitted from the backup DRing following the reset
4595 * and the wait/run queues will be entered again.
4596 */
4597 ldep = &vdcp->local_dring[idx];
4598 op = ldep->operation;
4599 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) {
4600 DTRACE_IO1(done, buf_t *, ldep->cb_arg);
4601 VD_KSTAT_RUNQ_EXIT(vdcp);
4602 }
4603 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
4604 VDC_DUMP_DRING_MSG(dring_msg);
4605 DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance);
4606 mutex_exit(&vdcp->lock);
4607 return (EIO);
4608
4609 } else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) {
4610 /*
4611 * Update the I/O statistics to indicate that an error occurred.
4612 * No need to update the wait/run queues as no specific read or
4613 * write request is being completed in response to this 'msg'.
4614 */
4615 VD_UPDATE_ERR_STATS(vdcp, vd_protoerrs);
4616 mutex_exit(&vdcp->lock);
4617 return (EPROTO);
4618 }
4619
4620 DMSG(vdcp, 1, ": start %d end %d\n", start, end);
4621 ASSERT(start == end);
4622
4623 ldep = &vdcp->local_dring[idx];
4624
4625 DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n",
4626 ldep->dep->hdr.dstate, ldep->cb_type);
4627
4628 if (ldep->dep->hdr.dstate == VIO_DESC_DONE) {
4629 struct buf *bufp;
4630
4631 switch (ldep->cb_type) {
4632 case CB_SYNC:
4633 ASSERT(vdcp->sync_op_pending);
4634
4635 status = vdc_depopulate_descriptor(vdcp, idx);
4636 vdcp->sync_op_status = status;
4637 vdcp->sync_op_pending = B_FALSE;
4638 cv_signal(&vdcp->sync_pending_cv);
4639 break;
4640
4641 case CB_STRATEGY:
4642 bufp = ldep->cb_arg;
4643 ASSERT(bufp != NULL);
4644 bufp->b_resid =
4645 bufp->b_bcount - ldep->dep->payload.nbytes;
4646 status = ldep->dep->payload.status; /* Future:ntoh */
4647 if (status != 0) {
4648 DMSG(vdcp, 1, "strategy status=%d\n", status);
4649 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
4650 bioerror(bufp, status);
4651 }
4652
4653 (void) vdc_depopulate_descriptor(vdcp, idx);
4654
4655 DMSG(vdcp, 1,
4656 "strategy complete req=%ld bytes resp=%ld bytes\n",
4657 bufp->b_bcount, ldep->dep->payload.nbytes);
4658
4659 if (status != 0 && vdcp->failfast_interval != 0) {
4660 /*
4661 * The I/O has failed and failfast is enabled.
4662 * We need the failfast thread to check if the
4663 * failure is due to a reservation conflict.
4664 */
4665 (void) vdc_failfast_io_queue(vdcp, bufp);
4666 } else {
4667 if (status == 0) {
4668 op = (bufp->b_flags & B_READ) ?
4669 VD_OP_BREAD : VD_OP_BWRITE;
4670 VD_UPDATE_IO_STATS(vdcp, op,
4671 ldep->dep->payload.nbytes);
4672 }
4673 VD_KSTAT_RUNQ_EXIT(vdcp);
4674 DTRACE_IO1(done, buf_t *, bufp);
4675 biodone(bufp);
4676 }
4677 break;
4678
4679 default:
4680 ASSERT(0);
4681 }
4682 }
4683
4684 /* let the arrival signal propagate */
4685 mutex_exit(&vdcp->lock);
4686
4687 /* probe gives the count of how many entries were processed */
4688 DTRACE_PROBE2(processed, int, 1, vdc_t *, vdcp);
4689
4690 return (0);
4691 }
4692
4693
4694 /*
4695 * Function:
4696 * vdc_handle_ver_msg()
4697 *
4698 * Description:
4699 * Handle a version negotiation (VIO_VER_INFO) message from vds.
4700 * Arguments:
4701 * vdc - soft state pointer for this instance of the device driver.
4702 * ver_msg - LDC message sent by vDisk server
4703 *
4704 * Return Code:
4705 * 0 - Success
4706 */
4707 static int
4708 vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg)
4709 {
4710 int status = 0;
4711
4712 ASSERT(vdc != NULL);
4713 ASSERT(mutex_owned(&vdc->lock));
4714
4715 if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) {
4716 return (EPROTO);
4717 }
4718
4719 if (ver_msg->dev_class != VDEV_DISK_SERVER) {
4720 return (EINVAL);
4721 }
4722
4723 switch (ver_msg->tag.vio_subtype) {
4724 case VIO_SUBTYPE_ACK:
4725 /*
4726 * We check to see if the version returned is indeed supported.
4727 * (The server may also have adjusted the minor number downwards;
4728 * if so, 'ver_msg' will contain the actual version agreed.)
4729 */
4730 if (vdc_is_supported_version(ver_msg)) {
4731 vdc->ver.major = ver_msg->ver_major;
4732 vdc->ver.minor = ver_msg->ver_minor;
4733 ASSERT(vdc->ver.major > 0);
4734 } else {
4735 status = EPROTO;
4736 }
4737 break;
4738
4739 case VIO_SUBTYPE_NACK:
4740 /*
4741 * Call vdc_is_supported_version(), which will return the next
4742 * supported version (if any) in 'ver_msg'.
4743 */
4744 (void) vdc_is_supported_version(ver_msg);
4745 if (ver_msg->ver_major > 0) {
4746 size_t len = sizeof (*ver_msg);
4747
4748 ASSERT(vdc->ver.major > 0);
4749
4750 /* reset the necessary fields and resend */
4751 ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO;
4752 ver_msg->dev_class = VDEV_DISK;
4753
4754 status = vdc_send(vdc, (caddr_t)ver_msg, &len);
4755 DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n",
4756 vdc->instance, status);
4757 if (len != sizeof (*ver_msg))
4758 status = EBADMSG;
4759 } else {
4760 DMSG(vdc, 0, "[%d] No common version with vDisk server",
4761 vdc->instance);
4762 status = ENOTSUP;
4763 }
4764
4765 break;
4766 case VIO_SUBTYPE_INFO:
4767 /*
4768 * Handle the case where vds starts the handshake
4769 * (for now only vdc is the instigator).
4770 */
4771 status = ENOTSUP;
4772 break;
4773
4774 default:
4775 status = EINVAL;
4776 break;
4777 }
4778
4779 return (status);
4780 }
4781
4782 /*
4783 * Function:
4784 * vdc_handle_attr_msg()
4785 *
4786 * Description:
4787 * Handle an attribute negotiation (VIO_ATTR_INFO) message from vds.
4788 * Arguments:
4789 * vdc - soft state pointer for this instance of the device driver.
4790 * attr_msg - LDC message sent by vDisk server
4791 *
4792 * Return Code:
4793 * 0 - Success
4794 */
4795 static int
4796 vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg)
4797 {
4798 int status = 0;
4799
4800 ASSERT(vdc != NULL);
4801 ASSERT(mutex_owned(&vdc->lock));
4802
4803 if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) {
4804 return (EPROTO);
4805 }
4806
4807 switch (attr_msg->tag.vio_subtype) {
4808 case VIO_SUBTYPE_ACK:
4809 /*
4810 * We now verify the attributes sent by vds.
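 * The disk size, maximum transfer size, transfer mode, operations
 * mask and disk type are all sanity checked below; invalid values
 * terminate the handshake.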
4811 */
4812 if (attr_msg->vdisk_size == 0) {
4813 DMSG(vdc, 0, "[%d] Invalid disk size from vds",
4814 vdc->instance);
4815 status = EINVAL;
4816 break;
4817 }
4818
4819 if (attr_msg->max_xfer_sz == 0) {
4820 DMSG(vdc, 0, "[%d] Invalid transfer size from vds",
4821 vdc->instance);
4822 status = EINVAL;
4823 break;
4824 }
4825
4826 if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) {
4827 DMSG(vdc, 0, "[%d] Unknown disk size from vds",
4828 vdc->instance);
4829 attr_msg->vdisk_size = 0;
4830 }
4831 /* update disk, block and transfer sizes */
4832 vdc_update_size(vdc, attr_msg->vdisk_size,
4833 attr_msg->vdisk_block_size, attr_msg->max_xfer_sz);
4834 vdc->vdisk_type = attr_msg->vdisk_type;
4835 vdc->operations = attr_msg->operations;
4836 if (vio_ver_is_supported(vdc->ver, 1, 1))
4837 vdc->vdisk_media = attr_msg->vdisk_media;
4838 else
4839 vdc->vdisk_media = 0;
4840
4841 DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n",
4842 vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz);
4843 DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n",
4844 vdc->instance, vdc->block_size,
4845 attr_msg->vdisk_block_size);
4846
4847 if ((attr_msg->xfer_mode != VIO_DRING_MODE_V1_0) ||
4848 (attr_msg->vdisk_size > INT64_MAX) ||
4849 (attr_msg->operations == 0) ||
4850 (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) {
4851 DMSG(vdc, 0, "[%d] Invalid attributes from vds",
4852 vdc->instance);
4853 status = EINVAL;
4854 break;
4855 }
4856
4857 /*
4858 * Now that we have received all attributes we can create a
4859 * fake geometry for the disk.
4860 */
4861 vdc_create_fake_geometry(vdc);
4862 break;
4863
4864 case VIO_SUBTYPE_NACK:
4865 /*
4866 * vds could not handle the attributes we sent so we
4867 * stop negotiating.
4868 */
4869 status = EPROTO;
4870 break;
4871
4872 case VIO_SUBTYPE_INFO:
4873 /*
4874 * Handle the case where vds starts the handshake
4875 * (for now, vdc is the only supported instigator).
4876 */
4877 status = ENOTSUP;
4878 break;
4879
4880 default:
4881 status = ENOTSUP;
4882 break;
4883 }
4884
4885 return (status);
4886 }
4887
4888 /*
4889 * Function:
4890 * vdc_handle_dring_reg_msg()
4891 *
4892 * Description:
4893 * Handle a descriptor ring registration (VIO_DRING_REG) message from vds.
4894 * Arguments:
4895 * vdc - soft state pointer for this instance of the driver.
4896 * dring_msg - LDC message sent by vDisk server
4897 *
4898 * Return Code:
4899 * 0 - Success
4900 */
4901 static int
4902 vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg)
4903 {
4904 int status = 0;
4905
4906 ASSERT(vdc != NULL);
4907 ASSERT(mutex_owned(&vdc->lock));
4908
4909 if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) {
4910 return (EPROTO);
4911 }
4912
4913 switch (dring_msg->tag.vio_subtype) {
4914 case VIO_SUBTYPE_ACK:
4915 /* save the received dring_ident */
4916 vdc->dring_ident = dring_msg->dring_ident;
4917 DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n",
4918 vdc->instance, vdc->dring_ident);
4919 break;
4920
4921 case VIO_SUBTYPE_NACK:
4922 /*
4923 * vds could not handle the DRing info we sent so we
4924 * stop negotiating.
4925 */
4926 DMSG(vdc, 0, "[%d] server could not register DRing\n",
4927 vdc->instance);
4928 status = EPROTO;
4929 break;
4930
4931 case VIO_SUBTYPE_INFO:
4932 /*
4933 * Handle the case where vds starts the handshake
4934 * (for now only vdc is the instigator).
4935 */
4936 status = ENOTSUP;
4937 break;
4938 default:
4939 status = ENOTSUP;
4940 }
4941
4942 return (status);
4943 }
4944
4945 /*
4946 * Function:
4947 * vdc_verify_seq_num()
4948 *
4949 * Description:
4950 * This function verifies that the sequence number sent back by the vDisk
4951 * server with the latest message is what is expected (i.e. it is greater
4952 * than the last seq num replied to by the vDisk server and less than or
4953 * equal to the last seq num generated by vdc).
4954 *
4955 * It then checks the request ID to see if any requests need processing
4956 * in the DRing.
4957 *
4958 * Arguments:
4959 * vdc - soft state pointer for this instance of the driver.
4960 * dring_msg - pointer to the LDC message sent by vds
4961 *
4962 * Return Code:
4963 * VDC_SEQ_NUM_TODO - Message needs to be processed
4964 * VDC_SEQ_NUM_SKIP - Message has already been processed
4965 * VDC_SEQ_NUM_INVALID - The seq numbers are so far out of sync that
4966 * vdc cannot deal with them
4967 */
4968 static int
4969 vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg)
4970 {
4971 ASSERT(vdc != NULL);
4972 ASSERT(dring_msg != NULL);
4973 ASSERT(mutex_owned(&vdc->lock));
4974
4975 /*
4976 * Check to see if the messages were responded to in the correct
4977 * order by vds.
4978 */
4979 if ((dring_msg->seq_num <= vdc->seq_num_reply) ||
4980 (dring_msg->seq_num > vdc->seq_num)) {
4981 DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: "
4982 "%lu > expected <= %lu (last proc req %lu sent %lu)\n",
4983 vdc->instance, dring_msg->seq_num,
4984 vdc->seq_num_reply, vdc->seq_num,
4985 vdc->req_id_proc, vdc->req_id);
4986 return (VDC_SEQ_NUM_INVALID);
4987 }
4988 vdc->seq_num_reply = dring_msg->seq_num;
4989
4990 if (vdc->req_id_proc < vdc->req_id)
4991 return (VDC_SEQ_NUM_TODO);
4992 else
4993 return (VDC_SEQ_NUM_SKIP);
4994 }
4995
4996
4997 /*
4998 * Function:
4999 * vdc_is_supported_version()
5000 *
5001 * Description:
5002 * This routine checks if the major/minor version numbers specified in
5003 * 'ver_msg' are supported. If not, it finds the next version that is
5004 * in the supported version list 'vdc_version[]' and sets the fields in
5005 * 'ver_msg' to those values.
5006 *
5007 * Arguments:
5008 * ver_msg - LDC message sent by vDisk server
5009 *
5010 * Return Code:
5011 * B_TRUE - Success
5012 * B_FALSE - Version not supported
5013 */
5014 static boolean_t
5015 vdc_is_supported_version(vio_ver_msg_t *ver_msg)
5016 {
5017 int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]);
5018
5019 for (int i = 0; i < vdc_num_versions; i++) {
5020 ASSERT(vdc_version[i].major > 0);
5021 ASSERT((i == 0) ||
5022 (vdc_version[i].major < vdc_version[i-1].major));
5023
5024 /*
5025 * If the major versions match, adjust the minor version, if
5026 * necessary, down to the highest value supported by this
5027 * client. The server should support all minor versions lower
5028 * than the value it sent.
5029 */
5030 if (ver_msg->ver_major == vdc_version[i].major) {
5031 if (ver_msg->ver_minor > vdc_version[i].minor) {
5032 DMSGX(0,
5033 "Adjusting minor version from %u to %u",
5034 ver_msg->ver_minor, vdc_version[i].minor);
5035 ver_msg->ver_minor = vdc_version[i].minor;
5036 }
5037 return (B_TRUE);
5038 }
5039
5040 /*
5041 * If the message contains a higher major version number, set
5042 * the message's major/minor versions to the current values
5043 * and return false, so this message will get resent with
5044 * these values, and the server will potentially try again
5045 * with the same or a lower version.
5046 */
5047 if (ver_msg->ver_major > vdc_version[i].major) {
5048 ver_msg->ver_major = vdc_version[i].major;
5049 ver_msg->ver_minor = vdc_version[i].minor;
5050 DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n",
5051 ver_msg->ver_major, ver_msg->ver_minor);
5052
5053 return (B_FALSE);
5054 }
5055
5056 /*
5057 * Otherwise, the message's major version is less than the
5058 * current major version, so continue the loop to the next
5059 * (lower) supported version.
5060 */
5061 }
5062
5063 /*
5064 * No common version was found; "ground" the version pair in the
5065 * message to terminate negotiation.
5066 */
5067 ver_msg->ver_major = 0;
5068 ver_msg->ver_minor = 0;
5069
5070 return (B_FALSE);
5071 }
5072 /* -------------------------------------------------------------------------- */
5073
5074 /*
5075 * DKIO(7I) support
5076 */
5077
5078 typedef struct vdc_dk_arg {
5079 struct dk_callback dkc;
5080 int mode;
5081 dev_t dev;
5082 vdc_t *vdc;
5083 } vdc_dk_arg_t;
5084
5085 /*
5086 * Function:
5087 * vdc_dkio_flush_cb()
5088 *
5089 * Description:
5090 * This routine is a callback for DKIOCFLUSHWRITECACHE which can be called
5091 * by kernel code.
5092 *
5093 * Arguments:
5094 * arg - a pointer to a vdc_dk_arg_t structure.
5095 */
5096 void
5097 vdc_dkio_flush_cb(void *arg)
5098 {
5099 struct vdc_dk_arg *dk_arg = (struct vdc_dk_arg *)arg;
5100 struct dk_callback *dkc = NULL;
5101 vdc_t *vdc = NULL;
5102 int rv;
5103
5104 if (dk_arg == NULL) {
5105 cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n");
5106 return;
5107 }
5108 dkc = &dk_arg->dkc;
5109 vdc = dk_arg->vdc;
5110 ASSERT(vdc != NULL);
5111
5112 rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0,
5113 VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);
5114 if (rv != 0) {
5115 DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n",
5116 vdc->instance, rv,
5117 ddi_model_convert_from(dk_arg->mode & FMODELS));
5118 }
5119
5120 /*
5121 * Trigger the callback to notify the caller that the ioctl call has
5122 * completed.
5123 */
5124 if ((dk_arg->mode & FKIOCTL) &&
5125 (dkc != NULL) &&
5126 (dkc->dkc_callback != NULL)) {
5127 ASSERT(dkc->dkc_cookie != NULL);
5128 (*dkc->dkc_callback)(dkc->dkc_cookie, rv);
5129 }
5130
5131 /* Indicate that one less DKIO write flush is outstanding */
5132 mutex_enter(&vdc->lock);
5133 vdc->dkio_flush_pending--;
5134 ASSERT(vdc->dkio_flush_pending >= 0);
5135 mutex_exit(&vdc->lock);
5136
5137 /* free the mem that was allocated when the callback was dispatched */
5138 kmem_free(arg, sizeof (vdc_dk_arg_t));
5139 }
5140
5141 /*
5142 * Function:
5143 * vdc_dkio_gapart()
5144 *
5145 * Description:
5146 * This function implements the DKIOCGAPART ioctl.
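 * The partition map is computed from the cached VTOC and geometry:
 * each dkl_cylno is derived as p_start / (dkg_nhead * dkg_nsect) and
 * dkl_nblk is the partition size in blocks.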
5147 * 5148 * Arguments: 5149 * vdc - soft state pointer 5150 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure 5151 * flag - ioctl flags 5152 */ 5153 static int 5154 vdc_dkio_gapart(vdc_t *vdc, caddr_t arg, int flag) 5155 { 5156 struct dk_geom *geom; 5157 struct extvtoc *vtoc; 5158 union { 5159 struct dk_map map[NDKMAP]; 5160 struct dk_map32 map32[NDKMAP]; 5161 } data; 5162 int i, rv, size; 5163 5164 mutex_enter(&vdc->lock); 5165 5166 if ((rv = vdc_validate_geometry(vdc)) != 0) { 5167 mutex_exit(&vdc->lock); 5168 return (rv); 5169 } 5170 5171 if (vdc->vdisk_size > VD_OLDVTOC_LIMIT) { 5172 mutex_exit(&vdc->lock); 5173 return (EOVERFLOW); 5174 } 5175 5176 vtoc = vdc->vtoc; 5177 geom = vdc->geom; 5178 5179 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5180 5181 for (i = 0; i < vtoc->v_nparts; i++) { 5182 data.map32[i].dkl_cylno = vtoc->v_part[i].p_start / 5183 (geom->dkg_nhead * geom->dkg_nsect); 5184 data.map32[i].dkl_nblk = vtoc->v_part[i].p_size; 5185 } 5186 size = NDKMAP * sizeof (struct dk_map32); 5187 5188 } else { 5189 5190 for (i = 0; i < vtoc->v_nparts; i++) { 5191 data.map[i].dkl_cylno = vtoc->v_part[i].p_start / 5192 (geom->dkg_nhead * geom->dkg_nsect); 5193 data.map[i].dkl_nblk = vtoc->v_part[i].p_size; 5194 } 5195 size = NDKMAP * sizeof (struct dk_map); 5196 5197 } 5198 5199 mutex_exit(&vdc->lock); 5200 5201 if (ddi_copyout(&data, arg, size, flag) != 0) 5202 return (EFAULT); 5203 5204 return (0); 5205 } 5206 5207 /* 5208 * Function: 5209 * vdc_dkio_partition() 5210 * 5211 * Description: 5212 * This function implements the DKIOCPARTITION ioctl. 5213 * 5214 * Arguments: 5215 * vdc - soft state pointer 5216 * arg - a pointer to a struct partition64 structure 5217 * flag - ioctl flags 5218 */ 5219 static int 5220 vdc_dkio_partition(vdc_t *vdc, caddr_t arg, int flag) 5221 { 5222 struct partition64 p64; 5223 efi_gpt_t *gpt; 5224 efi_gpe_t *gpe; 5225 vd_efi_dev_t edev; 5226 uint_t partno; 5227 int rv; 5228 5229 if (ddi_copyin(arg, &p64, sizeof (struct partition64), flag)) { 5230 return (EFAULT); 5231 } 5232 5233 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 5234 5235 if ((rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe)) != 0) { 5236 return (rv); 5237 } 5238 5239 partno = p64.p_partno; 5240 5241 if (partno >= gpt->efi_gpt_NumberOfPartitionEntries) { 5242 vd_efi_free(&edev, gpt, gpe); 5243 return (ESRCH); 5244 } 5245 5246 bcopy(&gpe[partno].efi_gpe_PartitionTypeGUID, &p64.p_type, 5247 sizeof (struct uuid)); 5248 p64.p_start = gpe[partno].efi_gpe_StartingLBA; 5249 p64.p_size = gpe[partno].efi_gpe_EndingLBA - p64.p_start + 1; 5250 5251 if (ddi_copyout(&p64, arg, sizeof (struct partition64), flag)) { 5252 vd_efi_free(&edev, gpt, gpe); 5253 return (EFAULT); 5254 } 5255 5256 vd_efi_free(&edev, gpt, gpe); 5257 return (0); 5258 } 5259 5260 /* 5261 * Function: 5262 * vdc_dioctl_rwcmd() 5263 * 5264 * Description: 5265 * This function implements the DIOCTL_RWCMD ioctl. This ioctl is used 5266 * for DKC_DIRECT disks to read or write at an absolute disk offset. 
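 * The transfer is performed via physio(9F) on a temporary buf whose
 * b_private field is set to VD_SLICE_NONE, so that vdc_strategy()
 * treats the block address as absolute rather than slice-relative.
 * For illustration only, a userland caller might issue a read roughly
 * as follows (hypothetical sketch):
 *
 *	struct dadkio_rwcmd rw;
 *	bzero(&rw, sizeof (rw));
 *	rw.cmd = DADKIO_RWCMD_READ;
 *	rw.blkaddr = 0;
 *	rw.buflen = DEV_BSIZE;
 *	rw.bufaddr = buf;
 *	ioctl(fd, DIOCTL_RWCMD, &rw);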
5267 * 5268 * Arguments: 5269 * dev - device 5270 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure 5271 * flag - ioctl flags 5272 */ 5273 static int 5274 vdc_dioctl_rwcmd(dev_t dev, caddr_t arg, int flag) 5275 { 5276 struct dadkio_rwcmd32 rwcmd32; 5277 struct dadkio_rwcmd rwcmd; 5278 struct iovec aiov; 5279 struct uio auio; 5280 int rw, status; 5281 struct buf *buf; 5282 5283 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5284 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32, 5285 sizeof (struct dadkio_rwcmd32), flag)) { 5286 return (EFAULT); 5287 } 5288 rwcmd.cmd = rwcmd32.cmd; 5289 rwcmd.flags = rwcmd32.flags; 5290 rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr; 5291 rwcmd.buflen = rwcmd32.buflen; 5292 rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr; 5293 } else { 5294 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd, 5295 sizeof (struct dadkio_rwcmd), flag)) { 5296 return (EFAULT); 5297 } 5298 } 5299 5300 switch (rwcmd.cmd) { 5301 case DADKIO_RWCMD_READ: 5302 rw = B_READ; 5303 break; 5304 case DADKIO_RWCMD_WRITE: 5305 rw = B_WRITE; 5306 break; 5307 default: 5308 return (EINVAL); 5309 } 5310 5311 bzero((caddr_t)&aiov, sizeof (struct iovec)); 5312 aiov.iov_base = rwcmd.bufaddr; 5313 aiov.iov_len = rwcmd.buflen; 5314 5315 bzero((caddr_t)&auio, sizeof (struct uio)); 5316 auio.uio_iov = &aiov; 5317 auio.uio_iovcnt = 1; 5318 auio.uio_loffset = rwcmd.blkaddr * DEV_BSIZE; 5319 auio.uio_resid = rwcmd.buflen; 5320 auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE; 5321 5322 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 5323 bioinit(buf); 5324 /* 5325 * We use the private field of buf to specify that this is an 5326 * I/O using an absolute offset. 5327 */ 5328 buf->b_private = (void *)VD_SLICE_NONE; 5329 5330 status = physio(vdc_strategy, buf, dev, rw, vdc_min, &auio); 5331 5332 biofini(buf); 5333 kmem_free(buf, sizeof (buf_t)); 5334 5335 return (status); 5336 } 5337 5338 /* 5339 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated 5340 * buffer is returned in alloc_len. 5341 */ 5342 static vd_scsi_t * 5343 vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len, 5344 int *alloc_len) 5345 { 5346 vd_scsi_t *vd_scsi; 5347 int vd_scsi_len = VD_SCSI_SIZE; 5348 5349 vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t)); 5350 vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t)); 5351 vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t)); 5352 vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t)); 5353 5354 ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); 5355 5356 vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP); 5357 5358 vd_scsi->cdb_len = cdb_len; 5359 vd_scsi->sense_len = sense_len; 5360 vd_scsi->datain_len = datain_len; 5361 vd_scsi->dataout_len = dataout_len; 5362 5363 *alloc_len = vd_scsi_len; 5364 5365 return (vd_scsi); 5366 } 5367 5368 /* 5369 * Convert the status of a SCSI command to a Solaris return code. 5370 * 5371 * Arguments: 5372 * vd_scsi - The SCSI operation buffer. 5373 * log_error - indicate if an error message should be logged. 5374 * 5375 * Note that our SCSI error messages are rather primitive for the moment 5376 * and could be improved by decoding some data like the SCSI command and 5377 * the sense key. 5378 * 5379 * Return value: 5380 * 0 - Status is good. 5381 * EACCES - Status reports a reservation conflict. 5382 * ENOTSUP - Status reports a check condition and sense key 5383 * reports an illegal request. 5384 * EIO - Any other status. 
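 *
 * Note that if failfast is enabled, a reservation conflict status on
 * any command other than PERSISTENT RESERVE IN/OUT will panic the
 * system (see the STATUS_RESERVATION_CONFLICT case below).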
5385 */
5386 static int
5387 vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error)
5388 {
5389 int rv;
5390 char path_str[MAXPATHLEN];
5391 char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN];
5392 union scsi_cdb *cdb;
5393 struct scsi_extended_sense *sense;
5394
5395 if (vd_scsi->cmd_status == STATUS_GOOD)
5396 /* no error */
5397 return (0);
5398
5399 /* when the tunable vdc_scsi_log_error is true we log all errors */
5400 if (vdc_scsi_log_error)
5401 log_error = B_TRUE;
5402
5403 if (log_error) {
5404 cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x\n",
5405 ddi_pathname(vdc->dip, path_str), vdc->instance,
5406 GETCMD(VD_SCSI_DATA_CDB(vd_scsi)));
5407 }
5408
5409 /* default returned value */
5410 rv = EIO;
5411
5412 switch (vd_scsi->cmd_status) {
5413
5414 case STATUS_CHECK:
5415 case STATUS_TERMINATED:
5416 if (log_error)
5417 cmn_err(CE_CONT, "\tCheck Condition Error\n");
5418
5419 /* check sense buffer */
5420 if (vd_scsi->sense_len == 0 ||
5421 vd_scsi->sense_status != STATUS_GOOD) {
5422 if (log_error)
5423 cmn_err(CE_CONT, "\tNo Sense Data Available\n");
5424 break;
5425 }
5426
5427 sense = VD_SCSI_DATA_SENSE(vd_scsi);
5428
5429 if (log_error) {
5430 cmn_err(CE_CONT, "\tSense Key: 0x%x\n"
5431 "\tASC: 0x%x, ASCQ: 0x%x\n",
5432 scsi_sense_key((uint8_t *)sense),
5433 scsi_sense_asc((uint8_t *)sense),
5434 scsi_sense_ascq((uint8_t *)sense));
5435 }
5436
5437 if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST)
5438 rv = ENOTSUP;
5439 break;
5440
5441 case STATUS_BUSY:
5442 if (log_error)
5443 cmn_err(CE_NOTE, "\tDevice Busy\n");
5444 break;
5445
5446 case STATUS_RESERVATION_CONFLICT:
5447 /*
5448 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then the
5449 * reservation conflict could have various causes, such as
5450 * incorrect keys, or being not registered or not reserved,
5451 * so we should not panic in that case.
5452 */
5453 cdb = VD_SCSI_DATA_CDB(vd_scsi);
5454 if (vdc->failfast_interval != 0 &&
5455 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN &&
5456 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) {
5457 /* failfast is enabled so we have to panic */
5458 (void) snprintf(panic_str, sizeof (panic_str),
5459 VDC_RESV_CONFLICT_FMT_STR "%s",
5460 ddi_pathname(vdc->dip, path_str));
5461 panic(panic_str);
5462 }
5463 if (log_error)
5464 cmn_err(CE_NOTE, "\tReservation Conflict\n");
5465 rv = EACCES;
5466 break;
5467
5468 case STATUS_QFULL:
5469 if (log_error)
5470 cmn_err(CE_NOTE, "\tQueue Full\n");
5471 break;
5472
5473 case STATUS_MET:
5474 case STATUS_INTERMEDIATE:
5475 case STATUS_SCSI2:
5476 case STATUS_INTERMEDIATE_MET:
5477 case STATUS_ACA_ACTIVE:
5478 if (log_error)
5479 cmn_err(CE_CONT,
5480 "\tUnexpected SCSI status received: 0x%x\n",
5481 vd_scsi->cmd_status);
5482 break;
5483
5484 default:
5485 if (log_error)
5486 cmn_err(CE_CONT,
5487 "\tInvalid SCSI status received: 0x%x\n",
5488 vd_scsi->cmd_status);
5489 break;
5490 }
5491
5492 return (rv);
5493 }
5494
5495 /*
5496 * Implement the USCSICMD uscsi(7I) ioctl. This ioctl is converted to
5497 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI
5498 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is
5499 * converted to a VD_OP_RESET operation.
5500 */ 5501 static int 5502 vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode) 5503 { 5504 struct uscsi_cmd uscsi; 5505 struct uscsi_cmd32 uscsi32; 5506 vd_scsi_t *vd_scsi; 5507 int vd_scsi_len; 5508 union scsi_cdb *cdb; 5509 struct scsi_extended_sense *sense; 5510 char *datain, *dataout; 5511 size_t cdb_len, datain_len, dataout_len, sense_len; 5512 int rv; 5513 5514 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5515 if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32), 5516 mode) != 0) 5517 return (EFAULT); 5518 uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi)); 5519 } else { 5520 if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd), 5521 mode) != 0) 5522 return (EFAULT); 5523 } 5524 5525 /* a uscsi reset is converted to a VD_OP_RESET operation */ 5526 if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN | 5527 USCSI_RESET_ALL)) { 5528 rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC, 5529 (void *)(uint64_t)mode, VIO_both_dir, B_TRUE); 5530 return (rv); 5531 } 5532 5533 /* cdb buffer length */ 5534 cdb_len = uscsi.uscsi_cdblen; 5535 5536 /* data in and out buffers length */ 5537 if (uscsi.uscsi_flags & USCSI_READ) { 5538 datain_len = uscsi.uscsi_buflen; 5539 dataout_len = 0; 5540 } else { 5541 datain_len = 0; 5542 dataout_len = uscsi.uscsi_buflen; 5543 } 5544 5545 /* sense buffer length */ 5546 if (uscsi.uscsi_flags & USCSI_RQENABLE) 5547 sense_len = uscsi.uscsi_rqlen; 5548 else 5549 sense_len = 0; 5550 5551 /* allocate buffer for the VD_SCSICMD_OP operation */ 5552 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5553 &vd_scsi_len); 5554 5555 /* 5556 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague, 5557 * but basically they prevent a SCSI command from being retried in case 5558 * of an error. 
5559 */ 5560 if ((uscsi.uscsi_flags & USCSI_ISOLATE) || 5561 (uscsi.uscsi_flags & USCSI_DIAGNOSE)) 5562 vd_scsi->options |= VD_SCSI_OPT_NORETRY; 5563 5564 /* set task attribute */ 5565 if (uscsi.uscsi_flags & USCSI_NOTAG) { 5566 vd_scsi->task_attribute = 0; 5567 } else { 5568 if (uscsi.uscsi_flags & USCSI_HEAD) 5569 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5570 else if (uscsi.uscsi_flags & USCSI_HTAG) 5571 vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE; 5572 else if (uscsi.uscsi_flags & USCSI_OTAG) 5573 vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED; 5574 else 5575 vd_scsi->task_attribute = 0; 5576 } 5577 5578 /* set timeout */ 5579 vd_scsi->timeout = uscsi.uscsi_timeout; 5580 5581 /* copy-in cdb data */ 5582 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5583 if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) { 5584 rv = EFAULT; 5585 goto done; 5586 } 5587 5588 /* keep a pointer to the sense buffer */ 5589 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5590 5591 /* keep a pointer to the data-in buffer */ 5592 datain = (char *)VD_SCSI_DATA_IN(vd_scsi); 5593 5594 /* copy-in request data to the data-out buffer */ 5595 dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi); 5596 if (!(uscsi.uscsi_flags & USCSI_READ)) { 5597 if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len, 5598 mode)) { 5599 rv = EFAULT; 5600 goto done; 5601 } 5602 } 5603 5604 /* submit the request */ 5605 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5606 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5607 5608 if (rv != 0) 5609 goto done; 5610 5611 /* update scsi status */ 5612 uscsi.uscsi_status = vd_scsi->cmd_status; 5613 5614 /* update sense data */ 5615 if ((uscsi.uscsi_flags & USCSI_RQENABLE) && 5616 (uscsi.uscsi_status == STATUS_CHECK || 5617 uscsi.uscsi_status == STATUS_TERMINATED)) { 5618 5619 uscsi.uscsi_rqstatus = vd_scsi->sense_status; 5620 5621 if (uscsi.uscsi_rqstatus == STATUS_GOOD) { 5622 uscsi.uscsi_rqresid = uscsi.uscsi_rqlen - 5623 vd_scsi->sense_len; 5624 if (ddi_copyout(sense, uscsi.uscsi_rqbuf, 5625 vd_scsi->sense_len, mode) != 0) { 5626 rv = EFAULT; 5627 goto done; 5628 } 5629 } 5630 } 5631 5632 /* update request data */ 5633 if (uscsi.uscsi_status == STATUS_GOOD) { 5634 if (uscsi.uscsi_flags & USCSI_READ) { 5635 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5636 vd_scsi->datain_len; 5637 if (ddi_copyout(datain, uscsi.uscsi_bufaddr, 5638 vd_scsi->datain_len, mode) != 0) { 5639 rv = EFAULT; 5640 goto done; 5641 } 5642 } else { 5643 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5644 vd_scsi->dataout_len; 5645 } 5646 } 5647 5648 /* copy-out result */ 5649 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5650 uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32)); 5651 if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32), 5652 mode) != 0) { 5653 rv = EFAULT; 5654 goto done; 5655 } 5656 } else { 5657 if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd), 5658 mode) != 0) { 5659 rv = EFAULT; 5660 goto done; 5661 } 5662 } 5663 5664 /* get the return code from the SCSI command status */ 5665 rv = vdc_scsi_status(vdc, vd_scsi, 5666 !(uscsi.uscsi_flags & USCSI_SILENT)); 5667 5668 done: 5669 kmem_free(vd_scsi, vd_scsi_len); 5670 return (rv); 5671 } 5672 5673 /* 5674 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command. 5675 * 5676 * Arguments: 5677 * cmd - SCSI PERSISTENT IN command 5678 * len - length of the SCSI input buffer 5679 * vd_scsi_len - return the length of the allocated buffer 5680 * 5681 * Returned Value: 5682 * a pointer to the allocated VD_OP_SCSICMD buffer. 
5683 */ 5684 static vd_scsi_t * 5685 vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len) 5686 { 5687 int cdb_len, sense_len, datain_len, dataout_len; 5688 vd_scsi_t *vd_scsi; 5689 union scsi_cdb *cdb; 5690 5691 cdb_len = CDB_GROUP1; 5692 sense_len = sizeof (struct scsi_extended_sense); 5693 datain_len = len; 5694 dataout_len = 0; 5695 5696 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5697 vd_scsi_len); 5698 5699 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5700 5701 /* set cdb */ 5702 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN; 5703 cdb->cdb_opaque[1] = cmd; 5704 FORMG1COUNT(cdb, datain_len); 5705 5706 vd_scsi->timeout = vdc_scsi_timeout; 5707 5708 return (vd_scsi); 5709 } 5710 5711 /* 5712 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command. 5713 * 5714 * Arguments: 5715 * cmd - SCSI PERSISTENT OUT command 5716 * len - length of the SCSI output buffer 5717 * vd_scsi_len - return the length of the allocated buffer 5718 * 5719 * Returned Code: 5720 * a pointer to the allocated VD_OP_SCSICMD buffer. 5721 */ 5722 static vd_scsi_t * 5723 vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len) 5724 { 5725 int cdb_len, sense_len, datain_len, dataout_len; 5726 vd_scsi_t *vd_scsi; 5727 union scsi_cdb *cdb; 5728 5729 cdb_len = CDB_GROUP1; 5730 sense_len = sizeof (struct scsi_extended_sense); 5731 datain_len = 0; 5732 dataout_len = len; 5733 5734 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5735 vd_scsi_len); 5736 5737 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5738 5739 /* set cdb */ 5740 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT; 5741 cdb->cdb_opaque[1] = cmd; 5742 FORMG1COUNT(cdb, dataout_len); 5743 5744 vd_scsi->timeout = vdc_scsi_timeout; 5745 5746 return (vd_scsi); 5747 } 5748 5749 /* 5750 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted 5751 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk 5752 * server with a VD_OP_SCSICMD operation. 
5753 */ 5754 static int 5755 vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) 5756 { 5757 vd_scsi_t *vd_scsi; 5758 mhioc_inkeys_t inkeys; 5759 mhioc_key_list_t klist; 5760 struct mhioc_inkeys32 inkeys32; 5761 struct mhioc_key_list32 klist32; 5762 sd_prin_readkeys_t *scsi_keys; 5763 void *user_keys; 5764 int vd_scsi_len; 5765 int listsize, listlen, rv; 5766 5767 /* copyin arguments */ 5768 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5769 rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); 5770 if (rv != 0) 5771 return (EFAULT); 5772 5773 rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, 5774 sizeof (klist32), mode); 5775 if (rv != 0) 5776 return (EFAULT); 5777 5778 listsize = klist32.listsize; 5779 } else { 5780 rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); 5781 if (rv != 0) 5782 return (EFAULT); 5783 5784 rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); 5785 if (rv != 0) 5786 return (EFAULT); 5787 5788 listsize = klist.listsize; 5789 } 5790 5791 /* build SCSI VD_OP request */ 5792 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, 5793 sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + 5794 (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); 5795 5796 scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); 5797 5798 /* submit the request */ 5799 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5800 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5801 5802 if (rv != 0) 5803 goto done; 5804 5805 listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; 5806 5807 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5808 inkeys32.generation = scsi_keys->generation; 5809 rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); 5810 if (rv != 0) { 5811 rv = EFAULT; 5812 goto done; 5813 } 5814 5815 klist32.listlen = listlen; 5816 rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, 5817 sizeof (klist32), mode); 5818 if (rv != 0) { 5819 rv = EFAULT; 5820 goto done; 5821 } 5822 5823 user_keys = (caddr_t)(uintptr_t)klist32.list; 5824 } else { 5825 inkeys.generation = scsi_keys->generation; 5826 rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); 5827 if (rv != 0) { 5828 rv = EFAULT; 5829 goto done; 5830 } 5831 5832 klist.listlen = listlen; 5833 rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); 5834 if (rv != 0) { 5835 rv = EFAULT; 5836 goto done; 5837 } 5838 5839 user_keys = klist.list; 5840 } 5841 5842 /* copy out keys */ 5843 if (listlen > 0 && listsize > 0) { 5844 if (listsize < listlen) 5845 listlen = listsize; 5846 rv = ddi_copyout(&scsi_keys->keylist, user_keys, 5847 listlen * MHIOC_RESV_KEY_SIZE, mode); 5848 if (rv != 0) 5849 rv = EFAULT; 5850 } 5851 5852 if (rv == 0) 5853 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5854 5855 done: 5856 kmem_free(vd_scsi, vd_scsi_len); 5857 5858 return (rv); 5859 } 5860 5861 /* 5862 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted 5863 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to 5864 * the vdisk server with a VD_OP_SCSICMD operation. 
5865 */ 5866 static int 5867 vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) 5868 { 5869 vd_scsi_t *vd_scsi; 5870 mhioc_inresvs_t inresv; 5871 mhioc_resv_desc_list_t rlist; 5872 struct mhioc_inresvs32 inresv32; 5873 struct mhioc_resv_desc_list32 rlist32; 5874 mhioc_resv_desc_t mhd_resv; 5875 sd_prin_readresv_t *scsi_resv; 5876 sd_readresv_desc_t *resv; 5877 mhioc_resv_desc_t *user_resv; 5878 int vd_scsi_len; 5879 int listsize, listlen, i, rv; 5880 5881 /* copyin arguments */ 5882 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5883 rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); 5884 if (rv != 0) 5885 return (EFAULT); 5886 5887 rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, 5888 sizeof (rlist32), mode); 5889 if (rv != 0) 5890 return (EFAULT); 5891 5892 listsize = rlist32.listsize; 5893 } else { 5894 rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); 5895 if (rv != 0) 5896 return (EFAULT); 5897 5898 rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); 5899 if (rv != 0) 5900 return (EFAULT); 5901 5902 listsize = rlist.listsize; 5903 } 5904 5905 /* build SCSI VD_OP request */ 5906 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, 5907 sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + 5908 (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); 5909 5910 scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); 5911 5912 /* submit the request */ 5913 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5914 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5915 5916 if (rv != 0) 5917 goto done; 5918 5919 listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; 5920 5921 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5922 inresv32.generation = scsi_resv->generation; 5923 rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); 5924 if (rv != 0) { 5925 rv = EFAULT; 5926 goto done; 5927 } 5928 5929 rlist32.listlen = listlen; 5930 rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li, 5931 sizeof (rlist32), mode); 5932 if (rv != 0) { 5933 rv = EFAULT; 5934 goto done; 5935 } 5936 5937 user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; 5938 } else { 5939 inresv.generation = scsi_resv->generation; 5940 rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); 5941 if (rv != 0) { 5942 rv = EFAULT; 5943 goto done; 5944 } 5945 5946 rlist.listlen = listlen; 5947 rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); 5948 if (rv != 0) { 5949 rv = EFAULT; 5950 goto done; 5951 } 5952 5953 user_resv = rlist.list; 5954 } 5955 5956 /* copy out reservations */ 5957 if (listsize > 0 && listlen > 0) { 5958 if (listsize < listlen) 5959 listlen = listsize; 5960 resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; 5961 5962 for (i = 0; i < listlen; i++) { 5963 mhd_resv.type = resv->type; 5964 mhd_resv.scope = resv->scope; 5965 mhd_resv.scope_specific_addr = 5966 BE_32(resv->scope_specific_addr); 5967 bcopy(&resv->resvkey, &mhd_resv.key, 5968 MHIOC_RESV_KEY_SIZE); 5969 5970 rv = ddi_copyout(&mhd_resv, user_resv, 5971 sizeof (mhd_resv), mode); 5972 if (rv != 0) { 5973 rv = EFAULT; 5974 goto done; 5975 } 5976 resv++; 5977 user_resv++; 5978 } 5979 } 5980 5981 if (rv == 0) 5982 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5983 5984 done: 5985 kmem_free(vd_scsi, vd_scsi_len); 5986 return (rv); 5987 } 5988 5989 /* 5990 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted 5991 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk 5992 * server with a VD_OP_SCSICMD operation. 
5993 */ 5994 static int 5995 vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) 5996 { 5997 vd_scsi_t *vd_scsi; 5998 sd_prout_t *scsi_prout; 5999 mhioc_register_t mhd_reg; 6000 int vd_scsi_len, rv; 6001 6002 /* copyin arguments */ 6003 rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); 6004 if (rv != 0) 6005 return (EFAULT); 6006 6007 /* build SCSI VD_OP request */ 6008 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, 6009 sizeof (sd_prout_t), &vd_scsi_len); 6010 6011 /* set parameters */ 6012 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6013 bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6014 bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); 6015 scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; 6016 6017 /* submit the request */ 6018 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6019 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6020 6021 if (rv == 0) 6022 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6023 6024 kmem_free(vd_scsi, vd_scsi_len); 6025 return (rv); 6026 } 6027 6028 /* 6029 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted 6030 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk 6031 * server with a VD_OP_SCSICMD operation. 6032 */ 6033 static int 6034 vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) 6035 { 6036 union scsi_cdb *cdb; 6037 vd_scsi_t *vd_scsi; 6038 sd_prout_t *scsi_prout; 6039 mhioc_resv_desc_t mhd_resv; 6040 int vd_scsi_len, rv; 6041 6042 /* copyin arguments */ 6043 rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); 6044 if (rv != 0) 6045 return (EFAULT); 6046 6047 /* build SCSI VD_OP request */ 6048 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, 6049 sizeof (sd_prout_t), &vd_scsi_len); 6050 6051 /* set parameters */ 6052 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6053 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6054 bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6055 scsi_prout->scope_address = mhd_resv.scope_specific_addr; 6056 cdb->cdb_opaque[2] = mhd_resv.type; 6057 6058 /* submit the request */ 6059 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6060 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6061 6062 if (rv == 0) 6063 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6064 6065 kmem_free(vd_scsi, vd_scsi_len); 6066 return (rv); 6067 } 6068 6069 /* 6070 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is 6071 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which 6072 * is sent to the vdisk server with a VD_OP_SCSICMD operation. 
6073 */ 6074 static int 6075 vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode) 6076 { 6077 union scsi_cdb *cdb; 6078 vd_scsi_t *vd_scsi; 6079 sd_prout_t *scsi_prout; 6080 mhioc_preemptandabort_t mhd_preempt; 6081 int vd_scsi_len, rv; 6082 6083 /* copyin arguments */ 6084 rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode); 6085 if (rv != 0) 6086 return (EFAULT); 6087 6088 /* build SCSI VD_OP request */ 6089 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT, 6090 sizeof (sd_prout_t), &vd_scsi_len); 6091 6092 /* set parameters */ 6093 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 6094 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6095 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6096 bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key, 6097 MHIOC_RESV_KEY_SIZE); 6098 bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key, 6099 MHIOC_RESV_KEY_SIZE); 6100 scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr; 6101 cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type; 6102 6103 /* submit the request */ 6104 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6105 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6106 6107 if (rv == 0) 6108 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6109 6110 kmem_free(vd_scsi, vd_scsi_len); 6111 return (rv); 6112 } 6113 6114 /* 6115 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl 6116 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY 6117 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation. 6118 */ 6119 static int 6120 vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode) 6121 { 6122 vd_scsi_t *vd_scsi; 6123 sd_prout_t *scsi_prout; 6124 mhioc_registerandignorekey_t mhd_regi; 6125 int vd_scsi_len, rv; 6126 6127 /* copyin arguments */ 6128 rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode); 6129 if (rv != 0) 6130 return (EFAULT); 6131 6132 /* build SCSI VD_OP request */ 6133 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY, 6134 sizeof (sd_prout_t), &vd_scsi_len); 6135 6136 /* set parameters */ 6137 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6138 bcopy(mhd_regi.newkey.key, scsi_prout->service_key, 6139 MHIOC_RESV_KEY_SIZE); 6140 scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl; 6141 6142 /* submit the request */ 6143 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6144 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6145 6146 if (rv == 0) 6147 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6148 6149 kmem_free(vd_scsi, vd_scsi_len); 6150 return (rv); 6151 } 6152 6153 /* 6154 * This function is used by the failfast mechanism to send a SCSI command 6155 * to check for reservation conflict. 6156 */ 6157 static int 6158 vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd) 6159 { 6160 int cdb_len, sense_len, vd_scsi_len; 6161 vd_scsi_t *vd_scsi; 6162 union scsi_cdb *cdb; 6163 int rv; 6164 6165 ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1); 6166 6167 if (scmd == SCMD_WRITE_G1) 6168 cdb_len = CDB_GROUP1; 6169 else 6170 cdb_len = CDB_GROUP0; 6171 6172 sense_len = sizeof (struct scsi_extended_sense); 6173 6174 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len); 6175 6176 /* set cdb */ 6177 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6178 cdb->scc_cmd = scmd; 6179 6180 vd_scsi->timeout = vdc_scsi_timeout; 6181 6182 /* 6183 * Submit the request. 
The last argument has to be B_FALSE so that
6184 * vdc_do_sync_op does not loop checking for a reservation conflict if
6185 * the operation returns an error.
6186 */
6187 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
6188 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE);
6189
6190 if (rv == 0)
6191 (void) vdc_scsi_status(vdc, vd_scsi, B_FALSE);
6192
6193 kmem_free(vd_scsi, vd_scsi_len);
6194 return (rv);
6195 }
6196
6197 /*
6198 * This function is used by the failfast mechanism to check for reservation
6199 * conflicts. It sends SCSI commands that will fail with a reservation
6200 * conflict error if the system no longer has access to the disk; such a
6201 * failure will panic the system.
6202 *
6203 * Returned Code:
6204 * 0 - disk is accessible without reservation conflict error
6205 * != 0 - unable to check if disk is accessible
6206 */
6207 int
6208 vdc_failfast_check_resv(vdc_t *vdc)
6209 {
6210 int failure = 0;
6211
6212 /*
6213 * Send a TEST UNIT READY command. The command will panic
6214 * the system if it fails with a reservation conflict.
6215 */
6216 if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0)
6217 failure++;
6218
6219 /*
6220 * With SPC-3 compliant devices TEST UNIT READY will succeed on
6221 * a reserved device, so we also do a zero-length WRITE(10) in
6222 * order to provoke a Reservation Conflict status on those newer
6223 * devices.
6224 */
6225 if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0)
6226 failure++;
6227
6228 return (failure);
6229 }
6230
6231 /*
6232 * Add a pending I/O to the failfast I/O queue. An I/O is added to this
6233 * queue when it has failed and failfast is enabled. The failfast thread
6234 * then checks whether the failure was caused by a reservation conflict,
6235 * in which case it panics the system.
6236 *
6237 * Async I/Os should be queued with their block I/O data transfer structure
6238 * (buf). Sync I/Os should be queued with buf = NULL.
6239 */
6240 static vdc_io_t *
6241 vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf)
6242 {
6243 vdc_io_t *vio;
6244
6245 ASSERT(MUTEX_HELD(&vdc->lock));
6246
6247 vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP);
6248 vio->vio_next = vdc->failfast_io_queue;
6249 vio->vio_buf = buf;
6250 vio->vio_qtime = ddi_get_lbolt();
6251
6252 vdc->failfast_io_queue = vio;
6253
6254 /* notify the failfast thread that a new I/O is queued */
6255 cv_signal(&vdc->failfast_cv);
6256
6257 return (vio);
6258 }
6259
6260 /*
6261 * Remove and complete I/Os in the failfast I/O queue that were added
6262 * after the indicated deadline. A deadline of 0 means that all I/Os
6263 * have to be unqueued and marked as completed.
6264 */
6265 static void
6266 vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline)
6267 {
6268 vdc_io_t *vio, *vio_tmp;
6269
6270 ASSERT(MUTEX_HELD(&vdc->lock));
6271
6272 vio_tmp = NULL;
6273 vio = vdc->failfast_io_queue;
6274
6275 if (deadline != 0) {
6276 /*
6277 * Skip any I/O queued after the deadline. The failfast
6278 * I/O queue is ordered starting with the last I/O added
6279 * to the queue.
6280 */
6281 while (vio != NULL && vio->vio_qtime > deadline) {
6282 vio_tmp = vio;
6283 vio = vio->vio_next;
6284 }
6285 }
6286
6287 if (vio == NULL)
6288 /* nothing to unqueue */
6289 return;
6290
6291 /* update the queue */
6292 if (vio_tmp == NULL)
6293 vdc->failfast_io_queue = NULL;
6294 else
6295 vio_tmp->vio_next = NULL;
6296
6297 /*
6298 * Complete unqueued I/Os. Async I/Os have a block I/O data transfer
Async I/O have a block I/O data transfer 6299 * structure (buf) and they are completed by calling biodone(). Sync 6300 * I/O do not have a buf and they are completed by setting the 6301 * vio_qtime to zero and signaling failfast_io_cv. In that case, the 6302 * thread waiting for the I/O to complete is responsible for freeing 6303 * the vio structure. 6304 */ 6305 while (vio != NULL) { 6306 vio_tmp = vio->vio_next; 6307 if (vio->vio_buf != NULL) { 6308 VD_KSTAT_RUNQ_EXIT(vdc); 6309 DTRACE_IO1(done, buf_t *, vio->vio_buf); 6310 biodone(vio->vio_buf); 6311 kmem_free(vio, sizeof (vdc_io_t)); 6312 } else { 6313 vio->vio_qtime = 0; 6314 } 6315 vio = vio_tmp; 6316 } 6317 6318 cv_broadcast(&vdc->failfast_io_cv); 6319 } 6320 6321 /* 6322 * Failfast Thread. 6323 * 6324 * While failfast is enabled, the failfast thread sends TEST UNIT READY 6325 * and zero-length WRITE(10) SCSI commands on a regular basis to check that 6326 * we still have access to the disk. If a command fails with a RESERVATION 6327 * CONFLICT error then the system will immediately panic. 6328 * 6329 * The failfast thread is also woken up when an I/O has failed. It then checks 6330 * the access to the disk to ensure that the I/O failure was not due to a 6331 * reservation conflict. 6332 * 6333 * There is one failfast thread for each virtual disk for which failfast is 6334 * enabled. We could have only one thread sending requests for all disks but 6335 * this would need vdc to send asynchronous requests and to have callbacks to 6336 * process replies. 6337 */ 6338 static void 6339 vdc_failfast_thread(void *arg) 6340 { 6341 int status; 6342 vdc_t *vdc = (vdc_t *)arg; 6343 clock_t timeout, starttime; 6344 6345 mutex_enter(&vdc->lock); 6346 6347 while (vdc->failfast_interval != 0) { 6348 6349 starttime = ddi_get_lbolt(); 6350 6351 mutex_exit(&vdc->lock); 6352 6353 /* check for reservation conflict */ 6354 status = vdc_failfast_check_resv(vdc); 6355 6356 mutex_enter(&vdc->lock); 6357 /* 6358 * We have dropped the lock to send the SCSI command so we have 6359 * to check that failfast is still enabled. 6360 */ 6361 if (vdc->failfast_interval == 0) 6362 break; 6363 6364 /* 6365 * If we have successfully checked the disk access and there was 6366 * no reservation conflict then we can complete any I/O queued 6367 * before the last check. 6368 */ 6369 if (status == 0) 6370 vdc_failfast_io_unqueue(vdc, starttime); 6371 6372 /* proceed again if any I/O is still in the queue */ 6373 if (vdc->failfast_io_queue != NULL) 6374 continue; 6375 6376 timeout = ddi_get_lbolt() + 6377 drv_usectohz(vdc->failfast_interval); 6378 (void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout); 6379 } 6380 6381 /* 6382 * Failfast is being stopped so we can complete any queued I/O. 6383 */ 6384 vdc_failfast_io_unqueue(vdc, 0); 6385 vdc->failfast_thread = NULL; 6386 mutex_exit(&vdc->lock); 6387 thread_exit(); 6388 } 6389 6390 /* 6391 * Implement the MHIOCENFAILFAST mhd(7i) ioctl.
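 * * As a purely illustrative sketch (hypothetical caller, not part of this * driver), failfast would be enabled from userland with a probing * interval in milliseconds, and disabled with an interval of zero: * *	unsigned int mh_time = 1000; *	if (ioctl(fd, MHIOCENFAILFAST, &mh_time) < 0) *		...handle the error... *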
6392 */ 6393 static int 6394 vdc_failfast(vdc_t *vdc, caddr_t arg, int mode) 6395 { 6396 unsigned int mh_time; 6397 6398 if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode)) 6399 return (EFAULT); 6400 6401 mutex_enter(&vdc->lock); 6402 if (mh_time != 0 && vdc->failfast_thread == NULL) { 6403 vdc->failfast_thread = thread_create(NULL, 0, 6404 vdc_failfast_thread, vdc, 0, &p0, TS_RUN, 6405 v.v_maxsyspri - 2); 6406 } 6407 6408 vdc->failfast_interval = mh_time * 1000; 6409 cv_signal(&vdc->failfast_cv); 6410 mutex_exit(&vdc->lock); 6411 6412 return (0); 6413 } 6414 6415 /* 6416 * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are 6417 * converted to VD_OP_SET_ACCESS operations. 6418 */ 6419 static int 6420 vdc_access_set(vdc_t *vdc, uint64_t flags, int mode) 6421 { 6422 int rv; 6423 6424 /* submit ownership command request */ 6425 rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags, 6426 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6427 VIO_both_dir, B_TRUE); 6428 6429 return (rv); 6430 } 6431 6432 /* 6433 * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a 6434 * VD_OP_GET_ACCESS operation. 6435 */ 6436 static int 6437 vdc_access_get(vdc_t *vdc, uint64_t *status, int mode) 6438 { 6439 int rv; 6440 6441 /* submit ownership command request */ 6442 rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status, 6443 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6444 VIO_both_dir, B_TRUE); 6445 6446 return (rv); 6447 } 6448 6449 /* 6450 * Disk Ownership Thread. 6451 * 6452 * When we have taken the ownership of a disk, this thread waits to be 6453 * notified when the LDC channel is reset so that it can recover the 6454 * ownership. 6455 * 6456 * Note that the thread handling the LDC reset (vdc_process_msg_thread()) 6457 * cannot be used to do the ownership recovery because it has to be 6458 * running to handle the reply message to the ownership operation. 6459 */ 6460 static void 6461 vdc_ownership_thread(void *arg) 6462 { 6463 vdc_t *vdc = (vdc_t *)arg; 6464 clock_t timeout; 6465 uint64_t status; 6466 6467 mutex_enter(&vdc->ownership_lock); 6468 mutex_enter(&vdc->lock); 6469 6470 while (vdc->ownership & VDC_OWNERSHIP_WANTED) { 6471 6472 if ((vdc->ownership & VDC_OWNERSHIP_RESET) || 6473 !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) { 6474 /* 6475 * There was a reset so the ownership has been lost, 6476 * try to recover. We do this without using the preempt 6477 * option so that we don't steal the ownership from 6478 * someone who has preempted us. 6479 */ 6480 DMSG(vdc, 0, "[%d] Ownership lost, recovering", 6481 vdc->instance); 6482 6483 vdc->ownership &= ~(VDC_OWNERSHIP_RESET | 6484 VDC_OWNERSHIP_GRANTED); 6485 6486 mutex_exit(&vdc->lock); 6487 6488 status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6489 VD_ACCESS_SET_PRESERVE, FKIOCTL); 6490 6491 mutex_enter(&vdc->lock); 6492 6493 if (status == 0) { 6494 DMSG(vdc, 0, "[%d] Ownership recovered", 6495 vdc->instance); 6496 vdc->ownership |= VDC_OWNERSHIP_GRANTED; 6497 } else { 6498 DMSG(vdc, 0, "[%d] Failed to recover ownership", 6499 vdc->instance); 6500 } 6501 6502 } 6503 6504 /* 6505 * If we have the ownership then we just wait for an event 6506 * to happen (LDC reset), otherwise we will retry to recover 6507 * after a delay.
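 * A timeout of zero is used below to mean that the wait has no time * bound: cv_wait() is used instead of cv_timedwait() in that case.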
6508 */ 6509 if (vdc->ownership & VDC_OWNERSHIP_GRANTED) 6510 timeout = 0; 6511 else 6512 timeout = ddi_get_lbolt() + 6513 drv_usectohz(vdc_ownership_delay); 6514 6515 /* Release the ownership_lock and wait on the vdc lock */ 6516 mutex_exit(&vdc->ownership_lock); 6517 6518 if (timeout == 0) 6519 (void) cv_wait(&vdc->ownership_cv, &vdc->lock); 6520 else 6521 (void) cv_timedwait(&vdc->ownership_cv, 6522 &vdc->lock, timeout); 6523 6524 mutex_exit(&vdc->lock); 6525 6526 mutex_enter(&vdc->ownership_lock); 6527 mutex_enter(&vdc->lock); 6528 } 6529 6530 vdc->ownership_thread = NULL; 6531 mutex_exit(&vdc->lock); 6532 mutex_exit(&vdc->ownership_lock); 6533 6534 thread_exit(); 6535 } 6536 6537 static void 6538 vdc_ownership_update(vdc_t *vdc, int ownership_flags) 6539 { 6540 ASSERT(MUTEX_HELD(&vdc->ownership_lock)); 6541 6542 mutex_enter(&vdc->lock); 6543 vdc->ownership = ownership_flags; 6544 if ((vdc->ownership & VDC_OWNERSHIP_WANTED) && 6545 vdc->ownership_thread == NULL) { 6546 /* start ownership thread */ 6547 vdc->ownership_thread = thread_create(NULL, 0, 6548 vdc_ownership_thread, vdc, 0, &p0, TS_RUN, 6549 v.v_maxsyspri - 2); 6550 } else { 6551 /* notify the ownership thread */ 6552 cv_signal(&vdc->ownership_cv); 6553 } 6554 mutex_exit(&vdc->lock); 6555 } 6556 6557 /* 6558 * Get the size and the block size of a virtual disk from the vdisk server. 6559 */ 6560 static int 6561 vdc_get_capacity(vdc_t *vdc, size_t *dsk_size, size_t *blk_size) 6562 { 6563 int rv = 0; 6564 size_t alloc_len; 6565 vd_capacity_t *vd_cap; 6566 6567 ASSERT(MUTEX_NOT_HELD(&vdc->lock)); 6568 6569 alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t)); 6570 6571 vd_cap = kmem_zalloc(alloc_len, KM_SLEEP); 6572 6573 rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len, 6574 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE); 6575 6576 *dsk_size = vd_cap->vdisk_size; 6577 *blk_size = vd_cap->vdisk_block_size; 6578 6579 kmem_free(vd_cap, alloc_len); 6580 return (rv); 6581 } 6582 6583 /* 6584 * Check the disk capacity. Disk size information is updated if size has 6585 * changed. 6586 * 6587 * Return 0 if the disk capacity is available, or non-zero if it is not. 6588 */ 6589 static int 6590 vdc_check_capacity(vdc_t *vdc) 6591 { 6592 size_t dsk_size, blk_size; 6593 int rv; 6594 6595 if ((rv = vdc_get_capacity(vdc, &dsk_size, &blk_size)) != 0) 6596 return (rv); 6597 6598 if (dsk_size == VD_SIZE_UNKNOWN || dsk_size == 0) 6599 return (EINVAL); 6600 6601 mutex_enter(&vdc->lock); 6602 vdc_update_size(vdc, dsk_size, blk_size, vdc->max_xfer_sz); 6603 mutex_exit(&vdc->lock); 6604 6605 return (0); 6606 } 6607 6608 /* 6609 * This structure is used in the DKIO(7I) array below. 
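 * * Each entry of that array pairs a Solaris ioctl command with the VD_OP * it maps to. For entries handled by the generic path, vd_process_ioctl() * copies in nbytes bytes, calls convert() with VD_COPYIN, sends the VD_OP * request to the vDisk server, and finally calls convert() again with * VD_COPYOUT to copy the result back out.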
6610 */ 6611 typedef struct vdc_dk_ioctl { 6612 uint8_t op; /* VD_OP_XXX value */ 6613 int cmd; /* Solaris ioctl operation number */ 6614 size_t nbytes; /* size of structure to be copied */ 6615 6616 /* function to convert between vDisk and Solaris structure formats */ 6617 int (*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg, 6618 int mode, int dir); 6619 } vdc_dk_ioctl_t; 6620 6621 /* 6622 * Subset of DKIO(7I) operations currently supported 6623 */ 6624 static vdc_dk_ioctl_t dk_ioctl[] = { 6625 {VD_OP_FLUSH, DKIOCFLUSHWRITECACHE, 0, 6626 vdc_null_copy_func}, 6627 {VD_OP_GET_WCE, DKIOCGETWCE, sizeof (int), 6628 vdc_get_wce_convert}, 6629 {VD_OP_SET_WCE, DKIOCSETWCE, sizeof (int), 6630 vdc_set_wce_convert}, 6631 {VD_OP_GET_VTOC, DKIOCGVTOC, sizeof (vd_vtoc_t), 6632 vdc_get_vtoc_convert}, 6633 {VD_OP_SET_VTOC, DKIOCSVTOC, sizeof (vd_vtoc_t), 6634 vdc_set_vtoc_convert}, 6635 {VD_OP_GET_VTOC, DKIOCGEXTVTOC, sizeof (vd_vtoc_t), 6636 vdc_get_extvtoc_convert}, 6637 {VD_OP_SET_VTOC, DKIOCSEXTVTOC, sizeof (vd_vtoc_t), 6638 vdc_set_extvtoc_convert}, 6639 {VD_OP_GET_DISKGEOM, DKIOCGGEOM, sizeof (vd_geom_t), 6640 vdc_get_geom_convert}, 6641 {VD_OP_GET_DISKGEOM, DKIOCG_PHYGEOM, sizeof (vd_geom_t), 6642 vdc_get_geom_convert}, 6643 {VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t), 6644 vdc_get_geom_convert}, 6645 {VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t), 6646 vdc_set_geom_convert}, 6647 {VD_OP_GET_EFI, DKIOCGETEFI, 0, 6648 vdc_get_efi_convert}, 6649 {VD_OP_SET_EFI, DKIOCSETEFI, 0, 6650 vdc_set_efi_convert}, 6651 6652 /* DIOCTL_RWCMD is converted to a read or a write */ 6653 {0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL}, 6654 6655 /* mhd(7I) non-shared multihost disks ioctls */ 6656 {0, MHIOCTKOWN, 0, vdc_null_copy_func}, 6657 {0, MHIOCRELEASE, 0, vdc_null_copy_func}, 6658 {0, MHIOCSTATUS, 0, vdc_null_copy_func}, 6659 {0, MHIOCQRESERVE, 0, vdc_null_copy_func}, 6660 6661 /* mhd(7I) shared multihost disks ioctls */ 6662 {0, MHIOCGRP_INKEYS, 0, vdc_null_copy_func}, 6663 {0, MHIOCGRP_INRESV, 0, vdc_null_copy_func}, 6664 {0, MHIOCGRP_REGISTER, 0, vdc_null_copy_func}, 6665 {0, MHIOCGRP_RESERVE, 0, vdc_null_copy_func}, 6666 {0, MHIOCGRP_PREEMPTANDABORT, 0, vdc_null_copy_func}, 6667 {0, MHIOCGRP_REGISTERANDIGNOREKEY, 0, vdc_null_copy_func}, 6668 6669 /* mhd(7I) failfast ioctl */ 6670 {0, MHIOCENFAILFAST, 0, vdc_null_copy_func}, 6671 6672 /* 6673 * These particular ioctls are not sent to the server - vdc fakes up 6674 * the necessary info. 6675 */ 6676 {0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func}, 6677 {0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func}, 6678 {0, USCSICMD, sizeof (struct uscsi_cmd), vdc_null_copy_func}, 6679 {0, DKIOCPARTITION, 0, vdc_null_copy_func }, 6680 {0, DKIOCGAPART, 0, vdc_null_copy_func }, 6681 {0, DKIOCREMOVABLE, 0, vdc_null_copy_func}, 6682 {0, CDROMREADOFFSET, 0, vdc_null_copy_func} 6683 }; 6684 6685 /* 6686 * This function handles ioctl requests from the vd_efi_alloc_and_read() 6687 * function and forwards them to the vdisk.
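 * * This is the callback that vdc provides to the EFI module; the wiring * is visible in vdc_validate_geometry() below: * *	VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); *	rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe);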
6688 */ 6689 static int 6690 vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg) 6691 { 6692 vdc_t *vdc = (vdc_t *)vdisk; 6693 dev_t dev; 6694 int rval; 6695 6696 dev = makedevice(ddi_driver_major(vdc->dip), 6697 VD_MAKE_DEV(vdc->instance, 0)); 6698 6699 return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval)); 6700 } 6701 6702 /* 6703 * Function: 6704 * vd_process_ioctl() 6705 * 6706 * Description: 6707 * This routine processes disk specific ioctl calls 6708 * 6709 * Arguments: 6710 * dev - the device number 6711 * cmd - the operation [dkio(7I)] to be processed 6712 * arg - pointer to user provided structure 6713 * (contains data to be set or reference parameter for get) 6714 * mode - bit flag, indicating open settings, 32/64 bit type, etc 6715 * rvalp - pointer to return value for calling process. 6716 * 6717 * Return Code: 6718 * 0 6719 * EFAULT 6720 * ENXIO 6721 * EIO 6722 * ENOTSUP 6723 */ 6724 static int 6725 vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp) 6726 { 6727 int instance = VDCUNIT(dev); 6728 vdc_t *vdc = NULL; 6729 int rv = -1; 6730 int idx = 0; /* index into dk_ioctl[] */ 6731 size_t len = 0; /* #bytes to send to vds */ 6732 size_t alloc_len = 0; /* #bytes to allocate mem for */ 6733 caddr_t mem_p = NULL; 6734 size_t nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0])); 6735 vdc_dk_ioctl_t *iop; 6736 6737 vdc = ddi_get_soft_state(vdc_state, instance); 6738 if (vdc == NULL) { 6739 cmn_err(CE_NOTE, "![%d] Could not get soft state structure", 6740 instance); 6741 return (ENXIO); 6742 } 6743 6744 DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n", 6745 instance, cmd, dev, ddi_model_convert_from(mode & FMODELS)); 6746 6747 if (rvalp != NULL) { 6748 /* the return value of the ioctl is 0 by default */ 6749 *rvalp = 0; 6750 } 6751 6752 /* 6753 * Validate the ioctl operation to be performed. 6754 * 6755 * If we have looped through the array without finding a match then we 6756 * don't support this ioctl. 
6757 */ 6758 for (idx = 0; idx < nioctls; idx++) { 6759 if (cmd == dk_ioctl[idx].cmd) 6760 break; 6761 } 6762 6763 if (idx >= nioctls) { 6764 DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n", 6765 vdc->instance, cmd); 6766 return (ENOTSUP); 6767 } 6768 6769 iop = &(dk_ioctl[idx]); 6770 6771 if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) { 6772 /* size is not fixed for EFI ioctls, it depends on ioctl arg */ 6773 dk_efi_t dk_efi; 6774 6775 rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode); 6776 if (rv != 0) 6777 return (EFAULT); 6778 6779 len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length; 6780 } else { 6781 len = iop->nbytes; 6782 } 6783 6784 /* check if the ioctl is applicable */ 6785 switch (cmd) { 6786 case CDROMREADOFFSET: 6787 case DKIOCREMOVABLE: 6788 return (ENOTTY); 6789 6790 case USCSICMD: 6791 case MHIOCTKOWN: 6792 case MHIOCSTATUS: 6793 case MHIOCQRESERVE: 6794 case MHIOCRELEASE: 6795 case MHIOCGRP_INKEYS: 6796 case MHIOCGRP_INRESV: 6797 case MHIOCGRP_REGISTER: 6798 case MHIOCGRP_RESERVE: 6799 case MHIOCGRP_PREEMPTANDABORT: 6800 case MHIOCGRP_REGISTERANDIGNOREKEY: 6801 case MHIOCENFAILFAST: 6802 if (vdc->cinfo == NULL) 6803 return (ENXIO); 6804 if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS) 6805 return (ENOTTY); 6806 break; 6807 6808 case DIOCTL_RWCMD: 6809 if (vdc->cinfo == NULL) 6810 return (ENXIO); 6811 if (vdc->cinfo->dki_ctype != DKC_DIRECT) 6812 return (ENOTTY); 6813 break; 6814 6815 case DKIOCINFO: 6816 if (vdc->cinfo == NULL) 6817 return (ENXIO); 6818 break; 6819 6820 case DKIOCGMEDIAINFO: 6821 if (vdc->minfo == NULL) 6822 return (ENXIO); 6823 if (vdc_check_capacity(vdc) != 0) 6824 /* disk capacity is not available */ 6825 return (EIO); 6826 break; 6827 } 6828 6829 /* 6830 * Deal with ioctls which require processing other than 6831 * converting ioctl arguments and sending a corresponding 6832 * VD operation. 6833 */ 6834 switch (cmd) { 6835 6836 case USCSICMD: 6837 { 6838 return (vdc_uscsi_cmd(vdc, arg, mode)); 6839 } 6840 6841 case MHIOCTKOWN: 6842 { 6843 mutex_enter(&vdc->ownership_lock); 6844 /* 6845 * We have to set VDC_OWNERSHIP_WANTED now so that the ownership 6846 * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset 6847 * while we are processing the ioctl. 6848 */ 6849 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED); 6850 6851 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6852 VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode); 6853 if (rv == 0) { 6854 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED | 6855 VDC_OWNERSHIP_GRANTED); 6856 } else { 6857 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6858 } 6859 mutex_exit(&vdc->ownership_lock); 6860 return (rv); 6861 } 6862 6863 case MHIOCRELEASE: 6864 { 6865 mutex_enter(&vdc->ownership_lock); 6866 rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode); 6867 if (rv == 0) { 6868 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6869 } 6870 mutex_exit(&vdc->ownership_lock); 6871 return (rv); 6872 } 6873 6874 case MHIOCSTATUS: 6875 { 6876 uint64_t status; 6877 6878 rv = vdc_access_get(vdc, &status, mode); 6879 if (rv == 0 && rvalp != NULL) 6880 *rvalp = (status & VD_ACCESS_ALLOWED)?
0 : 1; 6881 return (rv); 6882 } 6883 6884 case MHIOCQRESERVE: 6885 { 6886 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode); 6887 return (rv); 6888 } 6889 6890 case MHIOCGRP_INKEYS: 6891 { 6892 return (vdc_mhd_inkeys(vdc, arg, mode)); 6893 } 6894 6895 case MHIOCGRP_INRESV: 6896 { 6897 return (vdc_mhd_inresv(vdc, arg, mode)); 6898 } 6899 6900 case MHIOCGRP_REGISTER: 6901 { 6902 return (vdc_mhd_register(vdc, arg, mode)); 6903 } 6904 6905 case MHIOCGRP_RESERVE: 6906 { 6907 return (vdc_mhd_reserve(vdc, arg, mode)); 6908 } 6909 6910 case MHIOCGRP_PREEMPTANDABORT: 6911 { 6912 return (vdc_mhd_preemptabort(vdc, arg, mode)); 6913 } 6914 6915 case MHIOCGRP_REGISTERANDIGNOREKEY: 6916 { 6917 return (vdc_mhd_registerignore(vdc, arg, mode)); 6918 } 6919 6920 case MHIOCENFAILFAST: 6921 { 6922 rv = vdc_failfast(vdc, arg, mode); 6923 return (rv); 6924 } 6925 6926 case DIOCTL_RWCMD: 6927 { 6928 return (vdc_dioctl_rwcmd(dev, arg, mode)); 6929 } 6930 6931 case DKIOCGAPART: 6932 { 6933 return (vdc_dkio_gapart(vdc, arg, mode)); 6934 } 6935 6936 case DKIOCPARTITION: 6937 { 6938 return (vdc_dkio_partition(vdc, arg, mode)); 6939 } 6940 6941 case DKIOCINFO: 6942 { 6943 struct dk_cinfo cinfo; 6944 6945 bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo)); 6946 cinfo.dki_partition = VDCPART(dev); 6947 6948 rv = ddi_copyout(&cinfo, (void *)arg, 6949 sizeof (struct dk_cinfo), mode); 6950 if (rv != 0) 6951 return (EFAULT); 6952 6953 return (0); 6954 } 6955 6956 case DKIOCGMEDIAINFO: 6957 { 6958 ASSERT(vdc->vdisk_size != 0); 6959 ASSERT(vdc->minfo->dki_capacity != 0); 6960 rv = ddi_copyout(vdc->minfo, (void *)arg, 6961 sizeof (struct dk_minfo), mode); 6962 if (rv != 0) 6963 return (EFAULT); 6964 6965 return (0); 6966 } 6967 6968 case DKIOCFLUSHWRITECACHE: 6969 { 6970 struct dk_callback *dkc = 6971 (struct dk_callback *)(uintptr_t)arg; 6972 vdc_dk_arg_t *dkarg = NULL; 6973 6974 DMSG(vdc, 1, "[%d] Flush W$: mode %x\n", 6975 instance, mode); 6976 6977 /* 6978 * If arg is NULL, then there is no callback function 6979 * registered and the call operates synchronously; we 6980 * break and continue with the rest of the function and 6981 * wait for vds to return (i.e. after the request to 6982 * vds returns successfully, all writes completed prior 6983 * to the ioctl will have been flushed from the disk 6984 * write cache to persistent media). 6985 * 6986 * If a callback function is registered, we dispatch 6987 * the request on a task queue and return immediately. 6988 * The callback will deal with informing the calling 6989 * thread that the flush request is completed. 6990 */ 6991 if (dkc == NULL) 6992 break; 6993 6994 /* 6995 * the asynchronous callback is only supported if 6996 * invoked from within the kernel 6997 */ 6998 if ((mode & FKIOCTL) == 0) 6999 return (ENOTSUP); 7000 7001 dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP); 7002 7003 dkarg->mode = mode; 7004 dkarg->dev = dev; 7005 bcopy(dkc, &dkarg->dkc, sizeof (*dkc)); 7006 7007 mutex_enter(&vdc->lock); 7008 vdc->dkio_flush_pending++; 7009 dkarg->vdc = vdc; 7010 mutex_exit(&vdc->lock); 7011 7012 /* put the request on a task queue */ 7013 rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb, 7014 (void *)dkarg, DDI_SLEEP); 7015 if (rv == NULL) { 7016 /* clean up if dispatch fails */ 7017 mutex_enter(&vdc->lock); 7018 vdc->dkio_flush_pending--; 7019 mutex_exit(&vdc->lock); 7020 kmem_free(dkarg, sizeof (vdc_dk_arg_t)); 7021 } 7022 7023 return (rv == NULL ?
ENOMEM : 0); 7024 } 7025 } 7026 7027 /* catch programming error in vdc - should be a VD_OP_XXX ioctl */ 7028 ASSERT(iop->op != 0); 7029 7030 /* check if the vDisk server handles the operation for this vDisk */ 7031 if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) { 7032 DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n", 7033 vdc->instance, iop->op); 7034 return (ENOTSUP); 7035 } 7036 7037 /* LDC requires that the memory being mapped is 8-byte aligned */ 7038 alloc_len = P2ROUNDUP(len, sizeof (uint64_t)); 7039 DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n", 7040 instance, len, alloc_len); 7041 7042 if (alloc_len > 0) 7043 mem_p = kmem_zalloc(alloc_len, KM_SLEEP); 7044 7045 /* 7046 * Call the conversion function for this ioctl which, if necessary, 7047 * converts from the Solaris format to the format ARC'ed 7048 * as part of the vDisk protocol (FWARC 2006/195) 7049 */ 7050 ASSERT(iop->convert != NULL); 7051 rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN); 7052 if (rv != 0) { 7053 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7054 instance, rv, cmd); 7055 if (mem_p != NULL) 7056 kmem_free(mem_p, alloc_len); 7057 return (rv); 7058 } 7059 7060 /* 7061 * send request to vds to service the ioctl. 7062 */ 7063 rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len, 7064 VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode, 7065 VIO_both_dir, B_TRUE); 7066 7067 if (rv != 0) { 7068 /* 7069 * This is not necessarily an error. The ioctl could 7070 * be returning a value such as ENOTTY to indicate 7071 * that the ioctl is not applicable. 7072 */ 7073 DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n", 7074 instance, rv, cmd); 7075 if (mem_p != NULL) 7076 kmem_free(mem_p, alloc_len); 7077 7078 return (rv); 7079 } 7080 7081 /* 7082 * Call the conversion function (if it exists) for this ioctl 7083 * which converts from the format ARC'ed as part of the vDisk 7084 * protocol (FWARC 2006/195) back to a format understood by 7085 * the rest of Solaris. 
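 * * Note that this is the same routine which was invoked with VD_COPYIN * before the request was sent; each conversion function uses the dir * argument to select which half of the round trip to perform.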
7086 */ 7087 rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT); 7088 if (rv != 0) { 7089 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7090 instance, rv, cmd); 7091 if (mem_p != NULL) 7092 kmem_free(mem_p, alloc_len); 7093 return (rv); 7094 } 7095 7096 if (mem_p != NULL) 7097 kmem_free(mem_p, alloc_len); 7098 7099 return (rv); 7100 } 7101 7102 /* 7103 * Function: 7104 * vdc_null_copy_func() 7105 * 7106 * Description: 7107 * This is an empty conversion function used by ioctl calls which 7108 * do not need to convert the data being passed in/out to userland. 7109 */ 7110 static int 7111 vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir) 7112 { 7113 _NOTE(ARGUNUSED(vdc)) 7114 _NOTE(ARGUNUSED(from)) 7115 _NOTE(ARGUNUSED(to)) 7116 _NOTE(ARGUNUSED(mode)) 7117 _NOTE(ARGUNUSED(dir)) 7118 7119 return (0); 7120 } 7121 7122 static int 7123 vdc_get_wce_convert(vdc_t *vdc, void *from, void *to, 7124 int mode, int dir) 7125 { 7126 _NOTE(ARGUNUSED(vdc)) 7127 7128 if (dir == VD_COPYIN) 7129 return (0); /* nothing to do */ 7130 7131 if (ddi_copyout(from, to, sizeof (int), mode) != 0) 7132 return (EFAULT); 7133 7134 return (0); 7135 } 7136 7137 static int 7138 vdc_set_wce_convert(vdc_t *vdc, void *from, void *to, 7139 int mode, int dir) 7140 { 7141 _NOTE(ARGUNUSED(vdc)) 7142 7143 if (dir == VD_COPYOUT) 7144 return (0); /* nothing to do */ 7145 7146 if (ddi_copyin(from, to, sizeof (int), mode) != 0) 7147 return (EFAULT); 7148 7149 return (0); 7150 } 7151 7152 /* 7153 * Function: 7154 * vdc_get_vtoc_convert() 7155 * 7156 * Description: 7157 * This routine performs the necessary conversions from the DKIOCGVTOC 7158 * Solaris structure to the format defined in FWARC 2006/195. 7159 * 7160 * In the struct vtoc definition, the timestamp field is marked as not 7161 * supported so it is not part of the vDisk protocol (FWARC 2006/195). 7162 * However SVM uses that field to check that it can write into the VTOC, 7163 * so we fake up the info for that field. 7164 * 7165 * Arguments: 7166 * vdc - the vDisk client 7167 * from - the buffer containing the data to be copied from 7168 * to - the buffer to be copied to 7169 * mode - flags passed to ioctl() call 7170 * dir - the "direction" of the copy - VD_COPYIN or VD_COPYOUT 7171 * 7172 * Return Code: 7173 * 0 - Success 7174 * ENXIO - incorrect buffer passed in. 7175 * EFAULT - ddi_copyout routine encountered an error.
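 * EOVERFLOW - disk is too large for a VTOC; the extended VTOC or EFI * ioctls must be used instead (see VD_OLDVTOC_LIMIT).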
7175 */ 7176 static int 7177 vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7178 { 7179 int i; 7180 struct vtoc vtoc; 7181 struct vtoc32 vtoc32; 7182 struct extvtoc evtoc; 7183 int rv; 7184 7185 if (dir != VD_COPYOUT) 7186 return (0); /* nothing to do */ 7187 7188 if ((from == NULL) || (to == NULL)) 7189 return (ENXIO); 7190 7191 if (vdc->vdisk_size > VD_OLDVTOC_LIMIT) 7192 return (EOVERFLOW); 7193 7194 VD_VTOC2VTOC((vd_vtoc_t *)from, &evtoc); 7195 7196 /* fake the VTOC timestamp field */ 7197 for (i = 0; i < V_NUMPAR; i++) { 7198 evtoc.timestamp[i] = vdc->vtoc->timestamp[i]; 7199 } 7200 7201 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 7202 /* LINTED E_ASSIGN_NARROW_CONV */ 7203 extvtoctovtoc32(evtoc, vtoc32); 7204 rv = ddi_copyout(&vtoc32, to, sizeof (vtoc32), mode); 7205 if (rv != 0) 7206 rv = EFAULT; 7207 } else { 7208 extvtoctovtoc(evtoc, vtoc); 7209 rv = ddi_copyout(&vtoc, to, sizeof (vtoc), mode); 7210 if (rv != 0) 7211 rv = EFAULT; 7212 } 7213 7214 return (rv); 7215 } 7216 7217 /* 7218 * Function: 7219 * vdc_set_vtoc_convert() 7220 * 7221 * Description: 7222 * This routine performs the necessary conversions from the DKIOCSVTOC 7223 * Solaris structure to the format defined in FWARC 2006/195. 7224 * 7225 * Arguments: 7226 * vdc - the vDisk client 7227 * from - Buffer with data 7228 * to - Buffer where data is to be copied to 7229 * mode - flags passed to ioctl 7230 * dir - direction of copy (in or out) 7231 * 7232 * Return Code: 7233 * 0 - Success 7234 * ENXIO - Invalid buffer passed in 7235 * EFAULT - ddi_copyin of data failed * EOVERFLOW - disk is too large for a VTOC (see VD_OLDVTOC_LIMIT) 7236 */ 7237 static int 7238 vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7239 { 7240 void *uvtoc; 7241 struct vtoc vtoc; 7242 struct vtoc32 vtoc32; 7243 struct extvtoc evtoc; 7244 int i, rv; 7245 7246 if ((from == NULL) || (to == NULL)) 7247 return (ENXIO); 7248 7249 if (vdc->vdisk_size > VD_OLDVTOC_LIMIT) 7250 return (EOVERFLOW); 7251 7252 uvtoc = (dir == VD_COPYIN)? from : to; 7253 7254 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 7255 rv = ddi_copyin(uvtoc, &vtoc32, sizeof (vtoc32), mode); 7256 if (rv != 0) 7257 return (EFAULT); 7258 vtoc32toextvtoc(vtoc32, evtoc); 7259 } else { 7260 rv = ddi_copyin(uvtoc, &vtoc, sizeof (vtoc), mode); 7261 if (rv != 0) 7262 return (EFAULT); 7263 vtoctoextvtoc(vtoc, evtoc); 7264 } 7265 7266 if (dir == VD_COPYOUT) { 7267 /* 7268 * The disk label may have changed. Revalidate the disk 7269 * geometry. This will also update the device nodes. 7270 */ 7271 vdc_validate(vdc); 7272 7273 /* 7274 * We also need to keep track of the timestamp fields.
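 * The timestamps are not part of the vDisk protocol; the copy kept * here is what vdc_get_vtoc_convert() hands back when the VTOC is * subsequently read.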
7275 */ 7276 for (i = 0; i < V_NUMPAR; i++) { 7277 vdc->vtoc->timestamp[i] = evtoc.timestamp[i]; 7278 } 7279 7280 } else { 7281 VTOC2VD_VTOC(&evtoc, (vd_vtoc_t *)to); 7282 } 7283 7284 return (0); 7285 } 7286 7287 static int 7288 vdc_get_extvtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7289 { 7290 int i, rv; 7291 struct extvtoc evtoc; 7292 7293 if (dir != VD_COPYOUT) 7294 return (0); /* nothing to do */ 7295 7296 if ((from == NULL) || (to == NULL)) 7297 return (ENXIO); 7298 7299 VD_VTOC2VTOC((vd_vtoc_t *)from, &evtoc); 7300 7301 /* fake the VTOC timestamp field */ 7302 for (i = 0; i < V_NUMPAR; i++) { 7303 evtoc.timestamp[i] = vdc->vtoc->timestamp[i]; 7304 } 7305 7306 rv = ddi_copyout(&evtoc, to, sizeof (struct extvtoc), mode); 7307 if (rv != 0) 7308 rv = EFAULT; 7309 7310 return (rv); 7311 } 7312 7313 static int 7314 vdc_set_extvtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7315 { 7316 void *uvtoc; 7317 struct extvtoc evtoc; 7318 int i, rv; 7319 7320 if ((from == NULL) || (to == NULL)) 7321 return (ENXIO); 7322 7323 uvtoc = (dir == VD_COPYIN)? from : to; 7324 7325 rv = ddi_copyin(uvtoc, &evtoc, sizeof (struct extvtoc), mode); 7326 if (rv != 0) 7327 return (EFAULT); 7328 7329 if (dir == VD_COPYOUT) { 7330 /* 7331 * The disk label may have changed. Revalidate the disk 7332 * geometry. This will also update the device nodes. 7333 */ 7334 vdc_validate(vdc); 7335 7336 /* 7337 * We also need to keep track of the timestamp fields. 7338 */ 7339 for (i = 0; i < V_NUMPAR; i++) { 7340 vdc->vtoc->timestamp[i] = evtoc.timestamp[i]; 7341 } 7342 7343 } else { 7344 VTOC2VD_VTOC(&evtoc, (vd_vtoc_t *)to); 7345 } 7346 7347 return (0); 7348 } 7349 7350 /* 7351 * Function: 7352 * vdc_get_geom_convert() 7353 * 7354 * Description: 7355 * This routine performs the necessary conversions from the DKIOCGGEOM, 7356 * DKIOCG_PHYGEOM and DKIOCG_VIRTGEOM Solaris structures to the format 7357 * defined in FWARC 2006/195 7358 * 7359 * Arguments: 7360 * vdc - the vDisk client 7361 * from - Buffer with data 7362 * to - Buffer where data is to be copied to 7363 * mode - flags passed to ioctl 7364 * dir - direction of copy (in or out) 7365 * 7366 * Return Code: 7367 * 0 - Success 7368 * ENXIO - Invalid buffer passed in 7369 * EFAULT - ddi_copyout of data failed 7370 */ 7371 static int 7372 vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7373 { 7374 _NOTE(ARGUNUSED(vdc)) 7375 7376 struct dk_geom geom; 7377 int copy_len = sizeof (struct dk_geom); 7378 int rv = 0; 7379 7380 if (dir != VD_COPYOUT) 7381 return (0); /* nothing to do */ 7382 7383 if ((from == NULL) || (to == NULL)) 7384 return (ENXIO); 7385 7386 VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom); 7387 rv = ddi_copyout(&geom, to, copy_len, mode); 7388 if (rv != 0) 7389 rv = EFAULT; 7390 7391 return (rv); 7392 } 7393 7394 /* 7395 * Function: 7396 * vdc_set_geom_convert() 7397 * 7398 * Description: 7399 * This routine performs the necessary conversions from the DKIOCSGEOM 7400 * Solaris structure to the format defined in FWARC 2006/195.
7401 * 7402 * Arguments: 7403 * vdc - the vDisk client 7404 * from - Buffer with data 7405 * to - Buffer where data is to be copied to 7406 * mode - flags passed to ioctl 7407 * dir - direction of copy (in or out) 7408 * 7409 * Return Code: 7410 * 0 - Success 7411 * ENXIO - Invalid buffer passed in 7412 * EFAULT - ddi_copyin of data failed 7413 */ 7414 static int 7415 vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7416 { 7417 _NOTE(ARGUNUSED(vdc)) 7418 7419 vd_geom_t vdgeom; 7420 void *tmp_mem = NULL; 7421 int copy_len = sizeof (struct dk_geom); 7422 int rv = 0; 7423 7424 if (dir != VD_COPYIN) 7425 return (0); /* nothing to do */ 7426 7427 if ((from == NULL) || (to == NULL)) 7428 return (ENXIO); 7429 7430 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 7431 7432 rv = ddi_copyin(from, tmp_mem, copy_len, mode); 7433 if (rv != 0) { 7434 kmem_free(tmp_mem, copy_len); 7435 return (EFAULT); 7436 } 7437 DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom); 7438 bcopy(&vdgeom, to, sizeof (vdgeom)); 7439 kmem_free(tmp_mem, copy_len); 7440 7441 return (0); 7442 } 7443 7444 static int 7445 vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7446 { 7447 _NOTE(ARGUNUSED(vdc)) 7448 7449 vd_efi_t *vd_efi; 7450 dk_efi_t dk_efi; 7451 int rv = 0; 7452 void *uaddr; 7453 7454 if ((from == NULL) || (to == NULL)) 7455 return (ENXIO); 7456 7457 if (dir == VD_COPYIN) { 7458 7459 vd_efi = (vd_efi_t *)to; 7460 7461 rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode); 7462 if (rv != 0) 7463 return (EFAULT); 7464 7465 vd_efi->lba = dk_efi.dki_lba; 7466 vd_efi->length = dk_efi.dki_length; 7467 bzero(vd_efi->data, vd_efi->length); 7468 7469 } else { 7470 7471 rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode); 7472 if (rv != 0) 7473 return (EFAULT); 7474 7475 uaddr = dk_efi.dki_data; 7476 7477 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP); 7478 7479 VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi); 7480 7481 rv = ddi_copyout(dk_efi.dki_data, uaddr, dk_efi.dki_length, 7482 mode); 7483 /* free the temporary buffer whether or not the copyout succeeded */ 7484 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7485 if (rv != 0) 7486 return (EFAULT); 7487 } 7488 7489 return (0); 7490 } 7491 7492 static int 7493 vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7494 { 7495 _NOTE(ARGUNUSED(vdc)) 7496 7497 dk_efi_t dk_efi; 7498 void *uaddr; 7499 7500 if (dir == VD_COPYOUT) { 7501 /* 7502 * The disk label may have changed. Revalidate the disk 7503 * geometry. This will also update the device nodes.
7504 */ 7505 vdc_validate(vdc); 7506 return (0); 7507 } 7508 7509 if ((from == NULL) || (to == NULL)) 7510 return (ENXIO); 7511 7512 if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0) 7513 return (EFAULT); 7514 7515 uaddr = dk_efi.dki_data; 7516 7517 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP); 7518 7519 if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) { /* don't leak the temporary buffer on the error path */ kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7520 return (EFAULT); } 7521 7522 DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to); 7523 7524 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7525 7526 return (0); 7527 } 7528 7529 7530 /* -------------------------------------------------------------------------- */ 7531 7532 /* 7533 * Function: 7534 * vdc_create_fake_geometry() 7535 * 7536 * Description: 7537 * This routine fakes up the disk info needed for some DKIO ioctls such 7538 * as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do] 7539 * 7540 * Note: This function must not be called until the vDisk attributes have 7541 * been exchanged as part of the handshake with the vDisk server. 7542 * 7543 * Arguments: 7544 * vdc - soft state pointer for this instance of the device driver. 7545 * 7546 * Return Code: 7547 * none. 7548 */ 7549 static void 7550 vdc_create_fake_geometry(vdc_t *vdc) 7551 { 7552 ASSERT(vdc != NULL); 7553 ASSERT(vdc->max_xfer_sz != 0); 7554 7555 /* 7556 * DKIOCINFO support 7557 */ 7558 if (vdc->cinfo == NULL) 7559 vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP); 7560 7561 (void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME); 7562 (void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME); 7563 /* max_xfer_sz is #blocks so we don't need to divide by DEV_BSIZE */ 7564 vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz; 7565 7566 /* 7567 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD 7568 * operation is supported, otherwise the controller type is DKC_DIRECT. 7569 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the 7570 * controller type is always DKC_DIRECT in that case. 7571 * 7572 * If the virtual disk is backed by a physical CD/DVD device or 7573 * an ISO image, modify the controller type to indicate this. 7574 */ 7575 switch (vdc->vdisk_media) { 7576 case VD_MEDIA_CD: 7577 case VD_MEDIA_DVD: 7578 vdc->cinfo->dki_ctype = DKC_CDROM; 7579 break; 7580 case VD_MEDIA_FIXED: 7581 if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD)) 7582 vdc->cinfo->dki_ctype = DKC_SCSI_CCS; 7583 else 7584 vdc->cinfo->dki_ctype = DKC_DIRECT; 7585 break; 7586 default: 7587 /* in the case of v1.0 we default to a fixed disk */ 7588 vdc->cinfo->dki_ctype = DKC_DIRECT; 7589 break; 7590 } 7591 vdc->cinfo->dki_flags = DKI_FMTVOL; 7592 vdc->cinfo->dki_cnum = 0; 7593 vdc->cinfo->dki_addr = 0; 7594 vdc->cinfo->dki_space = 0; 7595 vdc->cinfo->dki_prio = 0; 7596 vdc->cinfo->dki_vec = 0; 7597 vdc->cinfo->dki_unit = vdc->instance; 7598 vdc->cinfo->dki_slave = 0; 7599 /* 7600 * The partition number will be created on the fly depending on the 7601 * actual slice (i.e. minor node) that is used to request the data.
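 * * For example, the DKIOCINFO case in vd_process_ioctl() fills it in at * copyout time with: * *	cinfo.dki_partition = VDCPART(dev);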
7602 */ 7603 vdc->cinfo->dki_partition = 0; 7604 7605 /* 7606 * DKIOCGMEDIAINFO support 7607 */ 7608 if (vdc->minfo == NULL) 7609 vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP); 7610 7611 if (vio_ver_is_supported(vdc->ver, 1, 1)) { 7612 vdc->minfo->dki_media_type = 7613 VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media); 7614 } else { 7615 vdc->minfo->dki_media_type = DK_FIXED_DISK; 7616 } 7617 7618 vdc->minfo->dki_capacity = vdc->vdisk_size; 7619 vdc->minfo->dki_lbsize = vdc->block_size; 7620 } 7621 7622 static ushort_t 7623 vdc_lbl2cksum(struct dk_label *label) 7624 { 7625 int count; 7626 ushort_t sum, *sp; 7627 7628 count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; 7629 sp = (ushort_t *)label; 7630 sum = 0; 7631 while (count--) { 7632 sum ^= *sp++; 7633 } 7634 7635 return (sum); 7636 } 7637 7638 static void 7639 vdc_update_size(vdc_t *vdc, size_t dsk_size, size_t blk_size, size_t xfr_size) 7640 { 7641 vd_err_stats_t *stp; 7642 7643 ASSERT(MUTEX_HELD(&vdc->lock)); 7644 ASSERT(xfr_size != 0); 7645 7646 /* 7647 * If the disk size is unknown or sizes are unchanged then don't 7648 * update anything. 7649 */ 7650 if (dsk_size == VD_SIZE_UNKNOWN || dsk_size == 0 || 7651 (blk_size == vdc->block_size && dsk_size == vdc->vdisk_size && 7652 xfr_size == vdc->max_xfer_sz)) 7653 return; 7654 7655 /* 7656 * We don't know at compile time what the vDisk server will think 7657 * are good values but we apply a large (arbitrary) upper bound to 7658 * prevent memory exhaustion in vdc if it was allocating a DRing 7659 * based on huge values sent by the server. We will probably never 7660 * exceed this unless the message was garbage. 7661 */ 7662 if ((xfr_size * blk_size) > (PAGESIZE * DEV_BSIZE)) { 7663 DMSG(vdc, 0, "[%d] vds block transfer size too big;" 7664 " using max supported by vdc", vdc->instance); 7665 xfr_size = maxphys / DEV_BSIZE; 7666 dsk_size = (dsk_size * blk_size) / DEV_BSIZE; 7667 blk_size = DEV_BSIZE; 7668 } 7669 7670 vdc->max_xfer_sz = xfr_size; 7671 vdc->block_size = blk_size; 7672 vdc->vdisk_size = dsk_size; 7673 7674 stp = (vd_err_stats_t *)vdc->err_stats->ks_data; 7675 stp->vd_capacity.value.ui64 = dsk_size * blk_size; 7676 7677 vdc->minfo->dki_capacity = dsk_size; 7678 vdc->minfo->dki_lbsize = (uint_t)blk_size; 7679 } 7680 7681 /* 7682 * Function: 7683 * vdc_validate_geometry 7684 * 7685 * Description: 7686 * This routine discovers the label and geometry of the disk. It stores 7687 * the disk label and related information in the vdc structure. If it 7688 * fails to validate the geometry or to discover the disk label then 7689 * the label is marked as unknown (VD_DISK_LABEL_UNK). 7690 * 7691 * Arguments: 7692 * vdc - soft state pointer for this instance of the device driver. 7693 * 7694 * Return Code: 7695 * 0 - success. 7696 * EINVAL - unknown disk label. 7697 * ENOTSUP - geometry not applicable (EFI label). 7698 * EIO - error accessing the disk. 7699 */ 7700 static int 7701 vdc_validate_geometry(vdc_t *vdc) 7702 { 7703 buf_t *buf; /* BREAD requests need to be in a buf_t structure */ 7704 dev_t dev; 7705 int rv, rval; 7706 struct dk_label label; 7707 struct dk_geom geom; 7708 struct extvtoc vtoc; 7709 efi_gpt_t *gpt; 7710 efi_gpe_t *gpe; 7711 vd_efi_dev_t edev; 7712 7713 ASSERT(vdc != NULL); 7714 ASSERT(vdc->vtoc != NULL && vdc->geom != NULL); 7715 ASSERT(MUTEX_HELD(&vdc->lock)); 7716 7717 mutex_exit(&vdc->lock); 7718 /* 7719 * Check the disk capacity in case it has changed. If that fails then 7720 * we proceed and we will be using the disk size we currently have.
7721 */ 7722 (void) vdc_check_capacity(vdc); 7723 dev = makedevice(ddi_driver_major(vdc->dip), 7724 VD_MAKE_DEV(vdc->instance, 0)); 7725 7726 rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval); 7727 if (rv == 0) 7728 rv = vd_process_ioctl(dev, DKIOCGEXTVTOC, (caddr_t)&vtoc, 7729 FKIOCTL, &rval); 7730 7731 if (rv == ENOTSUP) { 7732 /* 7733 * If the device does not support VTOC then we try 7734 * to read an EFI label. 7735 * 7736 * We need to know the block size and the disk size to 7737 * be able to read an EFI label. 7738 */ 7739 if (vdc->vdisk_size == 0) { 7740 mutex_enter(&vdc->lock); 7741 vdc_store_label_unk(vdc); 7742 return (EIO); 7743 } 7744 7745 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 7746 7747 rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe); 7748 7749 if (rv) { 7750 DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)", 7751 vdc->instance, rv); 7752 mutex_enter(&vdc->lock); 7753 vdc_store_label_unk(vdc); 7754 return (EIO); 7755 } 7756 7757 mutex_enter(&vdc->lock); 7758 vdc_store_label_efi(vdc, gpt, gpe); 7759 vd_efi_free(&edev, gpt, gpe); 7760 return (ENOTSUP); 7761 } 7762 7763 if (rv != 0) { 7764 DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)", 7765 vdc->instance, rv); 7766 mutex_enter(&vdc->lock); 7767 vdc_store_label_unk(vdc); 7768 if (rv != EINVAL) 7769 rv = EIO; 7770 return (rv); 7771 } 7772 7773 /* check that geometry and vtoc are valid */ 7774 if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 || 7775 vtoc.v_sanity != VTOC_SANE) { 7776 mutex_enter(&vdc->lock); 7777 vdc_store_label_unk(vdc); 7778 return (EINVAL); 7779 } 7780 7781 /* 7782 * We have a disk and a valid VTOC. However this does not mean 7783 * that the disk currently has a VTOC label. The returned VTOC may 7784 * be a default VTOC to be used for configuring the disk (this is 7785 * what is done for disk images). So we read the label from the 7786 * beginning of the disk to ensure we really have a VTOC label. 7787 * 7788 * FUTURE: This could be the default way for reading the VTOC 7789 * from the disk as opposed to sending the VD_OP_GET_VTOC 7790 * to the server. This will be the default if vdc is implemented 7791 * on top of cmlb. 7792 */ 7793 7794 /* 7795 * A single-slice disk does not support reads using an absolute disk 7796 * offset, so we just rely on the DKIOCGVTOC ioctl in that case.
7797 */ 7798 if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) { 7799 mutex_enter(&vdc->lock); 7800 if (vtoc.v_nparts != 1) { 7801 vdc_store_label_unk(vdc); 7802 return (EINVAL); 7803 } 7804 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7805 return (0); 7806 } 7807 7808 if (vtoc.v_nparts != V_NUMPAR) { 7809 mutex_enter(&vdc->lock); 7810 vdc_store_label_unk(vdc); 7811 return (EINVAL); 7812 } 7813 7814 /* 7815 * Read disk label from start of disk 7816 */ 7817 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 7818 bioinit(buf); 7819 buf->b_un.b_addr = (caddr_t)&label; 7820 buf->b_bcount = DK_LABEL_SIZE; 7821 buf->b_flags = B_BUSY | B_READ; 7822 buf->b_dev = cmpdev(dev); 7823 rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)&label, 7824 DK_LABEL_SIZE, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir); 7825 if (rv) { 7826 DMSG(vdc, 1, "[%d] Failed to read disk block 0\n", 7827 vdc->instance); 7828 } else { 7829 rv = biowait(buf); 7830 biofini(buf); 7831 } 7832 kmem_free(buf, sizeof (buf_t)); 7833 7834 if (rv != 0 || label.dkl_magic != DKL_MAGIC || 7835 label.dkl_cksum != vdc_lbl2cksum(&label)) { 7836 DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n", 7837 vdc->instance); 7838 mutex_enter(&vdc->lock); 7839 vdc_store_label_unk(vdc); 7840 return (EINVAL); 7841 } 7842 7843 mutex_enter(&vdc->lock); 7844 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7845 return (0); 7846 } 7847 7848 /* 7849 * Function: 7850 * vdc_validate 7851 * 7852 * Description: 7853 * This routine discovers the label of the disk and creates the 7854 * appropriate device nodes if the label has changed. 7855 * 7856 * Arguments: 7857 * vdc - soft state pointer for this instance of the device driver. 7858 * 7859 * Return Code: 7860 * none. 7861 */ 7862 static void 7863 vdc_validate(vdc_t *vdc) 7864 { 7865 vd_disk_label_t old_label; 7866 vd_slice_t old_slice[V_NUMPAR]; 7867 int rv; 7868 7869 ASSERT(!MUTEX_HELD(&vdc->lock)); 7870 7871 mutex_enter(&vdc->lock); 7872 7873 /* save the current label and vtoc */ 7874 old_label = vdc->vdisk_label; 7875 bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR); 7876 7877 /* check the geometry */ 7878 (void) vdc_validate_geometry(vdc); 7879 7880 /* if the disk label has changed, update device nodes */ 7881 if (vdc->vdisk_label != old_label) { 7882 7883 if (vdc->vdisk_label == VD_DISK_LABEL_EFI) 7884 rv = vdc_create_device_nodes_efi(vdc); 7885 else 7886 rv = vdc_create_device_nodes_vtoc(vdc); 7887 7888 if (rv != 0) { 7889 DMSG(vdc, 0, "![%d] Failed to update device nodes", 7890 vdc->instance); 7891 } 7892 } 7893 7894 mutex_exit(&vdc->lock); 7895 } 7896 7897 static void 7898 vdc_validate_task(void *arg) 7899 { 7900 vdc_t *vdc = (vdc_t *)arg; 7901 7902 vdc_validate(vdc); 7903 7904 mutex_enter(&vdc->lock); 7905 ASSERT(vdc->validate_pending > 0); 7906 vdc->validate_pending--; 7907 mutex_exit(&vdc->lock); 7908 } 7909 7910 /* 7911 * Function: 7912 * vdc_setup_devid() 7913 * 7914 * Description: 7915 * This routine discovers the devid of a vDisk. It requests the devid of 7916 * the underlying device from the vDisk server, builds an encapsulated 7917 * devid based on the retrieved devid and registers that new devid with 7918 * the vDisk. 7919 * 7920 * Arguments: 7921 * vdc - soft state pointer for this instance of the device driver.
7922 * 7923 * Return Code: 7924 * 0 - A devid was successfully registered for the vDisk 7925 */ 7926 static int 7927 vdc_setup_devid(vdc_t *vdc) 7928 { 7929 int rv; 7930 vd_devid_t *vd_devid; 7931 size_t bufsize, bufid_len; 7932 7933 /* 7934 * We do not know in advance the size of the devid that the 7935 * server will return, but this size will be encoded into the 7936 * reply. So we do a first request using a default size, then we 7937 * check whether this size was large enough. If not then we do a second 7938 * request with the correct size returned by the server. Note that 7939 * ldc requires size to be 8-byte aligned. 7940 */ 7941 bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN), 7942 sizeof (uint64_t)); 7943 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7944 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7945 7946 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid, 7947 bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 7948 7949 DMSG(vdc, 2, "sync_op returned %d\n", rv); 7950 7951 if (rv) { 7952 kmem_free(vd_devid, bufsize); 7953 return (rv); 7954 } 7955 7956 if (vd_devid->length > bufid_len) { 7957 size_t new_bufsize; 7958 /* 7959 * The returned devid is larger than the buffer used. Try again 7960 * with a buffer of the right size. The length must be read 7961 * before the old buffer is freed. 7962 */ new_bufsize = P2ROUNDUP(VD_DEVID_SIZE(vd_devid->length), sizeof (uint64_t)); 7963 kmem_free(vd_devid, bufsize); bufsize = new_bufsize; 7964 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7965 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7966 7967 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, 7968 (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0, 7969 VIO_both_dir, B_TRUE); 7970 7971 if (rv) { 7972 kmem_free(vd_devid, bufsize); 7973 return (rv); 7974 } 7975 } 7976 7977 /* 7978 * The virtual disk should have the same device id as the one associated 7979 * with the physical disk it is mapped on, otherwise sharing a disk 7980 * between an LDom and a non-LDom may not work (for example for a shared 7981 * SVM disk set). 7982 * 7983 * The DDI framework does not allow creating a device id with any 7984 * type so we first create a device id of type DEVID_ENCAP and then 7985 * we restore the original type of the physical device.
7986 */ 7987 7988 DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length); 7989 7990 /* build an encapsulated devid based on the returned devid */ 7991 if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length, 7992 vd_devid->id, &vdc->devid) != DDI_SUCCESS) { 7993 DMSG(vdc, 1, "[%d] Failed to create devid\n", vdc->instance); 7994 kmem_free(vd_devid, bufsize); 7995 return (1); 7996 } 7997 7998 DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type); 7999 8000 ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS); 8001 8002 kmem_free(vd_devid, bufsize); 8003 8004 if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) { 8005 DMSG(vdc, 1, "[%d] Failed to register devid\n", vdc->instance); 8006 return (1); 8007 } 8008 8009 return (0); 8010 } 8011 8012 static void 8013 vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe) 8014 { 8015 int i, nparts; 8016 8017 ASSERT(MUTEX_HELD(&vdc->lock)); 8018 8019 vdc->vdisk_label = VD_DISK_LABEL_EFI; 8020 bzero(vdc->vtoc, sizeof (struct extvtoc)); 8021 bzero(vdc->geom, sizeof (struct dk_geom)); 8022 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 8023 8024 nparts = gpt->efi_gpt_NumberOfPartitionEntries; 8025 8026 for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) { 8027 8028 if (gpe[i].efi_gpe_StartingLBA == 0 || 8029 gpe[i].efi_gpe_EndingLBA == 0) { 8030 continue; 8031 } 8032 8033 vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA; 8034 vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA - 8035 gpe[i].efi_gpe_StartingLBA + 1; 8036 } 8037 8038 ASSERT(vdc->vdisk_size != 0); 8039 vdc->slice[VD_EFI_WD_SLICE].start = 0; 8040 vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size; 8041 8042 } 8043 8044 static void 8045 vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct extvtoc *vtoc) 8046 { 8047 int i; 8048 8049 ASSERT(MUTEX_HELD(&vdc->lock)); 8050 ASSERT(vdc->block_size == vtoc->v_sectorsz); 8051 8052 vdc->vdisk_label = VD_DISK_LABEL_VTOC; 8053 bcopy(vtoc, vdc->vtoc, sizeof (struct extvtoc)); 8054 bcopy(geom, vdc->geom, sizeof (struct dk_geom)); 8055 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 8056 8057 for (i = 0; i < vtoc->v_nparts; i++) { 8058 vdc->slice[i].start = vtoc->v_part[i].p_start; 8059 vdc->slice[i].nblocks = vtoc->v_part[i].p_size; 8060 } 8061 } 8062 8063 static void 8064 vdc_store_label_unk(vdc_t *vdc) 8065 { 8066 ASSERT(MUTEX_HELD(&vdc->lock)); 8067 8068 vdc->vdisk_label = VD_DISK_LABEL_UNK; 8069 bzero(vdc->vtoc, sizeof (struct extvtoc)); 8070 bzero(vdc->geom, sizeof (struct dk_geom)); 8071 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 8072 } 8073