/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * LDoms virtual disk client (vdc) device driver
 *
 * This driver runs on a guest logical domain and communicates with the virtual
 * disk server (vds) driver running on the service domain which is exporting
 * virtualized "disks" to the guest logical domain.
 *
 * The driver can be divided into four sections:
 *
 * 1) generic device driver housekeeping
 *	_init, _fini, attach, detach, ops structures, etc.
 *
 * 2) communication channel setup
 *	Setup the communications link over the LDC channel that vdc uses to
 *	talk to the vDisk server. Initialise the descriptor ring which
 *	allows the LDC clients to transfer data via memory mappings.
 *
 * 3) Support exported to upper layers (filesystems, etc)
 *	The upper layers call into vdc via strategy(9E) and DKIO(7I)
 *	ioctl calls. vdc either copies the data to be written into the
 *	descriptor ring or maps into the descriptor ring the buffer that
 *	will receive the data read by the vDisk server. It then sends a
 *	message to the vDisk server requesting it to complete the operation.
 *
 * 4) Handling responses from vDisk server.
 *	The vDisk server will ACK some or all of the messages vdc sends to it
 *	(this is configured during the handshake). Upon receipt of an ACK
 *	vdc will check the descriptor ring and signal to the upper layer
 *	code waiting on the IO.
 */
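/*
 * Illustration (not part of the original source): a minimal sketch of how an
 * upper layer drives the strategy(9E) path described in section 3 above.
 * The buf(9S) setup shown is roughly what physio(9F) does on behalf of a
 * read(2) caller; all values are assumptions for the example.
 *
 *	struct buf *bp = getrbuf(KM_SLEEP);
 *
 *	bp->b_edev = dev;		// minor number encodes unit + slice
 *	bp->b_flags = B_READ;
 *	bp->b_lblkno = 16;		// logical block, in 512-byte units
 *	bp->b_bcount = 8192;		// size, limited via vdc_min()
 *	bp->b_un.b_addr = kaddr;	// kernel buffer receiving the data
 *
 *	(void) vdc_strategy(bp);	// section 3: queue the VIO request
 *	(void) biowait(bp);		// section 4: woken by biodone() on ACK
 *	freerbuf(bp);
 */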
#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kstat.h>
#include <sys/mach_descrip.h>
#include <sys/modctl.h>
#include <sys/mdeg.h>
#include <sys/note.h>
#include <sys/open.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/vtoc.h>
#include <sys/archsystm.h>
#include <sys/sysmacros.h>

#include <sys/cdio.h>
#include <sys/dktp/fdisk.h>
#include <sys/dktp/dadkio.h>
#include <sys/mhd.h>
#include <sys/scsi/generic/sense.h>
#include <sys/scsi/impl/uscsi.h>
#include <sys/scsi/impl/services.h>
#include <sys/scsi/targets/sddef.h>

#include <sys/ldoms.h>
#include <sys/ldc.h>
#include <sys/vio_common.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_util.h>
#include <sys/vdsk_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdc.h>

#define	VD_OLDVTOC_LIMIT	0x7fffffff

/*
 * function prototypes
 */

/* standard driver functions */
static int	vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred);
static int	vdc_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int	vdc_strategy(struct buf *buf);
static int	vdc_print(dev_t dev, char *str);
static int	vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk);
static int	vdc_read(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_write(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
		    cred_t *credp, int *rvalp);
static int	vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred);
static int	vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred);

static int	vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
		    void *arg, void **resultp);
static int	vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int	vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int	vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
		    int mod_flags, char *name, caddr_t valuep, int *lengthp);

/* setup */
static void	vdc_min(struct buf *bufp);
static int	vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen);
static int	vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr);
static int	vdc_start_ldc_connection(vdc_t *vdc);
static int	vdc_create_device_nodes(vdc_t *vdc);
static int	vdc_create_device_nodes_efi(vdc_t *vdc);
static int	vdc_create_device_nodes_vtoc(vdc_t *vdc);
static void	vdc_create_io_kstats(vdc_t *vdc);
static void	vdc_create_err_kstats(vdc_t *vdc);
static void	vdc_set_err_kstats(vdc_t *vdc);
static int	vdc_get_md_node(dev_info_t *dip, md_t **mdpp,
		    mde_cookie_t *vd_nodep);
static int	vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep);
static void	vdc_fini_ports(vdc_t *vdc);
static void	vdc_switch_server(vdc_t *vdcp);
static int	vdc_do_ldc_up(vdc_t *vdc);
static void	vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr);
static int	vdc_init_descriptor_ring(vdc_t *vdc);
static void	vdc_destroy_descriptor_ring(vdc_t *vdc);
static int	vdc_setup_devid(vdc_t *vdc);
static void	vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *);
static void	vdc_store_label_vtoc(vdc_t *, struct dk_geom *,
		    struct extvtoc *);
static void	vdc_store_label_unk(vdc_t *vdc);
static boolean_t vdc_is_opened(vdc_t *vdc);
static void	vdc_update_size(vdc_t *vdc, size_t, size_t, size_t);
static int	vdc_update_vio_bsize(vdc_t *vdc, uint32_t);

/* handshake with vds */
static int		vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver);
static int		vdc_ver_negotiation(vdc_t *vdcp);
static int		vdc_init_attr_negotiation(vdc_t *vdc);
static int		vdc_attr_negotiation(vdc_t *vdcp);
static int		vdc_init_dring_negotiate(vdc_t *vdc);
static int		vdc_dring_negotiation(vdc_t *vdcp);
static int		vdc_send_rdx(vdc_t *vdcp);
static int		vdc_rdx_exchange(vdc_t *vdcp);
static boolean_t	vdc_is_supported_version(vio_ver_msg_t *ver_msg);

/* processing incoming messages from vDisk server */
static void	vdc_process_msg_thread(vdc_t *vdc);
static int	vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp);

static uint_t	vdc_handle_cb(uint64_t event, caddr_t arg);
static int	vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg);
static int	vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg);
static int	vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg);
static int	vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg);
static int	vdc_send_request(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_map_to_shared_dring(vdc_t *vdcp, int idx);
static int	vdc_populate_descriptor(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr,
		    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
		    void *cb_arg, vio_desc_direction_t dir, boolean_t);

static int	vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp);
static int	vdc_drain_response(vdc_t *vdcp, vio_cb_type_t cb_type,
		    struct buf *buf);
static int	vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
static int	vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep);
static int	vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg);

/* dkio */
static int	vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode,
		    int *rvalp);
static int	vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg);
static void	vdc_create_fake_geometry(vdc_t *vdc);
static int	vdc_validate_geometry(vdc_t *vdc);
static void	vdc_validate(vdc_t *vdc);
static void	vdc_validate_task(void *arg);
static int	vdc_null_copy_func(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_extvtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_extvtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);

static void	vdc_ownership_update(vdc_t *vdc, int ownership_flags);
static int	vdc_access_set(vdc_t *vdc, uint64_t flags, int mode);
static vdc_io_t	*vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf);
static int	vdc_failfast_check_resv(vdc_t *vdc);

/*
 * Module variables
 */

/*
 * Tunable variables to control how long vdc waits before timing out on
 * various operations
 */
static int	vdc_hshake_retries = 3;

static int	vdc_timeout = 0;	/* units: seconds */
static int	vdc_ldcup_timeout = 1;	/* units: seconds */

static uint64_t vdc_hz_min_ldc_delay;
static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC;
static uint64_t vdc_hz_max_ldc_delay;
static uint64_t vdc_max_timeout_ldc = 100 * MILLISEC;

static uint64_t vdc_ldc_read_init_delay = 1 * MILLISEC;
static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC;

/* values for dumping - need to run in a tighter loop */
static uint64_t	vdc_usec_timeout_dump = 100 * MILLISEC;	/* 0.1s units: ns */
static int	vdc_dump_retries = 100;

static uint16_t	vdc_scsi_timeout = 60;	/* 60s units: seconds */

static uint64_t vdc_ownership_delay = 6 * MICROSEC;	/* 6s units: usec */

/* Count of the number of vdc instances attached */
static volatile uint32_t	vdc_instance_count = 0;

/* Tunable to log all SCSI errors */
static boolean_t vdc_scsi_log_error = B_FALSE;

/* Soft state pointer */
static void	*vdc_state;

/*
 * Controlling the verbosity of the error/debug messages
 *
 * vdc_msglevel - controls level of messages
 * vdc_matchinst - 64-bit variable where each bit corresponds
 *                 to the vdc instance to which vdc_msglevel applies.
 */
int		vdc_msglevel = 0x0;
uint64_t	vdc_matchinst = 0ull;
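/*
 * Illustration (not part of the original source): these globals are tunables
 * and are typically set from /etc/system in the guest domain, e.g.
 *
 *	set vdc:vdc_msglevel = 3
 *	set vdc:vdc_matchinst = 0x1
 *
 * which would apply message level 3 to vdc instance 0 only (bit 0 set in
 * vdc_matchinst).  The level value shown is an assumed example; the levels
 * actually emitted are those passed to DMSG() throughout this file.
 */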
/*
 * Supported vDisk protocol version pairs.
 *
 * The first array entry is the latest and preferred version.
 */
static const vio_ver_t	vdc_version[] = {{1, 1}};

static struct cb_ops vdc_cb_ops = {
	vdc_open,	/* cb_open */
	vdc_close,	/* cb_close */
	vdc_strategy,	/* cb_strategy */
	vdc_print,	/* cb_print */
	vdc_dump,	/* cb_dump */
	vdc_read,	/* cb_read */
	vdc_write,	/* cb_write */
	vdc_ioctl,	/* cb_ioctl */
	nodev,		/* cb_devmap */
	nodev,		/* cb_mmap */
	nodev,		/* cb_segmap */
	nochpoll,	/* cb_chpoll */
	vdc_prop_op,	/* cb_prop_op */
	NULL,		/* cb_str */
	D_MP | D_64BIT,	/* cb_flag */
	CB_REV,		/* cb_rev */
	vdc_aread,	/* cb_aread */
	vdc_awrite	/* cb_awrite */
};

static struct dev_ops vdc_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	vdc_getinfo,	/* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	vdc_attach,	/* devo_attach */
	vdc_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	&vdc_cb_ops,	/* devo_cb_ops */
	NULL,		/* devo_bus_ops */
	nulldev,	/* devo_power */
	ddi_quiesce_not_needed,	/* devo_quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"virtual disk client",
	&vdc_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

/* -------------------------------------------------------------------------- */

/*
 * Device Driver housekeeping and setup
 */

int
_init(void)
{
	int	status;

	if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0)
		return (status);
	if ((status = mod_install(&modlinkage)) != 0)
		ddi_soft_state_fini(&vdc_state);
	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int	status;

	if ((status = mod_remove(&modlinkage)) != 0)
		return (status);
	ddi_soft_state_fini(&vdc_state);
	return (0);
}

static int
vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
{
	_NOTE(ARGUNUSED(dip))

	int	instance = VDCUNIT((dev_t)arg);
	vdc_t	*vdc = NULL;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
			*resultp = NULL;
			return (DDI_FAILURE);
		}
		*resultp = vdc->dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*resultp = (void *)(uintptr_t)instance;
		return (DDI_SUCCESS);
	default:
		*resultp = NULL;
		return (DDI_FAILURE);
	}
}
static int
vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	kt_did_t	failfast_tid, ownership_tid;
	int		instance;
	int		rv;
	vdc_server_t	*srvr;
	vdc_t		*vdc = NULL;

	switch (cmd) {
	case DDI_DETACH:
		/* the real work happens below */
		break;
	case DDI_SUSPEND:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	ASSERT(cmd == DDI_DETACH);
	instance = ddi_get_instance(dip);
	DMSGX(1, "[%d] Entered\n", instance);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * This function is called when vdc is detached or if it has failed to
	 * attach. In the latter case, the attach may have failed before the
	 * vdisk type has been set, so we can't call vdc_is_opened(). However,
	 * as the attach has failed, we know that the vdisk is not opened and
	 * we can safely detach.
	 */
	if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) {
		DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
		return (DDI_FAILURE);
	}

	if (vdc->dkio_flush_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding DKIO flushes\n",
		    instance, vdc->dkio_flush_pending);
		return (DDI_FAILURE);
	}

	if (vdc->validate_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding validate requests\n",
		    instance, vdc->validate_pending);
		return (DDI_FAILURE);
	}

	DMSG(vdc, 0, "[%d] proceeding...\n", instance);

	/* If we took ownership, release ownership */
	mutex_enter(&vdc->ownership_lock);
	if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
	}
	mutex_exit(&vdc->ownership_lock);

	/* mark instance as detaching */
	vdc->lifecycle = VDC_LC_DETACHING;

	/*
	 * Try and disable callbacks to prevent another handshake. We have to
	 * disable callbacks for all servers.
	 */
	for (srvr = vdc->server_list; srvr != NULL; srvr = srvr->next) {
		rv = ldc_set_cb_mode(srvr->ldc_handle, LDC_CB_DISABLE);
		DMSG(vdc, 0, "callback disabled (ldc=%lu, rv=%d)\n",
		    srvr->ldc_id, rv);
	}

	if (vdc->initialized & VDC_THREAD) {
		mutex_enter(&vdc->read_lock);
		if ((vdc->read_state == VDC_READ_WAITING) ||
		    (vdc->read_state == VDC_READ_RESET)) {
			vdc->read_state = VDC_READ_RESET;
			cv_signal(&vdc->read_cv);
		}

		mutex_exit(&vdc->read_lock);

		/* wake up any thread waiting for connection to come online */
		mutex_enter(&vdc->lock);
		if (vdc->state == VDC_STATE_INIT_WAITING) {
			DMSG(vdc, 0,
			    "[%d] write reset - move to resetting state...\n",
			    instance);
			vdc->state = VDC_STATE_RESETTING;
			cv_signal(&vdc->initwait_cv);
		}
		mutex_exit(&vdc->lock);

		/* now wait until state transitions to VDC_STATE_DETACH */
		thread_join(vdc->msg_proc_thr->t_did);
		ASSERT(vdc->state == VDC_STATE_DETACH);
		DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n",
		    vdc->instance);
	}

	mutex_enter(&vdc->lock);

	if (vdc->initialized & VDC_DRING)
		vdc_destroy_descriptor_ring(vdc);

	vdc_fini_ports(vdc);

	if (vdc->failfast_thread) {
		failfast_tid = vdc->failfast_thread->t_did;
		vdc->failfast_interval = 0;
		cv_signal(&vdc->failfast_cv);
	} else {
		failfast_tid = 0;
	}

	if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
		ownership_tid = vdc->ownership_thread->t_did;
		vdc->ownership = VDC_OWNERSHIP_NONE;
		cv_signal(&vdc->ownership_cv);
	} else {
		ownership_tid = 0;
	}

	mutex_exit(&vdc->lock);

	if (failfast_tid != 0)
		thread_join(failfast_tid);

	if (ownership_tid != 0)
		thread_join(ownership_tid);

	if (vdc->initialized & VDC_MINOR)
		ddi_remove_minor_node(dip, NULL);

	if (vdc->io_stats) {
		kstat_delete(vdc->io_stats);
		vdc->io_stats = NULL;
	}

	if (vdc->err_stats) {
		kstat_delete(vdc->err_stats);
		vdc->err_stats = NULL;
	}

	if (vdc->initialized & VDC_LOCKS) {
		mutex_destroy(&vdc->lock);
		mutex_destroy(&vdc->read_lock);
		mutex_destroy(&vdc->ownership_lock);
		cv_destroy(&vdc->initwait_cv);
		cv_destroy(&vdc->dring_free_cv);
		cv_destroy(&vdc->membind_cv);
		cv_destroy(&vdc->sync_pending_cv);
		cv_destroy(&vdc->sync_blocked_cv);
		cv_destroy(&vdc->read_cv);
		cv_destroy(&vdc->running_cv);
		cv_destroy(&vdc->ownership_cv);
		cv_destroy(&vdc->failfast_cv);
		cv_destroy(&vdc->failfast_io_cv);
	}

	if (vdc->minfo)
		kmem_free(vdc->minfo, sizeof (struct dk_minfo));

	if (vdc->cinfo)
		kmem_free(vdc->cinfo, sizeof (struct dk_cinfo));

	if (vdc->vtoc)
		kmem_free(vdc->vtoc, sizeof (struct extvtoc));

	if (vdc->geom)
		kmem_free(vdc->geom, sizeof (struct dk_geom));

	if (vdc->devid) {
		ddi_devid_unregister(dip);
		ddi_devid_free(vdc->devid);
	}

	if (vdc->initialized & VDC_SOFT_STATE)
		ddi_soft_state_free(vdc_state, instance);

	DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc);

	return (DDI_SUCCESS);
}


static int
vdc_do_attach(dev_info_t *dip)
{
	int		instance;
	vdc_t		*vdc = NULL;
	int		status;
	md_t		*mdp;
	mde_cookie_t	vd_node;

	ASSERT(dip != NULL);

	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure",
		    instance);
		return (DDI_FAILURE);
	}

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * We assign (rather than OR) the value to 'initialized' here in
	 * order to zero out the variable; bits are then set in it to
	 * indicate what has been done.
	 */
	vdc->initialized = VDC_SOFT_STATE;

	vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc);
	vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc);

	vdc->dip	= dip;
	vdc->instance	= instance;
	vdc->vdisk_type	= VD_DISK_TYPE_UNK;
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->state	= VDC_STATE_INIT;
	vdc->lifecycle	= VDC_LC_ATTACHING;
	vdc->session_id = 0;
	vdc->vdisk_bsize = DEV_BSIZE;
	vdc->vio_bmask = 0;
	vdc->vio_bshift = 0;
	vdc->max_xfer_sz = maxphys / vdc->vdisk_bsize;

	/*
	 * We assume, for now, that the vDisk server will export 'read'
	 * operations to us at a minimum (this is needed because of checks
	 * in vdc for supported operations early in the handshake process).
	 * The vDisk server will return ENOTSUP if this is not the case.
	 * The value will be overwritten during the attribute exchange with
	 * the bitmask of operations exported by the server.
	 */
	vdc->operations = VD_OP_MASK_READ;

	vdc->vtoc = NULL;
	vdc->geom = NULL;
	vdc->cinfo = NULL;
	vdc->minfo = NULL;

	mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);

	vdc->threads_pending = 0;
	vdc->sync_op_pending = B_FALSE;
	vdc->sync_op_blocked = B_FALSE;
	cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);

	mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL);

	/* init blocking msg read functionality */
	mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL);
	vdc->read_state = VDC_READ_IDLE;

	vdc->initialized |= VDC_LOCKS;

	/* get device and port MD node for this disk instance */
	if (vdc_get_md_node(dip, &mdp, &vd_node) != 0) {
		cmn_err(CE_NOTE, "[%d] Could not get machine description node",
		    instance);
		return (DDI_FAILURE);
	}

	if (vdc_init_ports(vdc, mdp, vd_node) != 0) {
		cmn_err(CE_NOTE, "[%d] Error initialising ports", instance);
		return (DDI_FAILURE);
	}

	(void) md_fini_handle(mdp);

	/* Create the kstats for saving the I/O statistics used by iostat(1M) */
	vdc_create_io_kstats(vdc);
	vdc_create_err_kstats(vdc);

	/* Initialize remaining structures before starting the msg thread */
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->vtoc = kmem_zalloc(sizeof (struct extvtoc), KM_SLEEP);
	vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP);
	vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

	/* initialize the thread responsible for managing state with server */
	vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread,
	    vdc, 0, &p0, TS_RUN, minclsyspri);
	if (vdc->msg_proc_thr == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread",
		    instance);
		return (DDI_FAILURE);
	}

	vdc->initialized |= VDC_THREAD;

	atomic_inc_32(&vdc_instance_count);

	/*
	 * Check the disk label. This will send requests and do the handshake.
	 * We don't really care about the disk label now. What we really need
	 * is the handshake to be done so that we know the type of the disk
	 * (slice or full disk) and the appropriate device nodes can be
	 * created.
	 */
	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	mutex_exit(&vdc->lock);

	/*
	 * Now that we have the device info we can create the device nodes
	 */
	status = vdc_create_device_nodes(vdc);
	if (status) {
		DMSG(vdc, 0, "[%d] Failed to create device nodes",
		    instance);
		goto return_status;
	}

	/*
	 * Setup devid
	 */
	if (vdc_setup_devid(vdc)) {
		DMSG(vdc, 0, "[%d] No device id available\n", instance);
	}

	/*
	 * Fill in the fields of the error statistics kstat that were not
	 * available when creating the kstat
	 */
	vdc_set_err_kstats(vdc);

	ddi_report_dev(dip);
	vdc->lifecycle = VDC_LC_ONLINE;
	DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance);

return_status:
	DMSG(vdc, 0, "[%d] Attach completed\n", instance);
	return (status);
}

static int
vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	status;

	switch (cmd) {
	case DDI_ATTACH:
		if ((status = vdc_do_attach(dip)) != 0)
			(void) vdc_detach(dip, DDI_DETACH);
		return (status);
	case DDI_RESUME:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
}

static int
vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr)
{
	int		status = 0;
	ldc_status_t	ldc_state;
	ldc_attr_t	ldc_attr;

	ASSERT(vdc != NULL);
	ASSERT(srvr != NULL);

	ldc_attr.devclass = LDC_DEV_BLK;
	ldc_attr.instance = vdc->instance;
	ldc_attr.mode = LDC_MODE_UNRELIABLE;	/* unreliable transport */
	ldc_attr.mtu = VD_LDC_MTU;

	if ((srvr->state & VDC_LDC_INIT) == 0) {
		status = ldc_init(srvr->ldc_id, &ldc_attr,
		    &srvr->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d",
			    vdc->instance, srvr->ldc_id, status);
			return (status);
		}
		srvr->state |= VDC_LDC_INIT;
	}
	status = ldc_status(srvr->ldc_handle, &ldc_state);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]",
		    vdc->instance, status);
		goto init_exit;
	}
	srvr->ldc_state = ldc_state;

	if ((srvr->state & VDC_LDC_CB) == 0) {
		status = ldc_reg_callback(srvr->ldc_handle, vdc_handle_cb,
		    (caddr_t)srvr);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)",
			    vdc->instance, status);
			goto init_exit;
		}
		srvr->state |= VDC_LDC_CB;
	}

	/*
	 * At this stage we have initialised LDC; we will now try to open
	 * the connection.
	 */
	if (srvr->ldc_state == LDC_INIT) {
		status = ldc_open(srvr->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d",
			    vdc->instance, srvr->ldc_id, status);
			goto init_exit;
		}
		srvr->state |= VDC_LDC_OPEN;
	}

init_exit:
	if (status) {
		vdc_terminate_ldc(vdc, srvr);
	}

	return (status);
}
static int
vdc_start_ldc_connection(vdc_t *vdc)
{
	int	status = 0;

	ASSERT(vdc != NULL);
	ASSERT(MUTEX_HELD(&vdc->lock));

	status = vdc_do_ldc_up(vdc);

	DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance);

	return (status);
}

static int
vdc_stop_ldc_connection(vdc_t *vdcp)
{
	int	status;

	ASSERT(vdcp != NULL);
	ASSERT(MUTEX_HELD(&vdcp->lock));

	DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n",
	    vdcp->state);

	status = ldc_down(vdcp->curr_server->ldc_handle);
	DMSG(vdcp, 0, "ldc_down() = %d\n", status);

	vdcp->initialized &= ~VDC_HANDSHAKE;
	DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized);

	return (status);
}

static void
vdc_create_io_kstats(vdc_t *vdc)
{
	if (vdc->io_stats != NULL) {
		DMSG(vdc, 0, "[%d] I/O kstat already exists\n", vdc->instance);
		return;
	}

	vdc->io_stats = kstat_create(VDC_DRIVER_NAME, vdc->instance, NULL,
	    "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (vdc->io_stats != NULL) {
		vdc->io_stats->ks_lock = &vdc->lock;
		kstat_install(vdc->io_stats);
	} else {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: I/O statistics"
		    " will not be gathered", vdc->instance);
	}
}

static void
vdc_create_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;
	char	kstatmodule_err[KSTAT_STRLEN];
	char	kstatname[KSTAT_STRLEN];
	int	ndata = (sizeof (vd_err_stats_t) / sizeof (kstat_named_t));
	int	instance = vdc->instance;

	if (vdc->err_stats != NULL) {
		DMSG(vdc, 0, "[%d] ERR kstat already exists\n", vdc->instance);
		return;
	}

	(void) snprintf(kstatmodule_err, sizeof (kstatmodule_err),
	    "%serr", VDC_DRIVER_NAME);
	(void) snprintf(kstatname, sizeof (kstatname),
	    "%s%d,err", VDC_DRIVER_NAME, instance);

	vdc->err_stats = kstat_create(kstatmodule_err, instance, kstatname,
	    "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);

	if (vdc->err_stats == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: Error statistics"
		    " will not be gathered", instance);
		return;
	}

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	kstat_named_init(&stp->vd_softerrs, "Soft Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_transerrs, "Transport Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_protoerrs, "Protocol Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_vid, "Vendor",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_pid, "Product",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_capacity, "Size",
	    KSTAT_DATA_ULONGLONG);

	vdc->err_stats->ks_update = nulldev;

	kstat_install(vdc->err_stats);
}

static void
vdc_set_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;

	if (vdc->err_stats == NULL)
		return;

	mutex_enter(&vdc->lock);

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	ASSERT(stp != NULL);

	stp->vd_capacity.value.ui64 = vdc->vdisk_size * vdc->vdisk_bsize;
	(void) strcpy(stp->vd_vid.value.c, "SUN");
	(void) strcpy(stp->vd_pid.value.c, "VDSK");

	mutex_exit(&vdc->lock);
}
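/*
 * Illustration (not part of the original source): assuming VDC_DRIVER_NAME is
 * "vdc", the kstats created above can be read from the guest with kstat(1M):
 *
 *	# kstat -m vdc -i 0 -n disk		(I/O statistics, iostat(1M))
 *	# kstat -m vdcerr -i 0 -n vdc0,err	(error statistics)
 *
 * The module and name strings follow from the snprintf() calls in
 * vdc_create_err_kstats() above.
 */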
static int
vdc_create_device_nodes_efi(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "h");
	ddi_remove_minor_node(vdc->dip, "h,raw");

	if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

static int
vdc_create_device_nodes_vtoc(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "wd");
	ddi_remove_minor_node(vdc->dip, "wd,raw");

	if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

/*
 * Function:
 *	vdc_create_device_nodes
 *
 * Description:
 *	This function creates the block and character device nodes under
 *	/devices. It is called as part of the attach(9E) of the instance
 *	during the handshake with vds after vds has sent the attributes
 *	to vdc.
 *
 *	If the device is of type VD_DISK_TYPE_SLICE then the minor node
 *	of 2 is used in keeping with the Solaris convention that slice 2
 *	refers to a whole disk. Slices start at 'a'.
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	0		- Success
 *	EIO		- Failed to create node
 *	EINVAL		- Unknown type of disk exported
 */
static int
vdc_create_device_nodes(vdc_t *vdc)
{
	char		name[sizeof ("s,raw")];
	dev_info_t	*dip = NULL;
	int		instance, status;
	int		num_slices = 1;
	int		i;

	ASSERT(vdc != NULL);

	instance = vdc->instance;
	dip = vdc->dip;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		num_slices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		num_slices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		return (EINVAL);
	}

	/*
	 * Minor nodes are different for EFI disks: EFI disks do not have
	 * a minor node 'g' for the minor number corresponding to slice
	 * VD_EFI_WD_SLICE (slice 7); instead they have a minor node 'wd'
	 * representing the whole disk.
	 */
	for (i = 0; i < num_slices; i++) {

		if (i == VD_EFI_WD_SLICE) {
			if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
				status = vdc_create_device_nodes_efi(vdc);
			else
				status = vdc_create_device_nodes_vtoc(vdc);
			if (status != 0)
				return (status);
			continue;
		}

		(void) snprintf(name, sizeof (name), "%c", 'a' + i);
		if (ddi_create_minor_node(dip, name, S_IFBLK,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'",
			    instance, name);
			return (EIO);
		}

		/* if any device node is created we set this flag */
		vdc->initialized |= VDC_MINOR;

		(void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw");

		if (ddi_create_minor_node(dip, name, S_IFCHR,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'",
			    instance, name);
			return (EIO);
		}
	}

	return (0);
}
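/*
 * Illustration (not part of the original source): for a full disk (8 slices)
 * with a VTOC label, vdc_create_device_nodes() yields minor nodes 'a'..'h'
 * plus matching ',raw' nodes, e.g. for instance 0:
 *
 *	disk@0:a  disk@0:a,raw		slice 0 (block / character)
 *	disk@0:c  disk@0:c,raw		slice 2, whole disk by convention
 *	disk@0:h  disk@0:h,raw		slice 7
 *
 * With an EFI label, 'h' and 'h,raw' are replaced by 'wd' and 'wd,raw'
 * (the whole-disk nodes).  The 'disk@0' component is an assumed example;
 * the actual /devices path depends on the guest's machine description.
 */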
/*
 * Driver prop_op(9e) entry point function. Return the number of blocks for
 * the partition in question or forward the request to the property facilities.
 */
static int
vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
    char *name, caddr_t valuep, int *lengthp)
{
	int		instance = ddi_get_instance(dip);
	vdc_t		*vdc;
	uint64_t	nblocks;
	uint_t		blksize;

	vdc = ddi_get_soft_state(vdc_state, instance);

	if (dev == DDI_DEV_T_ANY || vdc == NULL) {
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}

	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	if (vdc->vdisk_label == VD_DISK_LABEL_UNK) {
		mutex_exit(&vdc->lock);
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}
	nblocks = vdc->slice[VDCPART(dev)].nblocks;
	blksize = vdc->vdisk_bsize;
	mutex_exit(&vdc->lock);

	return (ddi_prop_op_nblocks_blksize(dev, dip, prop_op, mod_flags,
	    name, valuep, lengthp, nblocks, blksize));
}

/*
 * Function:
 *	vdc_is_opened
 *
 * Description:
 *	This function checks if any slice of a given virtual disk is
 *	currently opened.
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	B_TRUE		- at least one slice is opened.
 *	B_FALSE		- no slice is opened.
 */
static boolean_t
vdc_is_opened(vdc_t *vdc)
{
	int i, nslices;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		nslices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		nslices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		ASSERT(0);
		nslices = 0;	/* keep nslices initialized on non-DEBUG */
	}

	/* check if there's any layered open */
	for (i = 0; i < nslices; i++) {
		if (vdc->open_lyr[i] > 0)
			return (B_TRUE);
	}

	/* check if there is any other kind of open */
	for (i = 0; i < OTYPCNT; i++) {
		if (vdc->open[i] != 0)
			return (B_TRUE);
	}

	return (B_FALSE);
}

static int
vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;
	int i;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	/* check if slice is already exclusively opened */
	if (vdc->open_excl & slicemask)
		return (EBUSY);

	/* if open exclusive, check if slice is already opened */
	if (flag & FEXCL) {
		if (vdc->open_lyr[slice] > 0)
			return (EBUSY);
		for (i = 0; i < OTYPCNT; i++) {
			if (vdc->open[i] & slicemask)
				return (EBUSY);
		}
		vdc->open_excl |= slicemask;
	}

	/* mark slice as opened */
	if (otyp == OTYP_LYR) {
		vdc->open_lyr[slice]++;
	} else {
		vdc->open[otyp] |= slicemask;
	}

	return (0);
}

static void
vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	if (otyp == OTYP_LYR) {
		ASSERT(vdc->open_lyr[slice] > 0);
		vdc->open_lyr[slice]--;
	} else {
		vdc->open[otyp] &= ~slicemask;
	}

	if (flag & FEXCL)
		vdc->open_excl &= ~slicemask;
}
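/*
 * Illustration (not part of the original source) of the open-accounting used
 * by vdc_mark_opened()/vdc_mark_closed() above.  Each slice is one bit of a
 * mask (slice 0 -> 0x01, slice 2 -> 0x04, ...), so, for example, after
 *
 *	open(slice 2, FEXCL, OTYP_BLK)
 *
 * we have vdc->open[OTYP_BLK] == 0x04 and vdc->open_excl == 0x04, and any
 * further open of slice 2 fails with EBUSY.  Layered opens (OTYP_LYR) can
 * nest, so they are counted per slice in open_lyr[] instead: two layered
 * opens of slice 0 give vdc->open_lyr[0] == 2.
 */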
static int
vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance, nodelay;
	int	slice, status = 0;
	vdc_t	*vdc;

	ASSERT(dev != NULL);
	instance = VDCUNIT(*dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n",
	    getminor(*dev), flag, otyp);

	slice = VDCPART(*dev);

	nodelay = flag & (FNDELAY | FNONBLOCK);

	if ((flag & FWRITE) && (!nodelay) &&
	    !(VD_OP_SUPPORTED(vdc->operations, VD_OP_BWRITE))) {
		return (EROFS);
	}

	mutex_enter(&vdc->lock);

	status = vdc_mark_opened(vdc, slice, flag, otyp);

	if (status != 0) {
		mutex_exit(&vdc->lock);
		return (status);
	}

	if (nodelay) {

		/* don't resubmit a validate request if there's already one */
		if (vdc->validate_pending > 0) {
			mutex_exit(&vdc->lock);
			return (0);
		}

		/* call vdc_validate() asynchronously to avoid blocking */
		if (taskq_dispatch(system_taskq, vdc_validate_task,
		    (void *)vdc, TQ_NOSLEEP) == NULL) {
			vdc_mark_closed(vdc, slice, flag, otyp);
			mutex_exit(&vdc->lock);
			return (ENXIO);
		}

		vdc->validate_pending++;
		mutex_exit(&vdc->lock);
		return (0);
	}

	mutex_exit(&vdc->lock);

	vdc_validate(vdc);

	mutex_enter(&vdc->lock);

	if (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
	    vdc->slice[slice].nblocks == 0) {
		vdc_mark_closed(vdc, slice, flag, otyp);
		status = EIO;
	}

	mutex_exit(&vdc->lock);

	return (status);
}

static int
vdc_close(dev_t dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance;
	int	slice;
	int	rv, rval;
	vdc_t	*vdc;

	instance = VDCUNIT(dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp);

	slice = VDCPART(dev);

	/*
	 * Attempt to flush the W$ (write cache) on a close operation. If this
	 * is not a supported IOCTL command or the backing device is read-only
	 * do not fail the close operation.
	 */
	rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval);

	if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) {
		DMSG(vdc, 0, "[%d] flush failed with error %d on close\n",
		    instance, rv);
		return (EIO);
	}

	mutex_enter(&vdc->lock);
	vdc_mark_closed(vdc, slice, flag, otyp);
	mutex_exit(&vdc->lock);

	return (0);
}

static int
vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	_NOTE(ARGUNUSED(credp))

	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp));
}

static int
vdc_print(dev_t dev, char *str)
{
	cmn_err(CE_NOTE, "vdc%d:  %s", VDCUNIT(dev), str);
	return (0);
}

static int
vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	int	rv;
	size_t	nbytes = nblk * DEV_BSIZE;
	int	instance = VDCUNIT(dev);
	vdc_t	*vdc = NULL;
	diskaddr_t	vio_blkno;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n",
	    instance, nbytes, blkno, (void *)addr);

	/* convert logical block to vio block */
	if ((blkno & vdc->vio_bmask) != 0) {
		DMSG(vdc, 0, "Misaligned block number (%lu)\n", blkno);
		return (EINVAL);
	}
	vio_blkno = blkno >> vdc->vio_bshift;

	rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes,
	    VDCPART(dev), vio_blkno, CB_STRATEGY, 0, VIO_write_dir);
	if (rv) {
		DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
		return (rv);
	}

	if (ddi_in_panic())
		(void) vdc_drain_response(vdc, CB_STRATEGY, NULL);

	DMSG(vdc, 0, "[%d] End\n", instance);

	return (0);
}
/* -------------------------------------------------------------------------- */

/*
 * Disk access routines
 *
 */

/*
 * vdc_strategy()
 *
 * Return Value:
 *	0:	As per strategy(9E), the strategy() function must return 0
 *		[ bioerror(9f) sets b_flags to the proper error code ]
 */
static int
vdc_strategy(struct buf *buf)
{
	diskaddr_t	vio_blkno;
	int		rv = -1;
	vdc_t		*vdc = NULL;
	int		instance = VDCUNIT(buf->b_edev);
	int		op = (buf->b_flags & B_READ) ? VD_OP_BREAD :
			    VD_OP_BWRITE;
	int		slice;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		bioerror(buf, ENXIO);
		biodone(buf);
		return (0);
	}

	DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n",
	    instance, (buf->b_flags & B_READ) ? "Read" : "Write",
	    buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr);

	bp_mapin(buf);

	if ((long)buf->b_private == VD_SLICE_NONE) {
		/* I/O using an absolute disk offset */
		slice = VD_SLICE_NONE;
	} else {
		slice = VDCPART(buf->b_edev);
	}

	/*
	 * In the buf structure, b_lblkno represents a logical block number
	 * using a block size of 512 bytes. For the VIO request, this block
	 * number has to be converted to be represented with the block size
	 * used by the VIO protocol.
	 */
	if ((buf->b_lblkno & vdc->vio_bmask) != 0) {
		bioerror(buf, EINVAL);
		biodone(buf);
		return (0);
	}
	vio_blkno = buf->b_lblkno >> vdc->vio_bshift;

	rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr,
	    buf->b_bcount, slice, vio_blkno,
	    CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir :
	    VIO_write_dir);

	/*
	 * If the request was successfully sent, the strategy call returns and
	 * the ACK handler calls the bioxxx functions when the vDisk server is
	 * done; otherwise we handle the error here.
	 */
	if (rv) {
		DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv);
		bioerror(buf, rv);
		biodone(buf);
	} else if (ddi_in_panic()) {
		rv = vdc_drain_response(vdc, CB_STRATEGY, buf);
		if (rv != 0) {
			bioerror(buf, EIO);
			biodone(buf);
		}
	}

	return (0);
}
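/*
 * Illustration (not part of the original source): the logical-to-VIO block
 * conversion used by vdc_strategy() and vdc_dump() above, assuming a vDisk
 * block size of 4096 bytes (so vio_bshift == 3 and vio_bmask == 0x7, since
 * 4096 == 512 << 3):
 *
 *	b_lblkno = 16 (512-byte blocks)  ->  vio_blkno = 16 >> 3 = 2
 *	b_lblkno = 10                    ->  10 & 0x7 != 0, rejected (EINVAL)
 *
 * Requests must therefore start on a vDisk block boundary.  With the default
 * vdisk_bsize of DEV_BSIZE (512), the conversion is an identity.
 */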
/*
 * Function:
 *	vdc_min
 *
 * Description:
 *	Routine to limit the size of a data transfer. Used in
 *	conjunction with physio(9F).
 *
 * Arguments:
 *	bufp - pointer to the indicated buf(9S) struct.
 *
 */
static void
vdc_min(struct buf *bufp)
{
	vdc_t	*vdc = NULL;
	int	instance = VDCUNIT(bufp->b_edev);

	vdc = ddi_get_soft_state(vdc_state, instance);
	VERIFY(vdc != NULL);

	if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->vdisk_bsize)) {
		bufp->b_bcount = vdc->max_xfer_sz * vdc->vdisk_bsize;
	}
}

static int
vdc_read(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio));
}

static int
vdc_write(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio));
}

static int
vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio));
}

static int
vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio));
}


/* -------------------------------------------------------------------------- */

/*
 * Handshake support
 */


/*
 * Function:
 *	vdc_init_ver_negotiation()
 *
 * Description:
 *	Kick off version negotiation by sending a VER_INFO message to the
 *	vDisk server, proposing the given protocol version.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	ver	- version pair to propose.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver)
{
	vio_ver_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status = -1;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance);

	/*
	 * set the Session ID to a unique value
	 * (the lower 32 bits of the clock tick)
	 */
	vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
	DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id);

	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_VER_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	pkt.dev_class = VDEV_DISK;
	pkt.ver_major = ver.major;
	pkt.ver_minor = ver.minor;

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n",
	    vdc->instance, status);
	if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
		    vdc->curr_server->ldc_handle, status, msglen);
		if (msglen != sizeof (vio_ver_msg_t))
			status = ENOMSG;
	}

	return (status);
}
/*
 * Function:
 *	vdc_ver_negotiation()
 *
 * Description:
 *	Perform the version negotiation exchange: send the VER_INFO message
 *	and wait for, then process, the server's response.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_ver_negotiation(vdc_t *vdcp)
{
	vio_msg_t vio_msg;
	int status;

	if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0]))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Ver negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg));
}

/*
 * Function:
 *	vdc_init_attr_negotiation()
 *
 * Description:
 *	Send an attribute negotiation (ATTR_INFO) message to the vDisk
 *	server, proposing vdc's transfer parameters.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_attr_negotiation(vdc_t *vdc)
{
	vd_attr_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] entered\n", vdc->instance);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.max_xfer_sz = vdc->max_xfer_sz;
	pkt.vdisk_block_size = vdc->vdisk_bsize;
	pkt.xfer_mode = VIO_DRING_MODE_V1_0;
	pkt.operations = 0;	/* server will set bits of valid operations */
	pkt.vdisk_type = 0;	/* server will set to valid device type */
	pkt.vdisk_media = 0;	/* server will set to valid media type */
	pkt.vdisk_size = 0;	/* server will set to valid size */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "Attr info sent (status = %d)\n", status);

	if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
		    vdc->curr_server->ldc_handle, status, msglen);
		if (msglen != sizeof (vd_attr_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_attr_negotiation()
 *
 * Description:
 *	Perform the attribute exchange: send the ATTR_INFO message and wait
 *	for, then process, the server's response.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_attr_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_attr_negotiation(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Attr negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg));
}
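/*
 * Illustration (not part of the original source): the attribute exchange
 * above is a single INFO/ACK round trip over the LDC channel:
 *
 *	vdc -> vds  INFO:  max_xfer_sz, vdisk_block_size, xfer_mode
 *	vds -> vdc  ACK:   operations bitmask, vdisk_type, vdisk_media,
 *	                   vdisk_size
 *
 * vdc_handle_attr_msg() stores the server-supplied values in the soft state,
 * replacing (among others) the VD_OP_MASK_READ assumption made in
 * vdc_do_attach().
 */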
/*
 * Function:
 *	vdc_init_dring_negotiate()
 *
 * Description:
 *	Set up the local descriptor ring and send a DRING_REG message to
 *	register it with the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_dring_negotiate(vdc_t *vdc)
{
	vio_dring_reg_msg_t	pkt;
	size_t			msglen = sizeof (pkt);
	int			status = -1;
	int			retry;
	int			nretries = 10;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	for (retry = 0; retry < nretries; retry++) {
		status = vdc_init_descriptor_ring(vdc);
		if (status != EAGAIN)
			break;
		drv_usecwait(vdc_min_timeout_ldc);
	}

	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n",
		    vdc->instance, status);
		return (status);
	}

	DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n",
	    vdc->instance, status);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_DRING_REG;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.dring_ident = 0;
	pkt.num_descriptors = vdc->dring_len;
	pkt.descriptor_size = vdc->dring_entry_size;
	pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
	pkt.ncookies = vdc->dring_cookie_count;
	pkt.cookie[0] = vdc->dring_cookie[0];	/* for now just one cookie */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)",
		    vdc->instance, status);
	}

	return (status);
}


/*
 * Function:
 *	vdc_dring_negotiation()
 *
 * Description:
 *	Perform the descriptor ring registration exchange: send the
 *	DRING_REG message and wait for, then process, the server's response.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_dring_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_dring_negotiate(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Dring negotiation response,"
		    " rv(%d)", vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_dring_reg_msg(vdcp,
	    (vio_dring_reg_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_send_rdx()
 *
 * Description:
 *	Send an RDX message to the vDisk server to indicate that the
 *	handshake is complete and vdc is ready to transfer data.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_send_rdx(vdc_t *vdcp)
{
	vio_msg_t	msg;
	size_t		msglen = sizeof (vio_msg_t);
	int		status;

	/*
	 * Send an RDX message to vds to indicate we are ready
	 * to send data
	 */
	msg.tag.vio_msgtype = VIO_TYPE_CTRL;
	msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
	msg.tag.vio_subtype_env = VIO_RDX;
	msg.tag.vio_sid = vdcp->session_id;
	status = vdc_send(vdcp, (caddr_t)&msg, &msglen);
	if (status != 0) {
		DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)",
		    vdcp->instance, status);
	}

	return (status);
}

/*
 * Function:
 *	vdc_handle_rdx()
 *
 * Description:
 *	Process an RDX response (ACK) received from the vDisk server.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *	msgp	- received msg
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp)
{
	_NOTE(ARGUNUSED(vdcp))
	_NOTE(ARGUNUSED(msgp))

	ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL);
	ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK);
	ASSERT(msgp->tag.vio_subtype_env == VIO_RDX);

	DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance);

	return (0);
}

/*
 * Function:
 *	vdc_rdx_exchange()
 *
 * Description:
 *	Perform the RDX exchange that completes the handshake: send the RDX
 *	message and wait for the server's ACK.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_rdx_exchange(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_send_rdx(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) {
		DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg));
}
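/*
 * Illustration (not part of the original source): the full handshake is made
 * up of the four exchanges implemented above, performed in this order (each
 * an INFO message from vdc that vds must ACK):
 *
 *	1. VIO_VER_INFO   vdc_ver_negotiation()    agree on protocol version
 *	2. VIO_ATTR_INFO  vdc_attr_negotiation()   exchange disk attributes
 *	3. VIO_DRING_REG  vdc_dring_negotiation()  register descriptor ring
 *	4. VIO_RDX        vdc_rdx_exchange()       ready to exchange data
 *
 * Only once RDX completes does vdc submit I/O through the descriptor ring.
 */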
*/
2012 status = ldc_read(vdc->curr_server->ldc_handle,
2013 (caddr_t)msgp, &len);
2014
2015 if (status == EAGAIN) {
2016 delay_time *= 2;
2017 if (delay_time >= vdc_ldc_read_max_delay)
2018 delay_time = vdc_ldc_read_max_delay;
2019 delay(delay_time);
2020 continue;
2021 }
2022
2023 if (status != 0) {
2024 DMSG(vdc, 0, "ldc_read returned %d\n", status);
2025 break;
2026 }
2027
2028 if (len != 0) {
2029 *nbytesp = len;
2030 break;
2031 }
2032
2033 mutex_enter(&vdc->read_lock);
2034
2035 while (vdc->read_state != VDC_READ_PENDING) {
2036
2037 /* detect if the connection has been reset */
2038 if (vdc->read_state == VDC_READ_RESET) {
2039 mutex_exit(&vdc->read_lock);
2040 return (ECONNRESET);
2041 }
2042
2043 vdc->read_state = VDC_READ_WAITING;
2044 cv_wait(&vdc->read_cv, &vdc->read_lock);
2045 }
2046
2047 vdc->read_state = VDC_READ_IDLE;
2048 mutex_exit(&vdc->read_lock);
2049
2050 delay_time = vdc_ldc_read_init_delay;
2051 }
2052
2053 return (status);
2054 }
2055
2056
2057
2058 #ifdef DEBUG
2059 void
2060 vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg)
2061 {
2062 char *ms, *ss, *ses;
2063 switch (msg->tag.vio_msgtype) {
2064 #define Q(_s) case _s : ms = #_s; break;
2065 Q(VIO_TYPE_CTRL)
2066 Q(VIO_TYPE_DATA)
2067 Q(VIO_TYPE_ERR)
2068 #undef Q
2069 default: ms = "unknown"; break;
2070 }
2071
2072 switch (msg->tag.vio_subtype) {
2073 #define Q(_s) case _s : ss = #_s; break;
2074 Q(VIO_SUBTYPE_INFO)
2075 Q(VIO_SUBTYPE_ACK)
2076 Q(VIO_SUBTYPE_NACK)
2077 #undef Q
2078 default: ss = "unknown"; break;
2079 }
2080
2081 switch (msg->tag.vio_subtype_env) {
2082 #define Q(_s) case _s : ses = #_s; break;
2083 Q(VIO_VER_INFO)
2084 Q(VIO_ATTR_INFO)
2085 Q(VIO_DRING_REG)
2086 Q(VIO_DRING_UNREG)
2087 Q(VIO_RDX)
2088 Q(VIO_PKT_DATA)
2089 Q(VIO_DESC_DATA)
2090 Q(VIO_DRING_DATA)
2091 #undef Q
2092 default: ses = "unknown"; break;
2093 }
2094
2095 DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n",
2096 msg->tag.vio_msgtype, msg->tag.vio_subtype,
2097 msg->tag.vio_subtype_env, ms, ss, ses);
2098 }
2099 #endif
2100
2101 /*
2102 * Function:
2103 * vdc_send()
2104 *
2105 * Description:
2106 * The function encapsulates the call to write a message using LDC.
2107 * If LDC indicates that the call failed due to the queue being full,
2108 * we retry the ldc_write(), otherwise we return the error returned by LDC.
2109 *
2110 * Arguments:
2111 * vdc - soft state pointer for this instance of the device driver.
2112 * pkt - address of LDC message to be sent
2113 * msglen - the size of the message being sent. When the function
2114 * returns, this contains the number of bytes written.
2115 *
2116 * Return Code:
2117 * 0 - Success.
2118 * EINVAL - pkt or msglen were NULL
2119 * ECONNRESET - The connection was not up.
2120 * EWOULDBLOCK - LDC queue is full
2121 * xxx - other error codes returned by ldc_write
2122 */
2123 static int
2124 vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen)
2125 {
2126 size_t size = 0;
2127 int status = 0;
2128 clock_t delay_ticks;
2129
2130 ASSERT(vdc != NULL);
2131 ASSERT(mutex_owned(&vdc->lock));
2132 ASSERT(msglen != NULL);
2133 ASSERT(*msglen != 0);
2134
2135 #ifdef DEBUG
2136 vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt);
2137 #endif
2138 /*
2139 * Wait indefinitely to send if channel
2140 * is busy, but bail out if we succeed or
2141 * if the channel closes or is reset.
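*
* The retry uses a geometric backoff: delay_ticks doubles after
* each EWOULDBLOCK, capped at vdc_hz_max_ldc_delay.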
*/
2143 delay_ticks = vdc_hz_min_ldc_delay;
2144 do {
2145 size = *msglen;
2146 status = ldc_write(vdc->curr_server->ldc_handle, pkt, &size);
2147 if (status == EWOULDBLOCK) {
2148 delay(delay_ticks);
2149 /* geometric backoff */
2150 delay_ticks *= 2;
2151 if (delay_ticks > vdc_hz_max_ldc_delay)
2152 delay_ticks = vdc_hz_max_ldc_delay;
2153 }
2154 } while (status == EWOULDBLOCK);
2155
2156 /* if LDC had serious issues --- reset vdc state */
2157 if (status == EIO || status == ECONNRESET) {
2158 /* wake up any readers blocked in vdc_recv() */
2159 mutex_enter(&vdc->read_lock);
2160 if ((vdc->read_state == VDC_READ_WAITING) ||
2161 (vdc->read_state == VDC_READ_RESET))
2162 cv_signal(&vdc->read_cv);
2163 vdc->read_state = VDC_READ_RESET;
2164 mutex_exit(&vdc->read_lock);
2165
2166 /* wake up any waiters in the reset thread */
2167 if (vdc->state == VDC_STATE_INIT_WAITING) {
2168 DMSG(vdc, 0, "[%d] write reset - "
2169 "vdc is resetting ..\n", vdc->instance);
2170 vdc->state = VDC_STATE_RESETTING;
2171 cv_signal(&vdc->initwait_cv);
2172 }
2173
2174 return (ECONNRESET);
2175 }
2176
2177 /* return the last size written */
2178 *msglen = size;
2179
2180 return (status);
2181 }
2182
2183 /*
2184 * Function:
2185 * vdc_get_md_node
2186 *
2187 * Description:
2188 * Get the MD and the device node for the given disk instance. The
2189 * caller is responsible for cleaning up the reference to the
2190 * returned MD (mdpp) by calling md_fini_handle().
2191 *
2192 * Arguments:
2193 * dip - dev info pointer for this instance of the device driver.
2194 * mdpp - the returned MD.
2195 * vd_nodep - the returned device node.
2196 *
2197 * Return Code:
2198 * 0 - Success.
2199 * ENOENT - Expected node or property did not exist.
2200 * ENXIO - Unexpected error communicating with MD framework
2201 */
2202 static int
2203 vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep)
2204 {
2205 int status = ENOENT;
2206 char *node_name = NULL;
2207 md_t *mdp = NULL;
2208 int num_nodes;
2209 int num_vdevs;
2210 mde_cookie_t rootnode;
2211 mde_cookie_t *listp = NULL;
2212 boolean_t found_inst = B_FALSE;
2213 int listsz;
2214 int idx;
2215 uint64_t md_inst;
2216 int obp_inst;
2217 int instance = ddi_get_instance(dip);
2218
2219 /*
2220 * Get the OBP instance number for comparison with the MD instance
2221 *
2222 * The "cfg-handle" property of a vdc node in an MD contains the MD's
2223 * notion of "instance", or unique identifier, for that node; OBP
2224 * stores the value of the "cfg-handle" MD property as the value of
2225 * the "reg" property on the node in the device tree it builds from
2226 * the MD and passes to Solaris. Thus, we look up the devinfo node's
2227 * "reg" property value to uniquely identify this device instance.
2228 * If the "reg" property cannot be found, the device tree state is
2229 * presumably so broken that there is no point in continuing.
2230 */
2231 if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) {
2232 cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG);
2233 return (ENOENT);
2234 }
2235 obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2236 OBP_REG, -1);
2237 DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst);
2238
2239 /*
2240 * We now walk the MD nodes to find the node for this vdisk.
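* The vdisk is the 'virtual-device' node whose name matches
* VDC_MD_DISK_NAME and whose 'cfg-handle' equals the OBP 'reg'
* value retrieved above.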
2241 */ 2242 if ((mdp = md_get_handle()) == NULL) { 2243 cmn_err(CE_WARN, "unable to init machine description"); 2244 return (ENXIO); 2245 } 2246 2247 num_nodes = md_node_count(mdp); 2248 ASSERT(num_nodes > 0); 2249 2250 listsz = num_nodes * sizeof (mde_cookie_t); 2251 2252 /* allocate memory for nodes */ 2253 listp = kmem_zalloc(listsz, KM_SLEEP); 2254 2255 rootnode = md_root_node(mdp); 2256 ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); 2257 2258 /* 2259 * Search for all the virtual devices, we will then check to see which 2260 * ones are disk nodes. 2261 */ 2262 num_vdevs = md_scan_dag(mdp, rootnode, 2263 md_find_name(mdp, VDC_MD_VDEV_NAME), 2264 md_find_name(mdp, "fwd"), listp); 2265 2266 if (num_vdevs <= 0) { 2267 cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME); 2268 status = ENOENT; 2269 goto done; 2270 } 2271 2272 DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs); 2273 for (idx = 0; idx < num_vdevs; idx++) { 2274 status = md_get_prop_str(mdp, listp[idx], "name", &node_name); 2275 if ((status != 0) || (node_name == NULL)) { 2276 cmn_err(CE_NOTE, "Unable to get name of node type '%s'" 2277 ": err %d", VDC_MD_VDEV_NAME, status); 2278 continue; 2279 } 2280 2281 DMSGX(1, "[%d] Found node '%s'\n", instance, node_name); 2282 if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) { 2283 status = md_get_prop_val(mdp, listp[idx], 2284 VDC_MD_CFG_HDL, &md_inst); 2285 DMSGX(1, "[%d] vdc inst in MD=%lx\n", 2286 instance, md_inst); 2287 if ((status == 0) && (md_inst == obp_inst)) { 2288 found_inst = B_TRUE; 2289 break; 2290 } 2291 } 2292 } 2293 2294 if (!found_inst) { 2295 DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME); 2296 status = ENOENT; 2297 goto done; 2298 } 2299 DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst); 2300 2301 *vd_nodep = listp[idx]; 2302 *mdpp = mdp; 2303 done: 2304 kmem_free(listp, listsz); 2305 return (status); 2306 } 2307 2308 /* 2309 * Function: 2310 * vdc_init_ports 2311 * 2312 * Description: 2313 * Initialize all the ports for this vdisk instance. 2314 * 2315 * Arguments: 2316 * vdc - soft state pointer for this instance of the device driver. 2317 * mdp - md pointer 2318 * vd_nodep - device md node. 2319 * 2320 * Return Code: 2321 * 0 - Success. 2322 * ENOENT - Expected node or property did not exist. 2323 */ 2324 static int 2325 vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep) 2326 { 2327 int status = 0; 2328 int idx; 2329 int num_nodes; 2330 int num_vports; 2331 int num_chans; 2332 int listsz; 2333 mde_cookie_t vd_port; 2334 mde_cookie_t *chanp = NULL; 2335 mde_cookie_t *portp = NULL; 2336 vdc_server_t *srvr; 2337 vdc_server_t *prev_srvr = NULL; 2338 2339 /* 2340 * We now walk the MD nodes to find the port nodes for this vdisk. 
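* Each port node supplies the server id, an optional connection
* timeout and the LDC channel id used to reach that vDisk server;
* one vdc_server_t is built per port.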
2341 */ 2342 num_nodes = md_node_count(mdp); 2343 ASSERT(num_nodes > 0); 2344 2345 listsz = num_nodes * sizeof (mde_cookie_t); 2346 2347 /* allocate memory for nodes */ 2348 portp = kmem_zalloc(listsz, KM_SLEEP); 2349 chanp = kmem_zalloc(listsz, KM_SLEEP); 2350 2351 num_vports = md_scan_dag(mdp, vd_nodep, 2352 md_find_name(mdp, VDC_MD_PORT_NAME), 2353 md_find_name(mdp, "fwd"), portp); 2354 if (num_vports == 0) { 2355 DMSGX(0, "Found no '%s' node for '%s' port\n", 2356 VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2357 status = ENOENT; 2358 goto done; 2359 } 2360 2361 DMSGX(1, "Found %d '%s' node(s) for '%s' port\n", 2362 num_vports, VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2363 2364 vdc->num_servers = 0; 2365 for (idx = 0; idx < num_vports; idx++) { 2366 2367 /* initialize this port */ 2368 vd_port = portp[idx]; 2369 srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP); 2370 srvr->vdcp = vdc; 2371 2372 /* get port id */ 2373 if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) { 2374 cmn_err(CE_NOTE, "vDisk port '%s' property not found", 2375 VDC_MD_ID); 2376 kmem_free(srvr, sizeof (vdc_server_t)); 2377 continue; 2378 } 2379 2380 /* set the connection timeout */ 2381 if (md_get_prop_val(mdp, vd_port, VDC_MD_TIMEOUT, 2382 &srvr->ctimeout) != 0) { 2383 srvr->ctimeout = 0; 2384 } 2385 2386 /* get the ldc id */ 2387 num_chans = md_scan_dag(mdp, vd_port, 2388 md_find_name(mdp, VDC_MD_CHAN_NAME), 2389 md_find_name(mdp, "fwd"), chanp); 2390 2391 /* expecting at least one channel */ 2392 if (num_chans <= 0) { 2393 cmn_err(CE_NOTE, "No '%s' node for '%s' port", 2394 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME); 2395 kmem_free(srvr, sizeof (vdc_server_t)); 2396 continue; 2397 } else if (num_chans != 1) { 2398 DMSGX(0, "Expected 1 '%s' node for '%s' port, " 2399 "found %d\n", VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, 2400 num_chans); 2401 } 2402 2403 /* 2404 * We use the first channel found (index 0), irrespective of how 2405 * many are there in total. 2406 */ 2407 if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, 2408 &srvr->ldc_id) != 0) { 2409 cmn_err(CE_NOTE, "Channel '%s' property not found", 2410 VDC_MD_ID); 2411 kmem_free(srvr, sizeof (vdc_server_t)); 2412 continue; 2413 } 2414 2415 /* 2416 * now initialise LDC channel which will be used to 2417 * communicate with this server 2418 */ 2419 if (vdc_do_ldc_init(vdc, srvr) != 0) { 2420 kmem_free(srvr, sizeof (vdc_server_t)); 2421 continue; 2422 } 2423 2424 /* add server to list */ 2425 if (prev_srvr) 2426 prev_srvr->next = srvr; 2427 else 2428 vdc->server_list = srvr; 2429 2430 prev_srvr = srvr; 2431 2432 /* inc numbers of servers */ 2433 vdc->num_servers++; 2434 } 2435 2436 /* 2437 * Adjust the max number of handshake retries to match 2438 * the number of vdisk servers. 2439 */ 2440 if (vdc_hshake_retries < vdc->num_servers) 2441 vdc_hshake_retries = vdc->num_servers; 2442 2443 /* pick first server as current server */ 2444 if (vdc->server_list != NULL) { 2445 vdc->curr_server = vdc->server_list; 2446 status = 0; 2447 } else { 2448 status = ENOENT; 2449 } 2450 2451 done: 2452 kmem_free(chanp, listsz); 2453 kmem_free(portp, listsz); 2454 return (status); 2455 } 2456 2457 2458 /* 2459 * Function: 2460 * vdc_do_ldc_up 2461 * 2462 * Description: 2463 * Bring the channel for the current server up. 2464 * 2465 * Arguments: 2466 * vdc - soft state pointer for this instance of the device driver. 2467 * 2468 * Return Code: 2469 * 0 - Success. 
2470 * EINVAL - Driver is detaching / LDC error 2471 * ECONNREFUSED - Other end is not listening 2472 */ 2473 static int 2474 vdc_do_ldc_up(vdc_t *vdc) 2475 { 2476 int status; 2477 ldc_status_t ldc_state; 2478 2479 ASSERT(MUTEX_HELD(&vdc->lock)); 2480 2481 DMSG(vdc, 0, "[%d] Bringing up channel %lx\n", 2482 vdc->instance, vdc->curr_server->ldc_id); 2483 2484 if (vdc->lifecycle == VDC_LC_DETACHING) 2485 return (EINVAL); 2486 2487 if ((status = ldc_up(vdc->curr_server->ldc_handle)) != 0) { 2488 switch (status) { 2489 case ECONNREFUSED: /* listener not ready at other end */ 2490 DMSG(vdc, 0, "[%d] ldc_up(%lx,...) return %d\n", 2491 vdc->instance, vdc->curr_server->ldc_id, status); 2492 status = 0; 2493 break; 2494 default: 2495 DMSG(vdc, 0, "[%d] Failed to bring up LDC: " 2496 "channel=%ld, err=%d", vdc->instance, 2497 vdc->curr_server->ldc_id, status); 2498 break; 2499 } 2500 } 2501 2502 if (ldc_status(vdc->curr_server->ldc_handle, &ldc_state) == 0) { 2503 vdc->curr_server->ldc_state = ldc_state; 2504 if (ldc_state == LDC_UP) { 2505 DMSG(vdc, 0, "[%d] LDC channel already up\n", 2506 vdc->instance); 2507 vdc->seq_num = 1; 2508 vdc->seq_num_reply = 0; 2509 } 2510 } 2511 2512 return (status); 2513 } 2514 2515 /* 2516 * Function: 2517 * vdc_terminate_ldc() 2518 * 2519 * Description: 2520 * 2521 * Arguments: 2522 * vdc - soft state pointer for this instance of the device driver. 2523 * srvr - vdc per-server info structure 2524 * 2525 * Return Code: 2526 * None 2527 */ 2528 static void 2529 vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr) 2530 { 2531 int instance = ddi_get_instance(vdc->dip); 2532 2533 if (srvr->state & VDC_LDC_OPEN) { 2534 DMSG(vdc, 0, "[%d] ldc_close()\n", instance); 2535 (void) ldc_close(srvr->ldc_handle); 2536 } 2537 if (srvr->state & VDC_LDC_CB) { 2538 DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance); 2539 (void) ldc_unreg_callback(srvr->ldc_handle); 2540 } 2541 if (srvr->state & VDC_LDC_INIT) { 2542 DMSG(vdc, 0, "[%d] ldc_fini()\n", instance); 2543 (void) ldc_fini(srvr->ldc_handle); 2544 srvr->ldc_handle = NULL; 2545 } 2546 2547 srvr->state &= ~(VDC_LDC_INIT | VDC_LDC_CB | VDC_LDC_OPEN); 2548 } 2549 2550 /* 2551 * Function: 2552 * vdc_fini_ports() 2553 * 2554 * Description: 2555 * Finalize all ports by closing the channel associated with each 2556 * port and also freeing the server structure. 2557 * 2558 * Arguments: 2559 * vdc - soft state pointer for this instance of the device driver. 2560 * 2561 * Return Code: 2562 * None 2563 */ 2564 static void 2565 vdc_fini_ports(vdc_t *vdc) 2566 { 2567 int instance = ddi_get_instance(vdc->dip); 2568 vdc_server_t *srvr, *prev_srvr; 2569 2570 ASSERT(vdc != NULL); 2571 ASSERT(mutex_owned(&vdc->lock)); 2572 2573 DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized); 2574 2575 srvr = vdc->server_list; 2576 2577 while (srvr) { 2578 2579 vdc_terminate_ldc(vdc, srvr); 2580 2581 /* next server */ 2582 prev_srvr = srvr; 2583 srvr = srvr->next; 2584 2585 /* free server */ 2586 kmem_free(prev_srvr, sizeof (vdc_server_t)); 2587 } 2588 2589 vdc->server_list = NULL; 2590 } 2591 2592 /* -------------------------------------------------------------------------- */ 2593 2594 /* 2595 * Descriptor Ring helper routines 2596 */ 2597 2598 /* 2599 * Function: 2600 * vdc_init_descriptor_ring() 2601 * 2602 * Description: 2603 * 2604 * Arguments: 2605 * vdc - soft state pointer for this instance of the device driver. 
2606 * 2607 * Return Code: 2608 * 0 - Success 2609 */ 2610 static int 2611 vdc_init_descriptor_ring(vdc_t *vdc) 2612 { 2613 vd_dring_entry_t *dep = NULL; /* DRing Entry pointer */ 2614 int status = 0; 2615 int i; 2616 2617 DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized); 2618 2619 ASSERT(vdc != NULL); 2620 ASSERT(mutex_owned(&vdc->lock)); 2621 2622 /* ensure we have enough room to store max sized block */ 2623 ASSERT(maxphys <= VD_MAX_BLOCK_SIZE); 2624 2625 if ((vdc->initialized & VDC_DRING_INIT) == 0) { 2626 DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance); 2627 /* 2628 * Calculate the maximum block size we can transmit using one 2629 * Descriptor Ring entry from the attributes returned by the 2630 * vDisk server. This is subject to a minimum of 'maxphys' 2631 * as we do not have the capability to split requests over 2632 * multiple DRing entries. 2633 */ 2634 if ((vdc->max_xfer_sz * vdc->vdisk_bsize) < maxphys) { 2635 DMSG(vdc, 0, "[%d] using minimum DRing size\n", 2636 vdc->instance); 2637 vdc->dring_max_cookies = maxphys / PAGESIZE; 2638 } else { 2639 vdc->dring_max_cookies = 2640 (vdc->max_xfer_sz * vdc->vdisk_bsize) / PAGESIZE; 2641 } 2642 vdc->dring_entry_size = (sizeof (vd_dring_entry_t) + 2643 (sizeof (ldc_mem_cookie_t) * 2644 (vdc->dring_max_cookies - 1))); 2645 vdc->dring_len = VD_DRING_LEN; 2646 2647 status = ldc_mem_dring_create(vdc->dring_len, 2648 vdc->dring_entry_size, &vdc->dring_hdl); 2649 if ((vdc->dring_hdl == NULL) || (status != 0)) { 2650 DMSG(vdc, 0, "[%d] Descriptor ring creation failed", 2651 vdc->instance); 2652 return (status); 2653 } 2654 vdc->initialized |= VDC_DRING_INIT; 2655 } 2656 2657 if ((vdc->initialized & VDC_DRING_BOUND) == 0) { 2658 DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance); 2659 vdc->dring_cookie = 2660 kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); 2661 2662 status = ldc_mem_dring_bind(vdc->curr_server->ldc_handle, 2663 vdc->dring_hdl, 2664 LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW, 2665 &vdc->dring_cookie[0], 2666 &vdc->dring_cookie_count); 2667 if (status != 0) { 2668 DMSG(vdc, 0, "[%d] Failed to bind descriptor ring " 2669 "(%lx) to channel (%lx) status=%d\n", 2670 vdc->instance, vdc->dring_hdl, 2671 vdc->curr_server->ldc_handle, status); 2672 return (status); 2673 } 2674 ASSERT(vdc->dring_cookie_count == 1); 2675 vdc->initialized |= VDC_DRING_BOUND; 2676 } 2677 2678 status = ldc_mem_dring_info(vdc->dring_hdl, &vdc->dring_mem_info); 2679 if (status != 0) { 2680 DMSG(vdc, 0, 2681 "[%d] Failed to get info for descriptor ring (%lx)\n", 2682 vdc->instance, vdc->dring_hdl); 2683 return (status); 2684 } 2685 2686 if ((vdc->initialized & VDC_DRING_LOCAL) == 0) { 2687 DMSG(vdc, 0, "[%d] local dring\n", vdc->instance); 2688 2689 /* Allocate the local copy of this dring */ 2690 vdc->local_dring = 2691 kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t), 2692 KM_SLEEP); 2693 vdc->initialized |= VDC_DRING_LOCAL; 2694 } 2695 2696 /* 2697 * Mark all DRing entries as free and initialize the private 2698 * descriptor's memory handles. If any entry is initialized, 2699 * we need to free it later so we set the bit in 'initialized' 2700 * at the start. 
2701 */ 2702 vdc->initialized |= VDC_DRING_ENTRY; 2703 for (i = 0; i < vdc->dring_len; i++) { 2704 dep = VDC_GET_DRING_ENTRY_PTR(vdc, i); 2705 dep->hdr.dstate = VIO_DESC_FREE; 2706 2707 status = ldc_mem_alloc_handle(vdc->curr_server->ldc_handle, 2708 &vdc->local_dring[i].desc_mhdl); 2709 if (status != 0) { 2710 DMSG(vdc, 0, "![%d] Failed to alloc mem handle for" 2711 " descriptor %d", vdc->instance, i); 2712 return (status); 2713 } 2714 vdc->local_dring[i].is_free = B_TRUE; 2715 vdc->local_dring[i].dep = dep; 2716 } 2717 2718 /* Initialize the starting index */ 2719 vdc->dring_curr_idx = 0; 2720 2721 return (status); 2722 } 2723 2724 /* 2725 * Function: 2726 * vdc_destroy_descriptor_ring() 2727 * 2728 * Description: 2729 * 2730 * Arguments: 2731 * vdc - soft state pointer for this instance of the device driver. 2732 * 2733 * Return Code: 2734 * None 2735 */ 2736 static void 2737 vdc_destroy_descriptor_ring(vdc_t *vdc) 2738 { 2739 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 2740 ldc_mem_handle_t mhdl = NULL; 2741 ldc_mem_info_t minfo; 2742 int status = -1; 2743 int i; /* loop */ 2744 2745 ASSERT(vdc != NULL); 2746 ASSERT(mutex_owned(&vdc->lock)); 2747 2748 DMSG(vdc, 0, "[%d] Entered\n", vdc->instance); 2749 2750 if (vdc->initialized & VDC_DRING_ENTRY) { 2751 DMSG(vdc, 0, 2752 "[%d] Removing Local DRing entries\n", vdc->instance); 2753 for (i = 0; i < vdc->dring_len; i++) { 2754 ldep = &vdc->local_dring[i]; 2755 mhdl = ldep->desc_mhdl; 2756 2757 if (mhdl == NULL) 2758 continue; 2759 2760 if ((status = ldc_mem_info(mhdl, &minfo)) != 0) { 2761 DMSG(vdc, 0, 2762 "ldc_mem_info returned an error: %d\n", 2763 status); 2764 2765 /* 2766 * This must mean that the mem handle 2767 * is not valid. Clear it out so that 2768 * no one tries to use it. 2769 */ 2770 ldep->desc_mhdl = NULL; 2771 continue; 2772 } 2773 2774 if (minfo.status == LDC_BOUND) { 2775 (void) ldc_mem_unbind_handle(mhdl); 2776 } 2777 2778 (void) ldc_mem_free_handle(mhdl); 2779 2780 ldep->desc_mhdl = NULL; 2781 } 2782 vdc->initialized &= ~VDC_DRING_ENTRY; 2783 } 2784 2785 if (vdc->initialized & VDC_DRING_LOCAL) { 2786 DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance); 2787 kmem_free(vdc->local_dring, 2788 vdc->dring_len * sizeof (vdc_local_desc_t)); 2789 vdc->initialized &= ~VDC_DRING_LOCAL; 2790 } 2791 2792 if (vdc->initialized & VDC_DRING_BOUND) { 2793 DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance); 2794 status = ldc_mem_dring_unbind(vdc->dring_hdl); 2795 if (status == 0) { 2796 vdc->initialized &= ~VDC_DRING_BOUND; 2797 } else { 2798 DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx", 2799 vdc->instance, status, vdc->dring_hdl); 2800 } 2801 kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t)); 2802 } 2803 2804 if (vdc->initialized & VDC_DRING_INIT) { 2805 DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance); 2806 status = ldc_mem_dring_destroy(vdc->dring_hdl); 2807 if (status == 0) { 2808 vdc->dring_hdl = NULL; 2809 bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t)); 2810 vdc->initialized &= ~VDC_DRING_INIT; 2811 } else { 2812 DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)", 2813 vdc->instance, status, vdc->dring_hdl); 2814 } 2815 } 2816 } 2817 2818 /* 2819 * Function: 2820 * vdc_map_to_shared_dring() 2821 * 2822 * Description: 2823 * Copy contents of the local descriptor to the shared 2824 * memory descriptor. 2825 * 2826 * Arguments: 2827 * vdcp - soft state pointer for this instance of the device driver. 
idx - descriptor ring index
2829 *
2830 * Return Code:
2831 * 0 - Success; otherwise an error code from vdc_populate_mem_hdl()
2832 */
2833 static int
2834 vdc_map_to_shared_dring(vdc_t *vdcp, int idx)
2835 {
2836 vdc_local_desc_t *ldep;
2837 vd_dring_entry_t *dep;
2838 int rv;
2839
2840 ldep = &(vdcp->local_dring[idx]);
2841
2842 /* for now leave in the old pop_mem_hdl stuff */
2843 if (ldep->nbytes > 0) {
2844 rv = vdc_populate_mem_hdl(vdcp, ldep);
2845 if (rv) {
2846 DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n",
2847 vdcp->instance);
2848 return (rv);
2849 }
2850 }
2851
2852 /*
2853 * fill in the data details into the DRing
2854 */
2855 dep = ldep->dep;
2856 ASSERT(dep != NULL);
2857
2858 dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp);
2859 dep->payload.operation = ldep->operation;
2860 dep->payload.addr = ldep->offset;
2861 dep->payload.nbytes = ldep->nbytes;
2862 dep->payload.status = (uint32_t)-1; /* vds will set valid value */
2863 dep->payload.slice = ldep->slice;
2864 dep->hdr.dstate = VIO_DESC_READY;
2865 dep->hdr.ack = 1; /* request an ACK for every message */
2866
2867 return (0);
2868 }
2869
2870 /*
2871 * Function:
2872 * vdc_send_request
2873 *
2874 * Description:
2875 * This routine writes the data to be transmitted to vds into the
2876 * descriptor, notifies vds that the ring has been updated and
2877 * then waits for the request to be processed.
2878 *
2879 * Arguments:
2880 * vdcp - the soft state pointer
2881 * operation - operation we want vds to perform (VD_OP_XXX)
2882 * addr - address of data buf to be read/written.
2883 * nbytes - number of bytes to read/write
2884 * slice - the disk slice this request is for
2885 * offset - relative disk offset
2886 * cb_type - type of call - STRATEGY or SYNC
2887 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type)
2888 * . mode for ioctl(9e)
2889 * . LP64 diskaddr_t (block I/O)
2890 * dir - direction of operation (READ/WRITE/BOTH)
2891 *
2892 * Return Codes:
2893 * 0
2894 * ENXIO
2895 */
2896 static int
2897 vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr,
2898 size_t nbytes, int slice, diskaddr_t offset, int cb_type,
2899 void *cb_arg, vio_desc_direction_t dir)
2900 {
2901 int rv = 0;
2902
2903 ASSERT(vdcp != NULL);
2904 ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR);
2905
2906 mutex_enter(&vdcp->lock);
2907
2908 /*
2909 * If this is a block read/write operation we update the I/O statistics
2910 * to indicate that the request is being put on the waitq to be
2911 * serviced.
2912 *
2913 * We do it here (a common routine for both synchronous and strategy
2914 * calls) for performance reasons - we are already holding vdc->lock
2915 * so there is no extra locking overhead. We would have to explicitly
2916 * grab the 'lock' mutex to update the stats if we were to do this
2917 * higher up the stack in vdc_strategy() et. al.
2918 */
2919 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) {
2920 DTRACE_IO1(start, buf_t *, cb_arg);
2921 VD_KSTAT_WAITQ_ENTER(vdcp);
2922 }
2923
2924 do {
2925 while (vdcp->state != VDC_STATE_RUNNING) {
2926
2927 /* return error if detaching */
2928 if (vdcp->state == VDC_STATE_DETACH) {
2929 rv = ENXIO;
2930 goto done;
2931 }
2932
2933 /* fail request if connection timeout is reached */
2934 if (vdcp->ctimeout_reached) {
2935 rv = EIO;
2936 goto done;
2937 }
2938
2939 /*
2940 * If we are panicking and the disk is not ready then
2941 * we can't send any request because we can't complete
2942 * the handshake now.
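* Failing with EIO instead of blocking on running_cv avoids a
* deadlock: in panic context no other thread can run to finish
* the handshake and signal us.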
2943 */ 2944 if (ddi_in_panic()) { 2945 rv = EIO; 2946 goto done; 2947 } 2948 2949 cv_wait(&vdcp->running_cv, &vdcp->lock); 2950 } 2951 2952 } while (vdc_populate_descriptor(vdcp, operation, addr, 2953 nbytes, slice, offset, cb_type, cb_arg, dir)); 2954 2955 done: 2956 /* 2957 * If this is a block read/write we update the I/O statistics kstat 2958 * to indicate that this request has been placed on the queue for 2959 * processing (i.e sent to the vDisk server) - iostat(1M) will 2960 * report the time waiting for the vDisk server under the %b column 2961 * In the case of an error we simply take it off the wait queue. 2962 */ 2963 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2964 if (rv == 0) { 2965 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 2966 DTRACE_PROBE1(send, buf_t *, cb_arg); 2967 } else { 2968 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 2969 VD_KSTAT_WAITQ_EXIT(vdcp); 2970 DTRACE_IO1(done, buf_t *, cb_arg); 2971 } 2972 } 2973 2974 mutex_exit(&vdcp->lock); 2975 2976 return (rv); 2977 } 2978 2979 2980 /* 2981 * Function: 2982 * vdc_populate_descriptor 2983 * 2984 * Description: 2985 * This routine writes the data to be transmitted to vds into the 2986 * descriptor, notifies vds that the ring has been updated and 2987 * then waits for the request to be processed. 2988 * 2989 * Arguments: 2990 * vdcp - the soft state pointer 2991 * operation - operation we want vds to perform (VD_OP_XXX) 2992 * addr - address of data buf to be read/written. 2993 * nbytes - number of bytes to read/write 2994 * slice - the disk slice this request is for 2995 * offset - relative disk offset 2996 * cb_type - type of call - STRATEGY or SYNC 2997 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2998 * . mode for ioctl(9e) 2999 * . LP64 diskaddr_t (block I/O) 3000 * dir - direction of operation (READ/WRITE/BOTH) 3001 * 3002 * Return Codes: 3003 * 0 3004 * EAGAIN 3005 * ECONNRESET 3006 * ENXIO 3007 */ 3008 static int 3009 vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, 3010 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 3011 void *cb_arg, vio_desc_direction_t dir) 3012 { 3013 vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */ 3014 int idx; /* Index of DRing entry used */ 3015 int next_idx; 3016 vio_dring_msg_t dmsg; 3017 size_t msglen; 3018 int rv; 3019 3020 ASSERT(MUTEX_HELD(&vdcp->lock)); 3021 vdcp->threads_pending++; 3022 loop: 3023 DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx); 3024 3025 /* Get next available D-Ring entry */ 3026 idx = vdcp->dring_curr_idx; 3027 local_dep = &(vdcp->local_dring[idx]); 3028 3029 if (!local_dep->is_free) { 3030 DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n", 3031 vdcp->instance); 3032 cv_wait(&vdcp->dring_free_cv, &vdcp->lock); 3033 if (vdcp->state == VDC_STATE_RUNNING || 3034 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3035 goto loop; 3036 } 3037 vdcp->threads_pending--; 3038 return (ECONNRESET); 3039 } 3040 3041 next_idx = idx + 1; 3042 if (next_idx >= vdcp->dring_len) 3043 next_idx = 0; 3044 vdcp->dring_curr_idx = next_idx; 3045 3046 ASSERT(local_dep->is_free); 3047 3048 local_dep->operation = operation; 3049 local_dep->addr = addr; 3050 local_dep->nbytes = nbytes; 3051 local_dep->slice = slice; 3052 local_dep->offset = offset; 3053 local_dep->cb_type = cb_type; 3054 local_dep->cb_arg = cb_arg; 3055 local_dep->dir = dir; 3056 3057 local_dep->is_free = B_FALSE; 3058 3059 rv = vdc_map_to_shared_dring(vdcp, idx); 3060 if (rv) { 3061 DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n", 
3062 vdcp->instance); 3063 /* free the descriptor */ 3064 local_dep->is_free = B_TRUE; 3065 vdcp->dring_curr_idx = idx; 3066 cv_wait(&vdcp->membind_cv, &vdcp->lock); 3067 if (vdcp->state == VDC_STATE_RUNNING || 3068 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3069 goto loop; 3070 } 3071 vdcp->threads_pending--; 3072 return (ECONNRESET); 3073 } 3074 3075 /* 3076 * Send a msg with the DRing details to vds 3077 */ 3078 VIO_INIT_DRING_DATA_TAG(dmsg); 3079 VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp); 3080 dmsg.dring_ident = vdcp->dring_ident; 3081 dmsg.start_idx = idx; 3082 dmsg.end_idx = idx; 3083 vdcp->seq_num++; 3084 3085 DTRACE_PROBE2(populate, int, vdcp->instance, 3086 vdc_local_desc_t *, local_dep); 3087 DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n", 3088 vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num); 3089 3090 /* 3091 * note we're still holding the lock here to 3092 * make sure the message goes out in order !!!... 3093 */ 3094 msglen = sizeof (dmsg); 3095 rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen); 3096 switch (rv) { 3097 case ECONNRESET: 3098 /* 3099 * vdc_send initiates the reset on failure. 3100 * Since the transaction has already been put 3101 * on the local dring, it will automatically get 3102 * retried when the channel is reset. Given that, 3103 * it is ok to just return success even though the 3104 * send failed. 3105 */ 3106 rv = 0; 3107 break; 3108 3109 case 0: /* EOK */ 3110 DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv); 3111 break; 3112 3113 default: 3114 goto cleanup_and_exit; 3115 } 3116 3117 vdcp->threads_pending--; 3118 return (rv); 3119 3120 cleanup_and_exit: 3121 DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv); 3122 return (ENXIO); 3123 } 3124 3125 /* 3126 * Function: 3127 * vdc_do_sync_op 3128 * 3129 * Description: 3130 * Wrapper around vdc_populate_descriptor that blocks until the 3131 * response to the message is available. 3132 * 3133 * Arguments: 3134 * vdcp - the soft state pointer 3135 * operation - operation we want vds to perform (VD_OP_XXX) 3136 * addr - address of data buf to be read/written. 3137 * nbytes - number of bytes to read/write 3138 * slice - the disk slice this request is for 3139 * offset - relative disk offset 3140 * cb_type - type of call - STRATEGY or SYNC 3141 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 3142 * . mode for ioctl(9e) 3143 * . LP64 diskaddr_t (block I/O) 3144 * dir - direction of operation (READ/WRITE/BOTH) 3145 * rconflict - check for reservation conflict in case of failure 3146 * 3147 * rconflict should be set to B_TRUE by most callers. Callers invoking the 3148 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the 3149 * result of a successful operation with vd_scsi_status(). 3150 * 3151 * Return Codes: 3152 * 0 3153 * EAGAIN 3154 * EFAULT 3155 * ENXIO 3156 * EIO 3157 */ 3158 static int 3159 vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, 3160 int slice, diskaddr_t offset, int cb_type, void *cb_arg, 3161 vio_desc_direction_t dir, boolean_t rconflict) 3162 { 3163 int status; 3164 vdc_io_t *vio; 3165 boolean_t check_resv_conflict = B_FALSE; 3166 3167 ASSERT(cb_type == CB_SYNC); 3168 3169 /* 3170 * Grab the lock, if blocked wait until the server 3171 * response causes us to wake up again. 
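* Sync operations are serialized: sync_op_blocked allows only one
* outstanding sync request at a time and later callers wait on
* sync_blocked_cv.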
*/
3173 mutex_enter(&vdcp->lock);
3174 vdcp->sync_op_cnt++;
3175 while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH) {
3176 if (ddi_in_panic()) {
3177 /* don't block if we are panicking */
3178 vdcp->sync_op_cnt--;
3179 mutex_exit(&vdcp->lock);
3180 return (EIO);
3181 } else {
3182 cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock);
3183 }
3184 }
3185
3186 if (vdcp->state == VDC_STATE_DETACH) {
3187 cv_broadcast(&vdcp->sync_blocked_cv);
3188 vdcp->sync_op_cnt--;
3189 mutex_exit(&vdcp->lock);
3190 return (ENXIO);
3191 }
3192
3193 /* now block any other thread entering after us */
3194 vdcp->sync_op_blocked = B_TRUE;
3195 vdcp->sync_op_pending = B_TRUE;
3196 mutex_exit(&vdcp->lock);
3197
3198 status = vdc_send_request(vdcp, operation, addr,
3199 nbytes, slice, offset, cb_type, cb_arg, dir);
3200
3201 mutex_enter(&vdcp->lock);
3202
3203 if (status != 0) {
3204 vdcp->sync_op_pending = B_FALSE;
3205 } else if (ddi_in_panic()) {
3206 if (vdc_drain_response(vdcp, CB_SYNC, NULL) == 0) {
3207 status = vdcp->sync_op_status;
3208 } else {
3209 vdcp->sync_op_pending = B_FALSE;
3210 status = EIO;
3211 }
3212 } else {
3213 /*
3214 * Block until our transaction completes; anyone else
3215 * waiting then gets to go next.
3216 */
3217 while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH)
3218 cv_wait(&vdcp->sync_pending_cv, &vdcp->lock);
3219
3220 DMSG(vdcp, 2, ": operation returned %d\n",
3221 vdcp->sync_op_status);
3222 if (vdcp->state == VDC_STATE_DETACH) {
3223 vdcp->sync_op_pending = B_FALSE;
3224 status = ENXIO;
3225 } else {
3226 status = vdcp->sync_op_status;
3227 if (status != 0 && vdcp->failfast_interval != 0) {
3228 /*
3229 * Operation has failed and failfast is enabled.
3230 * We need to check if the failure is due to a
3231 * reservation conflict if this was requested.
3232 */
3233 check_resv_conflict = rconflict;
3234 }
3235
3236 }
3237 }
3238
3239 vdcp->sync_op_status = 0;
3240 vdcp->sync_op_blocked = B_FALSE;
3241 vdcp->sync_op_cnt--;
3242
3243 /* signal the next waiting thread */
3244 cv_signal(&vdcp->sync_blocked_cv);
3245
3246 /*
3247 * We have to check for reservation conflict after unblocking sync
3248 * operations because some sync operations will be used to do this
3249 * check.
3250 */
3251 if (check_resv_conflict) {
3252 vio = vdc_failfast_io_queue(vdcp, NULL);
3253 while (vio->vio_qtime != 0)
3254 cv_wait(&vdcp->failfast_io_cv, &vdcp->lock);
3255 kmem_free(vio, sizeof (vdc_io_t));
3256 }
3257
3258 mutex_exit(&vdcp->lock);
3259
3260 return (status);
3261 }
3262
3263
3264 /*
3265 * Function:
3266 * vdc_drain_response()
3267 *
3268 * Description:
3269 * When a guest is panicking, the completion of requests needs to be
3270 * handled differently because interrupts are disabled and vdc
3271 * will not get messages. We have to poll for the messages instead.
3272 *
3273 * Note: since we are panicking we don't implement the io:::done
3274 * DTrace probe or update the I/O statistics kstats.
3275 *
3276 * Arguments:
3277 * vdc - soft state pointer for this instance of the device driver.
3278 * cb_type - the type of request we want to drain. If type is CB_SYNC
3279 * then we drain all responses until we find a CB_SYNC request.
3280 * If the type is CB_STRATEGY then the behavior depends on the
3281 * value of the buf argument.
3282 * buf - if the cb_type argument is CB_SYNC then the buf argument
3283 * must be NULL.
If the cb_type argument is CB_STRATEGY and 3284 * if buf is NULL then we drain all responses, otherwise we 3285 * poll until we receive a ACK/NACK for the specific I/O 3286 * described by buf. 3287 * 3288 * Return Code: 3289 * 0 - Success. If we were expecting a response to a particular 3290 * CB_SYNC or CB_STRATEGY request then this means that a 3291 * response has been received. 3292 */ 3293 static int 3294 vdc_drain_response(vdc_t *vdc, vio_cb_type_t cb_type, struct buf *buf) 3295 { 3296 int rv, idx, retries; 3297 size_t msglen; 3298 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3299 vio_dring_msg_t dmsg; 3300 struct buf *mbuf; 3301 boolean_t ack; 3302 3303 ASSERT(cb_type == CB_STRATEGY || cb_type == CB_SYNC); 3304 3305 mutex_enter(&vdc->lock); 3306 3307 retries = 0; 3308 for (;;) { 3309 msglen = sizeof (dmsg); 3310 rv = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)&dmsg, 3311 &msglen); 3312 if (rv) { 3313 rv = EINVAL; 3314 break; 3315 } 3316 3317 /* 3318 * if there are no packets wait and check again 3319 */ 3320 if ((rv == 0) && (msglen == 0)) { 3321 if (retries++ > vdc_dump_retries) { 3322 rv = EAGAIN; 3323 break; 3324 } 3325 3326 drv_usecwait(vdc_usec_timeout_dump); 3327 continue; 3328 } 3329 3330 /* 3331 * Ignore all messages that are not ACKs/NACKs to 3332 * DRing requests. 3333 */ 3334 if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) || 3335 (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) { 3336 DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n", 3337 dmsg.tag.vio_msgtype, 3338 dmsg.tag.vio_subtype, 3339 dmsg.tag.vio_subtype_env); 3340 continue; 3341 } 3342 3343 /* 3344 * Record if the packet was ACK'ed or not. If the packet was not 3345 * ACK'ed then we will just mark the request as failed; we don't 3346 * want to reset the connection at this point. 3347 */ 3348 switch (dmsg.tag.vio_subtype) { 3349 case VIO_SUBTYPE_ACK: 3350 ack = B_TRUE; 3351 break; 3352 case VIO_SUBTYPE_NACK: 3353 ack = B_FALSE; 3354 break; 3355 default: 3356 continue; 3357 } 3358 3359 idx = dmsg.start_idx; 3360 if (idx >= vdc->dring_len) { 3361 DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n", 3362 vdc->instance, idx); 3363 continue; 3364 } 3365 ldep = &vdc->local_dring[idx]; 3366 if (ldep->dep->hdr.dstate != VIO_DESC_DONE) { 3367 DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n", 3368 vdc->instance, idx, ldep->dep->hdr.dstate); 3369 continue; 3370 } 3371 3372 switch (ldep->cb_type) { 3373 3374 case CB_STRATEGY: 3375 mbuf = ldep->cb_arg; 3376 if (mbuf != NULL) { 3377 mbuf->b_resid = mbuf->b_bcount - 3378 ldep->dep->payload.nbytes; 3379 bioerror(mbuf, 3380 ack ? ldep->dep->payload.status : EIO); 3381 biodone(mbuf); 3382 } 3383 rv = vdc_depopulate_descriptor(vdc, idx); 3384 if (buf != NULL && buf == mbuf) { 3385 rv = 0; 3386 goto done; 3387 } 3388 break; 3389 3390 case CB_SYNC: 3391 rv = vdc_depopulate_descriptor(vdc, idx); 3392 vdc->sync_op_status = ack ? rv : EIO; 3393 vdc->sync_op_pending = B_FALSE; 3394 cv_signal(&vdc->sync_pending_cv); 3395 if (cb_type == CB_SYNC) { 3396 rv = 0; 3397 goto done; 3398 } 3399 break; 3400 } 3401 3402 /* if this is the last descriptor - break out of loop */ 3403 if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx) { 3404 /* 3405 * If we were expecting a response for a particular 3406 * request then we return with an error otherwise we 3407 * have successfully completed the drain. 3408 */ 3409 rv = (buf != NULL || cb_type == CB_SYNC)? 
ESRCH: 0; 3410 break; 3411 } 3412 } 3413 3414 done: 3415 mutex_exit(&vdc->lock); 3416 DMSG(vdc, 0, "End idx=%d\n", idx); 3417 3418 return (rv); 3419 } 3420 3421 3422 /* 3423 * Function: 3424 * vdc_depopulate_descriptor() 3425 * 3426 * Description: 3427 * 3428 * Arguments: 3429 * vdc - soft state pointer for this instance of the device driver. 3430 * idx - Index of the Descriptor Ring entry being modified 3431 * 3432 * Return Code: 3433 * 0 - Success 3434 */ 3435 static int 3436 vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx) 3437 { 3438 vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ 3439 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3440 int status = ENXIO; 3441 int rv = 0; 3442 3443 ASSERT(vdc != NULL); 3444 ASSERT(idx < vdc->dring_len); 3445 ldep = &vdc->local_dring[idx]; 3446 ASSERT(ldep != NULL); 3447 ASSERT(MUTEX_HELD(&vdc->lock)); 3448 3449 DTRACE_PROBE2(depopulate, int, vdc->instance, vdc_local_desc_t *, ldep); 3450 DMSG(vdc, 2, ": idx = %d\n", idx); 3451 3452 dep = ldep->dep; 3453 ASSERT(dep != NULL); 3454 ASSERT((dep->hdr.dstate == VIO_DESC_DONE) || 3455 (dep->payload.status == ECANCELED)); 3456 3457 VDC_MARK_DRING_ENTRY_FREE(vdc, idx); 3458 3459 ldep->is_free = B_TRUE; 3460 status = dep->payload.status; 3461 DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status); 3462 3463 /* 3464 * If no buffers were used to transfer information to the server when 3465 * populating the descriptor then no memory handles need to be unbound 3466 * and we can return now. 3467 */ 3468 if (ldep->nbytes == 0) { 3469 cv_signal(&vdc->dring_free_cv); 3470 return (status); 3471 } 3472 3473 /* 3474 * If the upper layer passed in a misaligned address we copied the 3475 * data into an aligned buffer before sending it to LDC - we now 3476 * copy it back to the original buffer. 3477 */ 3478 if (ldep->align_addr) { 3479 ASSERT(ldep->addr != NULL); 3480 3481 if (dep->payload.nbytes > 0) 3482 bcopy(ldep->align_addr, ldep->addr, 3483 dep->payload.nbytes); 3484 kmem_free(ldep->align_addr, 3485 sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8)); 3486 ldep->align_addr = NULL; 3487 } 3488 3489 rv = ldc_mem_unbind_handle(ldep->desc_mhdl); 3490 if (rv != 0) { 3491 DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)", 3492 vdc->instance, ldep->desc_mhdl, idx, rv); 3493 /* 3494 * The error returned by the vDisk server is more informative 3495 * and thus has a higher priority but if it isn't set we ensure 3496 * that this function returns an error. 3497 */ 3498 if (status == 0) 3499 status = EINVAL; 3500 } 3501 3502 cv_signal(&vdc->membind_cv); 3503 cv_signal(&vdc->dring_free_cv); 3504 3505 return (status); 3506 } 3507 3508 /* 3509 * Function: 3510 * vdc_populate_mem_hdl() 3511 * 3512 * Description: 3513 * 3514 * Arguments: 3515 * vdc - soft state pointer for this instance of the device driver. 
3516 * ldep - local dring entry describing the transfer: addr is the
3517 * virtual address being mapped in, nbytes is the number
3518 * of bytes in 'addr', and operation is the vDisk
3519 * operation being performed (VD_OP_xxx).
3520 *
3521 * Return Code:
3522 * 0 - Success
3523 */
3524 static int
3525 vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep)
3526 {
3527 vd_dring_entry_t *dep = NULL;
3528 ldc_mem_handle_t mhdl;
3529 caddr_t vaddr;
3530 size_t nbytes;
3531 uint8_t perm = LDC_MEM_RW;
3532 uint8_t maptype;
3533 int rv = 0;
3534 int i;
3535
3536 ASSERT(vdcp != NULL);
3537
3538 dep = ldep->dep;
3539 mhdl = ldep->desc_mhdl;
3540
3541 switch (ldep->dir) {
3542 case VIO_read_dir:
3543 perm = LDC_MEM_W;
3544 break;
3545
3546 case VIO_write_dir:
3547 perm = LDC_MEM_R;
3548 break;
3549
3550 case VIO_both_dir:
3551 perm = LDC_MEM_RW;
3552 break;
3553
3554 default:
3555 ASSERT(0); /* catch bad programming in vdc */
3556 }
3557
3558 /*
3559 * LDC expects any addresses passed in to be 8-byte aligned. We need
3560 * to copy the contents of any misaligned buffers to a newly allocated
3561 * buffer and bind it instead (and copy the contents back to the
3562 * original buffer passed in when depopulating the descriptor)
3563 */
3564 vaddr = ldep->addr;
3565 nbytes = ldep->nbytes;
3566 if (((uint64_t)vaddr & 0x7) != 0) {
3567 ASSERT(ldep->align_addr == NULL);
3568 ldep->align_addr =
3569 kmem_alloc(sizeof (caddr_t) *
3570 P2ROUNDUP(nbytes, 8), KM_SLEEP);
3571 DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating "
3572 "(buf=%p nb=%ld op=%d)\n",
3573 vdcp->instance, (void *)vaddr, (void *)ldep->align_addr,
3574 nbytes, ldep->operation);
3575 if (perm != LDC_MEM_W)
3576 bcopy(vaddr, ldep->align_addr, nbytes);
3577 vaddr = ldep->align_addr;
3578 }
3579
3580 maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP;
3581 rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8),
3582 maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies);
3583 DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n",
3584 vdcp->instance, dep->payload.ncookies);
3585 if (rv != 0) {
3586 DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle "
3587 "(mhdl=%p, buf=%p, err=%d)\n",
3588 vdcp->instance, (void *)mhdl, (void *)vaddr, rv);
3589 if (ldep->align_addr) {
3590 kmem_free(ldep->align_addr,
3591 sizeof (caddr_t) * P2ROUNDUP(nbytes, 8));
3592 ldep->align_addr = NULL;
3593 }
3594 return (EAGAIN);
3595 }
3596
3597 /*
3598 * Get the other cookies (if any).
3599 */
3600 for (i = 1; i < dep->payload.ncookies; i++) {
3601 rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]);
3602 if (rv != 0) {
3603 (void) ldc_mem_unbind_handle(mhdl);
3604 DMSG(vdcp, 0, "?[%d] Failed to get next cookie "
3605 "(mhdl=%lx cnum=%d), err=%d",
3606 vdcp->instance, mhdl, i, rv);
3607 if (ldep->align_addr) {
3608 kmem_free(ldep->align_addr,
3609 sizeof (caddr_t) * ldep->nbytes);
3610 ldep->align_addr = NULL;
3611 }
3612 return (EAGAIN);
3613 }
3614 }
3615
3616 return (rv);
3617 }
3618
3619 /*
3620 * Interrupt handlers for messages from LDC
3621 */
3622
3623 /*
3624 * Function:
3625 * vdc_handle_cb()
3626 *
3627 * Description:
3628 * LDC event callback: handles channel-up, read and reset/down events.
3629 * Arguments:
3630 * event - Type of event (LDC_EVT_xxx) that triggered the callback
3631 * arg - pointer to the per-server structure (vdc_server_t) for the channel.
3632 * 3633 * Return Code: 3634 * 0 - Success 3635 */ 3636 static uint_t 3637 vdc_handle_cb(uint64_t event, caddr_t arg) 3638 { 3639 ldc_status_t ldc_state; 3640 int rv = 0; 3641 vdc_server_t *srvr = (vdc_server_t *)(void *)arg; 3642 vdc_t *vdc = srvr->vdcp; 3643 3644 ASSERT(vdc != NULL); 3645 3646 DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num); 3647 3648 /* If callback is not for the current server, ignore it */ 3649 mutex_enter(&vdc->lock); 3650 3651 if (vdc->curr_server != srvr) { 3652 DMSG(vdc, 0, "[%d] Ignoring event 0x%lx for port@%ld\n", 3653 vdc->instance, event, srvr->id); 3654 mutex_exit(&vdc->lock); 3655 return (LDC_SUCCESS); 3656 } 3657 3658 /* 3659 * Depending on the type of event that triggered this callback, 3660 * we modify the handshake state or read the data. 3661 * 3662 * NOTE: not done as a switch() as event could be triggered by 3663 * a state change and a read request. Also the ordering of the 3664 * check for the event types is deliberate. 3665 */ 3666 if (event & LDC_EVT_UP) { 3667 DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance); 3668 3669 /* get LDC state */ 3670 rv = ldc_status(srvr->ldc_handle, &ldc_state); 3671 if (rv != 0) { 3672 DMSG(vdc, 0, "[%d] Couldn't get LDC status %d", 3673 vdc->instance, rv); 3674 mutex_exit(&vdc->lock); 3675 return (LDC_SUCCESS); 3676 } 3677 if (srvr->ldc_state != LDC_UP && 3678 ldc_state == LDC_UP) { 3679 /* 3680 * Reset the transaction sequence numbers when 3681 * LDC comes up. We then kick off the handshake 3682 * negotiation with the vDisk server. 3683 */ 3684 vdc->seq_num = 1; 3685 vdc->seq_num_reply = 0; 3686 srvr->ldc_state = ldc_state; 3687 cv_signal(&vdc->initwait_cv); 3688 } 3689 } 3690 3691 if (event & LDC_EVT_READ) { 3692 DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance); 3693 mutex_enter(&vdc->read_lock); 3694 cv_signal(&vdc->read_cv); 3695 vdc->read_state = VDC_READ_PENDING; 3696 mutex_exit(&vdc->read_lock); 3697 mutex_exit(&vdc->lock); 3698 3699 /* that's all we have to do - no need to handle DOWN/RESET */ 3700 return (LDC_SUCCESS); 3701 } 3702 3703 if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) { 3704 3705 DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance); 3706 3707 /* 3708 * Need to wake up any readers so they will 3709 * detect that a reset has occurred. 3710 */ 3711 mutex_enter(&vdc->read_lock); 3712 if ((vdc->read_state == VDC_READ_WAITING) || 3713 (vdc->read_state == VDC_READ_RESET)) 3714 cv_signal(&vdc->read_cv); 3715 vdc->read_state = VDC_READ_RESET; 3716 mutex_exit(&vdc->read_lock); 3717 3718 /* wake up any threads waiting for connection to come up */ 3719 if (vdc->state == VDC_STATE_INIT_WAITING) { 3720 vdc->state = VDC_STATE_RESETTING; 3721 cv_signal(&vdc->initwait_cv); 3722 } 3723 3724 } 3725 3726 mutex_exit(&vdc->lock); 3727 3728 if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) 3729 DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received", 3730 vdc->instance, event); 3731 3732 return (LDC_SUCCESS); 3733 } 3734 3735 /* 3736 * Function: 3737 * vdc_wait_for_response() 3738 * 3739 * Description: 3740 * Block waiting for a response from the server. If there is 3741 * no data the thread block on the read_cv that is signalled 3742 * by the callback when an EVT_READ occurs. 3743 * 3744 * Arguments: 3745 * vdcp - soft state pointer for this instance of the device driver. 
3746 * 3747 * Return Code: 3748 * 0 - Success 3749 */ 3750 static int 3751 vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp) 3752 { 3753 size_t nbytes = sizeof (*msgp); 3754 int status; 3755 3756 ASSERT(vdcp != NULL); 3757 3758 DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance); 3759 3760 status = vdc_recv(vdcp, msgp, &nbytes); 3761 DMSG(vdcp, 3, "vdc_read() done.. status=0x%x size=0x%x\n", 3762 status, (int)nbytes); 3763 if (status) { 3764 DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n", 3765 vdcp->instance, status); 3766 return (status); 3767 } 3768 3769 if (nbytes < sizeof (vio_msg_tag_t)) { 3770 DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n", 3771 vdcp->instance, sizeof (vio_msg_tag_t), nbytes); 3772 return (ENOMSG); 3773 } 3774 3775 DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance, 3776 msgp->tag.vio_msgtype, 3777 msgp->tag.vio_subtype, 3778 msgp->tag.vio_subtype_env); 3779 3780 /* 3781 * Verify the Session ID of the message 3782 * 3783 * Every message after the Version has been negotiated should 3784 * have the correct session ID set. 3785 */ 3786 if ((msgp->tag.vio_sid != vdcp->session_id) && 3787 (msgp->tag.vio_subtype_env != VIO_VER_INFO)) { 3788 DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, " 3789 "expected 0x%lx [seq num %lx @ %d]", 3790 vdcp->instance, msgp->tag.vio_sid, 3791 vdcp->session_id, 3792 ((vio_dring_msg_t *)msgp)->seq_num, 3793 ((vio_dring_msg_t *)msgp)->start_idx); 3794 return (ENOMSG); 3795 } 3796 return (0); 3797 } 3798 3799 3800 /* 3801 * Function: 3802 * vdc_resubmit_backup_dring() 3803 * 3804 * Description: 3805 * Resubmit each descriptor in the backed up dring to 3806 * vDisk server. The Dring was backed up during connection 3807 * reset. 3808 * 3809 * Arguments: 3810 * vdcp - soft state pointer for this instance of the device driver. 3811 * 3812 * Return Code: 3813 * 0 - Success 3814 */ 3815 static int 3816 vdc_resubmit_backup_dring(vdc_t *vdcp) 3817 { 3818 int processed = 0; 3819 int count; 3820 int b_idx; 3821 int rv = 0; 3822 int dring_size; 3823 int op; 3824 vio_msg_t vio_msg; 3825 vdc_local_desc_t *curr_ldep; 3826 3827 ASSERT(MUTEX_NOT_HELD(&vdcp->lock)); 3828 ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING); 3829 3830 if (vdcp->local_dring_backup == NULL) { 3831 /* the pending requests have already been processed */ 3832 return (0); 3833 } 3834 3835 DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n", 3836 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3837 3838 /* 3839 * Walk the backup copy of the local descriptor ring and 3840 * resubmit all the outstanding transactions. 3841 */ 3842 b_idx = vdcp->local_dring_backup_tail; 3843 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3844 3845 curr_ldep = &(vdcp->local_dring_backup[b_idx]); 3846 3847 /* only resubmit outstanding transactions */ 3848 if (!curr_ldep->is_free) { 3849 /* 3850 * If we are retrying a block read/write operation we 3851 * need to update the I/O statistics to indicate that 3852 * the request is being put back on the waitq to be 3853 * serviced (it will have been taken off after the 3854 * error was reported). 
3855 */ 3856 mutex_enter(&vdcp->lock); 3857 op = curr_ldep->operation; 3858 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3859 DTRACE_IO1(start, buf_t *, curr_ldep->cb_arg); 3860 VD_KSTAT_WAITQ_ENTER(vdcp); 3861 } 3862 3863 DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx); 3864 rv = vdc_populate_descriptor(vdcp, op, 3865 curr_ldep->addr, curr_ldep->nbytes, 3866 curr_ldep->slice, curr_ldep->offset, 3867 curr_ldep->cb_type, curr_ldep->cb_arg, 3868 curr_ldep->dir); 3869 3870 if (rv) { 3871 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3872 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3873 VD_KSTAT_WAITQ_EXIT(vdcp); 3874 DTRACE_IO1(done, buf_t *, 3875 curr_ldep->cb_arg); 3876 } 3877 DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n", 3878 vdcp->instance, b_idx); 3879 mutex_exit(&vdcp->lock); 3880 goto done; 3881 } 3882 3883 /* 3884 * If this is a block read/write we update the I/O 3885 * statistics kstat to indicate that the request 3886 * has been sent back to the vDisk server and should 3887 * now be put on the run queue. 3888 */ 3889 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3890 DTRACE_PROBE1(send, buf_t *, curr_ldep->cb_arg); 3891 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 3892 } 3893 mutex_exit(&vdcp->lock); 3894 3895 /* Wait for the response message. */ 3896 DMSG(vdcp, 1, "waiting for response to idx=%x\n", 3897 b_idx); 3898 rv = vdc_wait_for_response(vdcp, &vio_msg); 3899 if (rv) { 3900 /* 3901 * If this is a block read/write we update 3902 * the I/O statistics kstat to take it 3903 * off the run queue. 3904 */ 3905 mutex_enter(&vdcp->lock); 3906 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3907 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3908 VD_KSTAT_RUNQ_EXIT(vdcp); 3909 DTRACE_IO1(done, buf_t *, 3910 curr_ldep->cb_arg); 3911 } 3912 DMSG(vdcp, 1, "[%d] wait_for_response " 3913 "returned err=%d\n", vdcp->instance, 3914 rv); 3915 mutex_exit(&vdcp->lock); 3916 goto done; 3917 } 3918 3919 DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx); 3920 rv = vdc_process_data_msg(vdcp, &vio_msg); 3921 if (rv) { 3922 DMSG(vdcp, 1, "[%d] process_data_msg " 3923 "returned err=%d\n", vdcp->instance, 3924 rv); 3925 goto done; 3926 } 3927 /* 3928 * Mark this entry as free so that we will not resubmit 3929 * this "done" request again, if we were to use the same 3930 * backup_dring again in future. This could happen when 3931 * a reset happens while processing the backup_dring. 3932 */ 3933 curr_ldep->is_free = B_TRUE; 3934 processed++; 3935 } 3936 3937 /* get the next element to submit */ 3938 if (++b_idx >= vdcp->local_dring_backup_len) 3939 b_idx = 0; 3940 } 3941 3942 /* all done - now clear up pending dring copy */ 3943 dring_size = vdcp->local_dring_backup_len * 3944 sizeof (vdcp->local_dring_backup[0]); 3945 3946 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3947 3948 vdcp->local_dring_backup = NULL; 3949 3950 done: 3951 DTRACE_PROBE2(processed, int, processed, vdc_t *, vdcp); 3952 3953 return (rv); 3954 } 3955 3956 /* 3957 * Function: 3958 * vdc_cancel_backup_dring 3959 * 3960 * Description: 3961 * Cancel each descriptor in the backed up dring to vDisk server. 3962 * The Dring was backed up during connection reset. 3963 * 3964 * Arguments: 3965 * vdcp - soft state pointer for this instance of the device driver. 
3966 * 3967 * Return Code: 3968 * None 3969 */ 3970 void 3971 vdc_cancel_backup_dring(vdc_t *vdcp) 3972 { 3973 vdc_local_desc_t *ldep; 3974 struct buf *bufp; 3975 int count; 3976 int b_idx; 3977 int dring_size; 3978 int cancelled = 0; 3979 3980 ASSERT(MUTEX_HELD(&vdcp->lock)); 3981 ASSERT(vdcp->state == VDC_STATE_INIT || 3982 vdcp->state == VDC_STATE_INIT_WAITING || 3983 vdcp->state == VDC_STATE_NEGOTIATE || 3984 vdcp->state == VDC_STATE_RESETTING); 3985 3986 if (vdcp->local_dring_backup == NULL) { 3987 /* the pending requests have already been processed */ 3988 return; 3989 } 3990 3991 DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n", 3992 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3993 3994 /* 3995 * Walk the backup copy of the local descriptor ring and 3996 * cancel all the outstanding transactions. 3997 */ 3998 b_idx = vdcp->local_dring_backup_tail; 3999 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 4000 4001 ldep = &(vdcp->local_dring_backup[b_idx]); 4002 4003 /* only cancel outstanding transactions */ 4004 if (!ldep->is_free) { 4005 4006 DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx); 4007 cancelled++; 4008 4009 /* 4010 * All requests have already been cleared from the 4011 * local descriptor ring and the LDC channel has been 4012 * reset so we will never get any reply for these 4013 * requests. Now we just have to notify threads waiting 4014 * for replies that the request has failed. 4015 */ 4016 switch (ldep->cb_type) { 4017 case CB_SYNC: 4018 ASSERT(vdcp->sync_op_pending); 4019 vdcp->sync_op_status = EIO; 4020 vdcp->sync_op_pending = B_FALSE; 4021 cv_signal(&vdcp->sync_pending_cv); 4022 break; 4023 4024 case CB_STRATEGY: 4025 bufp = ldep->cb_arg; 4026 ASSERT(bufp != NULL); 4027 bufp->b_resid = bufp->b_bcount; 4028 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4029 VD_KSTAT_RUNQ_EXIT(vdcp); 4030 DTRACE_IO1(done, buf_t *, bufp); 4031 bioerror(bufp, EIO); 4032 biodone(bufp); 4033 break; 4034 4035 default: 4036 ASSERT(0); 4037 } 4038 4039 } 4040 4041 /* get the next element to cancel */ 4042 if (++b_idx >= vdcp->local_dring_backup_len) 4043 b_idx = 0; 4044 } 4045 4046 /* all done - now clear up pending dring copy */ 4047 dring_size = vdcp->local_dring_backup_len * 4048 sizeof (vdcp->local_dring_backup[0]); 4049 4050 (void) kmem_free(vdcp->local_dring_backup, dring_size); 4051 4052 vdcp->local_dring_backup = NULL; 4053 4054 DTRACE_PROBE2(cancelled, int, cancelled, vdc_t *, vdcp); 4055 } 4056 4057 /* 4058 * Function: 4059 * vdc_connection_timeout 4060 * 4061 * Description: 4062 * This function is invoked if the timeout set to establish the connection 4063 * with vds expires. This will happen if we spend too much time in the 4064 * VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. Then we will 4065 * cancel any pending request and mark them as failed. 4066 * 4067 * If the timeout does not expire, it will be cancelled when we reach the 4068 * VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This function can 4069 * be invoked while we are in the VDC_STATE_HANDLE_PENDING or 4070 * VDC_STATE_RESETTING state in which case we do nothing because the 4071 * timeout is being cancelled. 4072 * 4073 * Arguments: 4074 * arg - argument of the timeout function actually a soft state 4075 * pointer for the instance of the device driver. 
4076 * 4077 * Return Code: 4078 * None 4079 */ 4080 void 4081 vdc_connection_timeout(void *arg) 4082 { 4083 vdc_t *vdcp = (vdc_t *)arg; 4084 4085 mutex_enter(&vdcp->lock); 4086 4087 if (vdcp->state == VDC_STATE_HANDLE_PENDING || 4088 vdcp->state == VDC_STATE_DETACH) { 4089 /* 4090 * The connection has just been re-established or 4091 * we are detaching. 4092 */ 4093 vdcp->ctimeout_reached = B_FALSE; 4094 mutex_exit(&vdcp->lock); 4095 return; 4096 } 4097 4098 vdcp->ctimeout_reached = B_TRUE; 4099 4100 /* notify threads waiting to send requests */ 4101 cv_broadcast(&vdcp->running_cv); 4102 4103 /* cancel requests waiting for a result */ 4104 vdc_cancel_backup_dring(vdcp); 4105 4106 mutex_exit(&vdcp->lock); 4107 4108 cmn_err(CE_NOTE, "[%d] connection to service domain timed out", 4109 vdcp->instance); 4110 } 4111 4112 /* 4113 * Function: 4114 * vdc_backup_local_dring() 4115 * 4116 * Description: 4117 * Back up the current dring in the event of a reset. The dring 4118 * transactions will be resubmitted to the server when the 4119 * connection is restored. 4120 * 4121 * Arguments: 4122 * vdcp - soft state pointer for this instance of the device driver. 4123 * 4124 * Return Code: 4125 * None 4126 */ 4127 static void 4128 vdc_backup_local_dring(vdc_t *vdcp) 4129 { 4130 int dring_size; 4131 4132 ASSERT(MUTEX_HELD(&vdcp->lock)); 4133 ASSERT(vdcp->state == VDC_STATE_RESETTING); 4134 4135 /* 4136 * If the backup dring is still around, it means 4137 * that the last restore did not complete. However, 4138 * since we never got back into the running state, 4139 * the backup copy we have is still valid. 4140 */ 4141 if (vdcp->local_dring_backup != NULL) { 4142 DMSG(vdcp, 1, "reusing local descriptor ring backup " 4143 "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len, 4144 vdcp->local_dring_backup_tail); 4145 return; 4146 } 4147 4148 /* 4149 * The backup dring can be NULL and the local dring may not be 4150 * initialized. This can happen if we had a reset while establishing 4151 * a new connection but after the connection has timed out. In that 4152 * case the backup dring is NULL because the requests have been 4153 * cancelled, and the reset occurred before the local dring was 4154 * initialized. 4155 */ 4156 if (!(vdcp->initialized & VDC_DRING_LOCAL)) 4157 return; 4158 4159 DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, " 4160 "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx); 4161 4162 dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]); 4163 4164 vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP); 4165 bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size); 4166 4167 vdcp->local_dring_backup_tail = vdcp->dring_curr_idx; 4168 vdcp->local_dring_backup_len = vdcp->dring_len; 4169 } 4170 4171 static void 4172 vdc_switch_server(vdc_t *vdcp) 4173 { 4174 int rv; 4175 vdc_server_t *curr_server, *new_server; 4176 4177 ASSERT(MUTEX_HELD(&vdcp->lock)); 4178 4179 /* if there is only one server, just return */ 4180 if (vdcp->num_servers == 1) { 4181 return; 4182 } 4183 4184 /* Get current and next server */ 4185 curr_server = vdcp->curr_server; 4186 new_server = 4187 (curr_server->next) ?
curr_server->next : vdcp->server_list; 4188 ASSERT(curr_server != new_server); 4189 4190 /* bring current server's channel down */ 4191 rv = ldc_down(curr_server->ldc_handle); 4192 if (rv) { 4193 DMSG(vdcp, 0, "[%d] Cannot bring channel down, port %ld\n", 4194 vdcp->instance, curr_server->id); 4195 return; 4196 } 4197 4198 /* switch the server */ 4199 vdcp->curr_server = new_server; 4200 4201 DMSG(vdcp, 0, "[%d] Switched to next vdisk server, port@%ld, ldc@%ld\n", 4202 vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id); 4203 } 4204 4205 /* -------------------------------------------------------------------------- */ 4206 4207 /* 4208 * The following functions process the incoming messages from vds 4209 */ 4210 4211 /* 4212 * Function: 4213 * vdc_process_msg_thread() 4214 * 4215 * Description: 4216 * 4217 * Main VDC message processing thread. Each vDisk instance 4218 * has its own copy of this thread. This thread triggers 4219 * all the handshakes and data exchange with the server. It 4220 * also handles all channel resets. 4221 * 4222 * Arguments: 4223 * vdcp - soft state pointer for this instance of the device driver. 4224 * 4225 * Return Code: 4226 * None 4227 */ 4228 static void 4229 vdc_process_msg_thread(vdc_t *vdcp) 4230 { 4231 int status; 4232 int ctimeout; 4233 timeout_id_t tmid = 0; 4234 clock_t ldcup_timeout = 0; 4235 4236 mutex_enter(&vdcp->lock); 4237 4238 for (;;) { 4239 4240 #define Q(_s) (vdcp->state == _s) ? #_s : 4241 DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state, 4242 Q(VDC_STATE_INIT) 4243 Q(VDC_STATE_INIT_WAITING) 4244 Q(VDC_STATE_NEGOTIATE) 4245 Q(VDC_STATE_HANDLE_PENDING) 4246 Q(VDC_STATE_RUNNING) 4247 Q(VDC_STATE_RESETTING) 4248 Q(VDC_STATE_DETACH) 4249 "UNKNOWN"); 4250 4251 switch (vdcp->state) { 4252 case VDC_STATE_INIT: 4253 4254 /* 4255 * If requested, start a timeout to check if the 4256 * connection with vds is established in the 4257 * specified delay. If the timeout expires, we 4258 * will cancel any pending request. 4259 * 4260 * If a reset has occurred while establishing 4261 * the connection, we already have a timeout armed, 4262 * and in that case we don't need to arm a new one. 4263 * 4264 * The same rule applies when there are multiple vdisk servers. 4265 * If either a connection cannot be established or 4266 * the handshake times out, the connection thread will 4267 * try another server. The 'ctimeout' will report 4268 * back an error after it expires irrespective of 4269 * whether the vdisk is trying to connect to just 4270 * one or multiple servers. 4271 */ 4272 ctimeout = (vdc_timeout != 0)?
4273 vdc_timeout : vdcp->curr_server->ctimeout; 4274 4275 if (ctimeout != 0 && tmid == 0) { 4276 tmid = timeout(vdc_connection_timeout, vdcp, 4277 ctimeout * drv_usectohz(MICROSEC)); 4278 } 4279 4280 /* Check if we are re-initializing repeatedly */ 4281 if (vdcp->hshake_cnt > vdc_hshake_retries && 4282 vdcp->lifecycle != VDC_LC_ONLINE) { 4283 4284 DMSG(vdcp, 0, "[%d] too many handshakes, cnt=%d", 4285 vdcp->instance, vdcp->hshake_cnt); 4286 cmn_err(CE_NOTE, "[%d] disk access failed.\n", 4287 vdcp->instance); 4288 vdcp->state = VDC_STATE_DETACH; 4289 break; 4290 } 4291 4292 /* Switch to STATE_DETACH if drv is detaching */ 4293 if (vdcp->lifecycle == VDC_LC_DETACHING) { 4294 vdcp->state = VDC_STATE_DETACH; 4295 break; 4296 } 4297 4298 /* Switch server */ 4299 if (vdcp->hshake_cnt > 0) 4300 vdc_switch_server(vdcp); 4301 vdcp->hshake_cnt++; 4302 4303 /* Bring up connection with vds via LDC */ 4304 status = vdc_start_ldc_connection(vdcp); 4305 if (status != EINVAL) { 4306 vdcp->state = VDC_STATE_INIT_WAITING; 4307 } 4308 break; 4309 4310 case VDC_STATE_INIT_WAITING: 4311 4312 /* if channel is UP, start negotiation */ 4313 if (vdcp->curr_server->ldc_state == LDC_UP) { 4314 vdcp->state = VDC_STATE_NEGOTIATE; 4315 break; 4316 } 4317 4318 /* check if only one server exists */ 4319 if (vdcp->num_servers == 1) { 4320 cv_wait(&vdcp->initwait_cv, &vdcp->lock); 4321 } else { 4322 /* 4323 * Wait for LDC_UP; if it times out, switch 4324 * to another server. 4325 */ 4326 ldcup_timeout = ddi_get_lbolt() + 4327 (vdc_ldcup_timeout * 4328 drv_usectohz(MICROSEC)); 4329 status = cv_timedwait(&vdcp->initwait_cv, 4330 &vdcp->lock, ldcup_timeout); 4331 if (status == -1 && 4332 vdcp->state == VDC_STATE_INIT_WAITING && 4333 vdcp->curr_server->ldc_state != LDC_UP) { 4334 /* timed out & still waiting */ 4335 vdcp->state = VDC_STATE_INIT; 4336 break; 4337 } 4338 } 4339 4340 if (vdcp->state != VDC_STATE_INIT_WAITING) { 4341 DMSG(vdcp, 0, 4342 "state moved to %d out from under us...\n", 4343 vdcp->state); 4344 } 4345 break; 4346 4347 case VDC_STATE_NEGOTIATE: 4348 switch (status = vdc_ver_negotiation(vdcp)) { 4349 case 0: 4350 break; 4351 default: 4352 DMSG(vdcp, 0, "ver negotiate failed (%d)..\n", 4353 status); 4354 goto reset; 4355 } 4356 4357 switch (status = vdc_attr_negotiation(vdcp)) { 4358 case 0: 4359 break; 4360 default: 4361 DMSG(vdcp, 0, "attr negotiate failed (%d)..\n", 4362 status); 4363 goto reset; 4364 } 4365 4366 switch (status = vdc_dring_negotiation(vdcp)) { 4367 case 0: 4368 break; 4369 default: 4370 DMSG(vdcp, 0, "dring negotiate failed (%d)..\n", 4371 status); 4372 goto reset; 4373 } 4374 4375 switch (status = vdc_rdx_exchange(vdcp)) { 4376 case 0: 4377 vdcp->state = VDC_STATE_HANDLE_PENDING; 4378 goto done; 4379 default: 4380 DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n", 4381 status); 4382 goto reset; 4383 } 4384 reset: 4385 DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n", 4386 status); 4387 vdcp->state = VDC_STATE_RESETTING; 4388 vdcp->self_reset = B_TRUE; 4389 done: 4390 DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n", 4391 vdcp->state); 4392 break; 4393 4394 case VDC_STATE_HANDLE_PENDING: 4395 4396 if (vdcp->ctimeout_reached) { 4397 /* 4398 * The connection timeout had been reached so 4399 * pending requests have been cancelled. Now 4400 * that the connection is back we can reset 4401 * the timeout.
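 * Note that tmid is cleared below without calling
 * untimeout(9F): ctimeout_reached being set means that
 * the timeout has already fired, so there is nothing
 * left to cancel.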
4402 */ 4403 ASSERT(vdcp->local_dring_backup == NULL); 4404 ASSERT(tmid != 0); 4405 tmid = 0; 4406 vdcp->ctimeout_reached = B_FALSE; 4407 vdcp->state = VDC_STATE_RUNNING; 4408 DMSG(vdcp, 0, "[%d] connection to service " 4409 "domain is up", vdcp->instance); 4410 break; 4411 } 4412 4413 mutex_exit(&vdcp->lock); 4414 if (tmid != 0) { 4415 (void) untimeout(tmid); 4416 tmid = 0; 4417 } 4418 status = vdc_resubmit_backup_dring(vdcp); 4419 mutex_enter(&vdcp->lock); 4420 4421 if (status) 4422 vdcp->state = VDC_STATE_RESETTING; 4423 else 4424 vdcp->state = VDC_STATE_RUNNING; 4425 4426 break; 4427 4428 /* enter running state */ 4429 case VDC_STATE_RUNNING: 4430 /* 4431 * Signal anyone waiting for the connection 4432 * to come online. 4433 */ 4434 vdcp->hshake_cnt = 0; 4435 cv_broadcast(&vdcp->running_cv); 4436 4437 /* failfast has to be checked after reset */ 4438 cv_signal(&vdcp->failfast_cv); 4439 4440 /* ownership is lost during reset */ 4441 if (vdcp->ownership & VDC_OWNERSHIP_WANTED) 4442 vdcp->ownership |= VDC_OWNERSHIP_RESET; 4443 cv_signal(&vdcp->ownership_cv); 4444 4445 cmn_err(CE_CONT, "?vdisk@%d is online using " 4446 "ldc@%ld,%ld\n", vdcp->instance, 4447 vdcp->curr_server->ldc_id, vdcp->curr_server->id); 4448 4449 mutex_exit(&vdcp->lock); 4450 4451 for (;;) { 4452 vio_msg_t msg; 4453 status = vdc_wait_for_response(vdcp, &msg); 4454 if (status) break; 4455 4456 DMSG(vdcp, 1, "[%d] new pkt(s) available\n", 4457 vdcp->instance); 4458 status = vdc_process_data_msg(vdcp, &msg); 4459 if (status) { 4460 DMSG(vdcp, 1, "[%d] process_data_msg " 4461 "returned err=%d\n", vdcp->instance, 4462 status); 4463 break; 4464 } 4465 4466 } 4467 4468 mutex_enter(&vdcp->lock); 4469 4470 cmn_err(CE_CONT, "?vdisk@%d is offline\n", 4471 vdcp->instance); 4472 4473 vdcp->state = VDC_STATE_RESETTING; 4474 vdcp->self_reset = B_TRUE; 4475 break; 4476 4477 case VDC_STATE_RESETTING: 4478 /* 4479 * When we reach this state, we either come from the 4480 * VDC_STATE_RUNNING state and we can have pending 4481 * requests but no timeout armed; or we come from 4482 * the VDC_STATE_INIT_WAITING, VDC_STATE_NEGOTIATE or 4483 * VDC_STATE_HANDLE_PENDING state and there are no pending 4484 * requests, or the pending requests have already been copied 4485 * into the backup dring. So we can safely keep the 4486 * connection timeout armed while we are in this state. 4487 */ 4488 4489 DMSG(vdcp, 0, "Initiating channel reset " 4490 "(pending = %d)\n", (int)vdcp->threads_pending); 4491 4492 if (vdcp->self_reset) { 4493 DMSG(vdcp, 0, 4494 "[%d] calling stop_ldc_connection.\n", 4495 vdcp->instance); 4496 status = vdc_stop_ldc_connection(vdcp); 4497 vdcp->self_reset = B_FALSE; 4498 } 4499 4500 /* 4501 * Wait for all threads that are currently waiting 4502 * for a free dring entry.
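 * The loop below broadcasts on membind_cv and dring_free_cv,
 * then briefly drops the lock and delays so that the woken
 * threads get a chance to run, notice the reset and drop
 * threads_pending.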
4503 */ 4504 while (vdcp->threads_pending) { 4505 cv_broadcast(&vdcp->membind_cv); 4506 cv_broadcast(&vdcp->dring_free_cv); 4507 mutex_exit(&vdcp->lock); 4508 /* give the waiters enough time to wake up */ 4509 delay(vdc_hz_min_ldc_delay); 4510 mutex_enter(&vdcp->lock); 4511 } 4512 4513 ASSERT(vdcp->threads_pending == 0); 4514 4515 /* Sanity check that no thread is receiving */ 4516 ASSERT(vdcp->read_state != VDC_READ_WAITING); 4517 4518 vdcp->read_state = VDC_READ_IDLE; 4519 4520 vdc_backup_local_dring(vdcp); 4521 4522 /* clean up the old d-ring */ 4523 vdc_destroy_descriptor_ring(vdcp); 4524 4525 /* go and start again */ 4526 vdcp->state = VDC_STATE_INIT; 4527 4528 break; 4529 4530 case VDC_STATE_DETACH: 4531 DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n", 4532 vdcp->instance); 4533 4534 /* cancel any pending timeout */ 4535 mutex_exit(&vdcp->lock); 4536 if (tmid != 0) { 4537 (void) untimeout(tmid); 4538 tmid = 0; 4539 } 4540 mutex_enter(&vdcp->lock); 4541 4542 /* 4543 * Signal anyone waiting for the connection 4544 * to come online. 4545 */ 4546 cv_broadcast(&vdcp->running_cv); 4547 4548 while (vdcp->sync_op_pending) { 4549 cv_signal(&vdcp->sync_pending_cv); 4550 cv_signal(&vdcp->sync_blocked_cv); 4551 mutex_exit(&vdcp->lock); 4552 /* give the waiters enough time to wake up */ 4553 delay(vdc_hz_min_ldc_delay); 4554 mutex_enter(&vdcp->lock); 4555 } 4556 4557 mutex_exit(&vdcp->lock); 4558 4559 DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n", 4560 vdcp->instance); 4561 thread_exit(); 4562 break; 4563 } 4564 } 4565 } 4566 4567 4568 /* 4569 * Function: 4570 * vdc_process_data_msg() 4571 * 4572 * Description: 4573 * This function is called by the message processing thread each time 4574 * a message with a msgtype of VIO_TYPE_DATA is received. It will either 4575 * be an ACK or NACK from vds[1], which vdc handles as follows: 4576 * ACK - wake up the waiting thread 4577 * NACK - resend any messages necessary 4578 * 4579 * [1] Although the message format allows it, vds should not send a 4580 * VIO_SUBTYPE_INFO message to vdc asking it to read data; if for 4581 * some bizarre reason it does, vdc will reset the connection. 4582 * 4583 * Arguments: 4584 * vdcp - soft state pointer for this instance of the device driver. 4585 * msg - the LDC message sent by vds 4586 * 4587 * Return Code: 4588 * 0 - Success. 4589 * > 0 - error value returned by LDC 4590 */ 4591 static int 4592 vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg) 4593 { 4594 int status = 0; 4595 vio_dring_msg_t *dring_msg; 4596 vdc_local_desc_t *ldep = NULL; 4597 int start, end; 4598 int idx; 4599 int op; 4600 4601 dring_msg = (vio_dring_msg_t *)msg; 4602 4603 ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA); 4604 ASSERT(vdcp != NULL); 4605 4606 mutex_enter(&vdcp->lock); 4607 4608 /* 4609 * Check to see if the message has bogus data 4610 */ 4611 idx = start = dring_msg->start_idx; 4612 end = dring_msg->end_idx; 4613 if ((start >= vdcp->dring_len) || 4614 (end >= vdcp->dring_len) || (end < -1)) { 4615 /* 4616 * Update the I/O statistics to indicate that an error occurred. 4617 * No need to update the wait/run queues as no specific read or 4618 * write request is being completed in response to this 'msg'. 4619 */ 4620 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4621 DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n", 4622 vdcp->instance, start, end); 4623 mutex_exit(&vdcp->lock); 4624 return (EINVAL); 4625 } 4626 4627 /* 4628 * Verify that the sequence number is what vdc expects.
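 * The check itself is in vdc_verify_seq_num(): a reply is acceptable
 * only if seq_num_reply < message seq num <= seq_num. For example, if
 * vdc has sent requests numbered up to 27 (vdc->seq_num == 27) and the
 * last reply processed was 25 (vdc->seq_num_reply == 25), then only
 * replies 26 and 27 are acceptable; anything else is stale or bogus.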
4629 */ 4630 switch (vdc_verify_seq_num(vdcp, dring_msg)) { 4631 case VDC_SEQ_NUM_TODO: 4632 break; /* keep processing this message */ 4633 case VDC_SEQ_NUM_SKIP: 4634 mutex_exit(&vdcp->lock); 4635 return (0); 4636 case VDC_SEQ_NUM_INVALID: 4637 /* 4638 * Update the I/O statistics to indicate that an error occurred. 4639 * No need to update the wait/run queues as no specific read or 4640 * write request is being completed in response to this 'msg'. 4641 */ 4642 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4643 DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance); 4644 mutex_exit(&vdcp->lock); 4645 return (ENXIO); 4646 } 4647 4648 if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) { 4649 /* 4650 * Update the I/O statistics to indicate that an error occurred. 4651 * 4652 * We need to update the run queue if a read or write request 4653 * is being NACKed - otherwise there will appear to be an 4654 * indefinite outstanding request and statistics reported by 4655 * iostat(1M) will be incorrect. The transaction will be 4656 * resubmitted from the backup DRing following the reset 4657 * and the wait/run queues will be entered again. 4658 */ 4659 ldep = &vdcp->local_dring[idx]; 4660 op = ldep->operation; 4661 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 4662 DTRACE_IO1(done, buf_t *, ldep->cb_arg); 4663 VD_KSTAT_RUNQ_EXIT(vdcp); 4664 } 4665 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4666 VDC_DUMP_DRING_MSG(dring_msg); 4667 DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance); 4668 mutex_exit(&vdcp->lock); 4669 return (EIO); 4670 4671 } else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) { 4672 /* 4673 * Update the I/O statistics to indicate that an error occurred. 4674 * No need to update the wait/run queues as no specific read or 4675 * write request is being completed in response to this 'msg'. 4676 */ 4677 VD_UPDATE_ERR_STATS(vdcp, vd_protoerrs); 4678 mutex_exit(&vdcp->lock); 4679 return (EPROTO); 4680 } 4681 4682 DMSG(vdcp, 1, ": start %d end %d\n", start, end); 4683 ASSERT(start == end); 4684 4685 ldep = &vdcp->local_dring[idx]; 4686 4687 DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n", 4688 ldep->dep->hdr.dstate, ldep->cb_type); 4689 4690 if (ldep->dep->hdr.dstate == VIO_DESC_DONE) { 4691 struct buf *bufp; 4692 4693 switch (ldep->cb_type) { 4694 case CB_SYNC: 4695 ASSERT(vdcp->sync_op_pending); 4696 4697 status = vdc_depopulate_descriptor(vdcp, idx); 4698 vdcp->sync_op_status = status; 4699 vdcp->sync_op_pending = B_FALSE; 4700 cv_signal(&vdcp->sync_pending_cv); 4701 break; 4702 4703 case CB_STRATEGY: 4704 bufp = ldep->cb_arg; 4705 ASSERT(bufp != NULL); 4706 bufp->b_resid = 4707 bufp->b_bcount - ldep->dep->payload.nbytes; 4708 status = ldep->dep->payload.status; /* Future:ntoh */ 4709 if (status != 0) { 4710 DMSG(vdcp, 1, "strategy status=%d\n", status); 4711 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4712 bioerror(bufp, status); 4713 } 4714 4715 (void) vdc_depopulate_descriptor(vdcp, idx); 4716 4717 DMSG(vdcp, 1, 4718 "strategy complete req=%ld bytes resp=%ld bytes\n", 4719 bufp->b_bcount, ldep->dep->payload.nbytes); 4720 4721 if (status != 0 && vdcp->failfast_interval != 0) { 4722 /* 4723 * The I/O has failed and failfast is enabled. 4724 * We need the failfast thread to check if the 4725 * failure is due to a reservation conflict. 4726 */ 4727 (void) vdc_failfast_io_queue(vdcp, bufp); 4728 } else { 4729 if (status == 0) { 4730 op = (bufp->b_flags & B_READ) ?
4731 VD_OP_BREAD : VD_OP_BWRITE; 4732 VD_UPDATE_IO_STATS(vdcp, op, 4733 ldep->dep->payload.nbytes); 4734 } 4735 VD_KSTAT_RUNQ_EXIT(vdcp); 4736 DTRACE_IO1(done, buf_t *, bufp); 4737 biodone(bufp); 4738 } 4739 break; 4740 4741 default: 4742 ASSERT(0); 4743 } 4744 } 4745 4746 /* let the arrival signal propagate */ 4747 mutex_exit(&vdcp->lock); 4748 4749 /* probe gives the count of how many entries were processed */ 4750 DTRACE_PROBE2(processed, int, 1, vdc_t *, vdcp); 4751 4752 return (0); 4753 } 4754 4755 4756 /* 4757 * Function: 4758 * vdc_handle_ver_msg() 4759 * 4760 * Description: 4761 * Handle a version negotiation message (VIO_VER_INFO) from vds. 4762 * Arguments: 4763 * vdc - soft state pointer for this instance of the device driver. 4764 * ver_msg - LDC message sent by vDisk server 4765 * 4766 * Return Code: 4767 * 0 - Success 4768 */ 4769 static int 4770 vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg) 4771 { 4772 int status = 0; 4773 4774 ASSERT(vdc != NULL); 4775 ASSERT(mutex_owned(&vdc->lock)); 4776 4777 if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) { 4778 return (EPROTO); 4779 } 4780 4781 if (ver_msg->dev_class != VDEV_DISK_SERVER) { 4782 return (EINVAL); 4783 } 4784 4785 switch (ver_msg->tag.vio_subtype) { 4786 case VIO_SUBTYPE_ACK: 4787 /* 4788 * We check to see if the version returned is indeed supported 4789 * (the server may have adjusted the minor number downwards, 4790 * in which case 'ver_msg' will contain the actual version agreed) 4791 */ 4792 if (vdc_is_supported_version(ver_msg)) { 4793 vdc->ver.major = ver_msg->ver_major; 4794 vdc->ver.minor = ver_msg->ver_minor; 4795 ASSERT(vdc->ver.major > 0); 4796 } else { 4797 status = EPROTO; 4798 } 4799 break; 4800 4801 case VIO_SUBTYPE_NACK: 4802 /* 4803 * Call vdc_is_supported_version(), which will return the next 4804 * supported version (if any) in 'ver_msg'. 4805 */ 4806 (void) vdc_is_supported_version(ver_msg); 4807 if (ver_msg->ver_major > 0) { 4808 size_t len = sizeof (*ver_msg); 4809 4810 ASSERT(vdc->ver.major > 0); 4811 4812 /* reset the necessary fields and resend */ 4813 ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO; 4814 ver_msg->dev_class = VDEV_DISK; 4815 4816 status = vdc_send(vdc, (caddr_t)ver_msg, &len); 4817 DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n", 4818 vdc->instance, status); 4819 if (len != sizeof (*ver_msg)) 4820 status = EBADMSG; 4821 } else { 4822 DMSG(vdc, 0, "[%d] No common version with vDisk server", 4823 vdc->instance); 4824 status = ENOTSUP; 4825 } 4826 4827 break; 4828 case VIO_SUBTYPE_INFO: 4829 /* 4830 * Handle the case where vds starts the handshake 4831 * (for now, only vdc is the instigator) 4832 */ 4833 status = ENOTSUP; 4834 break; 4835 4836 default: 4837 status = EINVAL; 4838 break; 4839 } 4840 4841 return (status); 4842 } 4843 4844 /* 4845 * Function: 4846 * vdc_handle_attr_msg() 4847 * 4848 * Description: 4849 * Handle an attribute exchange message (VIO_ATTR_INFO) from vds. 4850 * Arguments: 4851 * vdc - soft state pointer for this instance of the device driver. 4852 * attr_msg - LDC message sent by vDisk server 4853 * 4854 * Return Code: 4855 * 0 - Success 4856 */ 4857 static int 4858 vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg) 4859 { 4860 int status = 0; 4861 4862 ASSERT(vdc != NULL); 4863 ASSERT(mutex_owned(&vdc->lock)); 4864 4865 if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) { 4866 return (EPROTO); 4867 } 4868 4869 switch (attr_msg->tag.vio_subtype) { 4870 case VIO_SUBTYPE_ACK: 4871 /* 4872 * We now verify the attributes sent by vds.
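 * The checks below reject a zero disk size, a zero maximum transfer
 * size, an unusable block size, a transfer mode other than
 * VIO_DRING_MODE_V1_0, an empty operations mask and an out-of-range
 * disk type.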
4873 */ 4874 if (attr_msg->vdisk_size == 0) { 4875 DMSG(vdc, 0, "[%d] Invalid disk size from vds", 4876 vdc->instance); 4877 status = EINVAL; 4878 break; 4879 } 4880 4881 if (attr_msg->max_xfer_sz == 0) { 4882 DMSG(vdc, 0, "[%d] Invalid transfer size from vds", 4883 vdc->instance); 4884 status = EINVAL; 4885 break; 4886 } 4887 4888 if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) { 4889 DMSG(vdc, 0, "[%d] Unknown disk size from vds", 4890 vdc->instance); 4891 attr_msg->vdisk_size = 0; 4892 } 4893 4894 /* update the VIO block size */ 4895 if (attr_msg->vdisk_block_size > 0 && 4896 vdc_update_vio_bsize(vdc, 4897 attr_msg->vdisk_block_size) != 0) { 4898 DMSG(vdc, 0, "[%d] Invalid block size (%u) from vds", 4899 vdc->instance, attr_msg->vdisk_block_size); 4900 status = EINVAL; 4901 break; 4902 } 4903 4904 /* update disk, block and transfer sizes */ 4905 vdc_update_size(vdc, attr_msg->vdisk_size, 4906 attr_msg->vdisk_block_size, attr_msg->max_xfer_sz); 4907 vdc->vdisk_type = attr_msg->vdisk_type; 4908 vdc->operations = attr_msg->operations; 4909 if (vio_ver_is_supported(vdc->ver, 1, 1)) 4910 vdc->vdisk_media = attr_msg->vdisk_media; 4911 else 4912 vdc->vdisk_media = 0; 4913 4914 DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n", 4915 vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz); 4916 DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n", 4917 vdc->instance, vdc->vdisk_bsize, 4918 attr_msg->vdisk_block_size); 4919 4920 if ((attr_msg->xfer_mode != VIO_DRING_MODE_V1_0) || 4921 (attr_msg->vdisk_size > INT64_MAX) || 4922 (attr_msg->operations == 0) || 4923 (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) { 4924 DMSG(vdc, 0, "[%d] Invalid attributes from vds", 4925 vdc->instance); 4926 status = EINVAL; 4927 break; 4928 } 4929 4930 /* 4931 * Now that we have received all attributes we can create a 4932 * fake geometry for the disk. 4933 */ 4934 vdc_create_fake_geometry(vdc); 4935 break; 4936 4937 case VIO_SUBTYPE_NACK: 4938 /* 4939 * vds could not handle the attributes we sent so we 4940 * stop negotiating. 4941 */ 4942 status = EPROTO; 4943 break; 4944 4945 case VIO_SUBTYPE_INFO: 4946 /* 4947 * Handle the case where vds starts the handshake 4948 * (for now, vdc is the only supported instigator) 4949 */ 4950 status = ENOTSUP; 4951 break; 4952 4953 default: 4954 status = ENOTSUP; 4955 break; 4956 } 4957 4958 return (status); 4959 } 4960 4961 /* 4962 * Function: 4963 * vdc_handle_dring_reg_msg() 4964 * 4965 * Description: 4966 * Handle a descriptor ring registration message (VIO_DRING_REG) from vds. 4967 * Arguments: 4968 * vdc - soft state pointer for this instance of the driver. 4969 * dring_msg - LDC message sent by vDisk server 4970 * 4971 * Return Code: 4972 * 0 - Success 4973 */ 4974 static int 4975 vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg) 4976 { 4977 int status = 0; 4978 4979 ASSERT(vdc != NULL); 4980 ASSERT(mutex_owned(&vdc->lock)); 4981 4982 if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) { 4983 return (EPROTO); 4984 } 4985 4986 switch (dring_msg->tag.vio_subtype) { 4987 case VIO_SUBTYPE_ACK: 4988 /* save the received dring_ident */ 4989 vdc->dring_ident = dring_msg->dring_ident; 4990 DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n", 4991 vdc->instance, vdc->dring_ident); 4992 break; 4993 4994 case VIO_SUBTYPE_NACK: 4995 /* 4996 * vds could not handle the DRing info we sent so we 4997 * stop negotiating.
4998 */ 4999 DMSG(vdc, 0, "[%d] server could not register DRing\n", 5000 vdc->instance); 5001 status = EPROTO; 5002 break; 5003 5004 case VIO_SUBTYPE_INFO: 5005 /* 5006 * Handle the case where vds starts the handshake 5007 * (for now, only vdc is the instigator) 5008 */ 5009 status = ENOTSUP; 5010 break; 5011 default: 5012 status = ENOTSUP; 5013 } 5014 5015 return (status); 5016 } 5017 5018 /* 5019 * Function: 5020 * vdc_verify_seq_num() 5021 * 5022 * Description: 5023 * This function verifies that the sequence number sent back by the vDisk 5024 * server with the latest message is what is expected (i.e. it is greater 5025 * than the last seq num sent by the vDisk server and less than or equal 5026 * to the last seq num generated by vdc). 5027 * 5028 * It then checks the request ID to see if any requests need processing 5029 * in the DRing. 5030 * 5031 * Arguments: 5032 * vdc - soft state pointer for this instance of the driver. 5033 * dring_msg - pointer to the LDC message sent by vds 5034 * 5035 * Return Code: 5036 * VDC_SEQ_NUM_TODO - Message needs to be processed 5037 * VDC_SEQ_NUM_SKIP - Message has already been processed 5038 * VDC_SEQ_NUM_INVALID - The seq numbers are so far out of sync that 5039 * vdc cannot deal with them 5040 */ 5041 static int 5042 vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg) 5043 { 5044 ASSERT(vdc != NULL); 5045 ASSERT(dring_msg != NULL); 5046 ASSERT(mutex_owned(&vdc->lock)); 5047 5048 /* 5049 * Check to see if the messages were responded to in the correct 5050 * order by vds. 5051 */ 5052 if ((dring_msg->seq_num <= vdc->seq_num_reply) || 5053 (dring_msg->seq_num > vdc->seq_num)) { 5054 DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: " 5055 "%lu > expected <= %lu (last proc req %lu sent %lu)\n", 5056 vdc->instance, dring_msg->seq_num, 5057 vdc->seq_num_reply, vdc->seq_num, 5058 vdc->req_id_proc, vdc->req_id); 5059 return (VDC_SEQ_NUM_INVALID); 5060 } 5061 vdc->seq_num_reply = dring_msg->seq_num; 5062 5063 if (vdc->req_id_proc < vdc->req_id) 5064 return (VDC_SEQ_NUM_TODO); 5065 else 5066 return (VDC_SEQ_NUM_SKIP); 5067 } 5068 5069 5070 /* 5071 * Function: 5072 * vdc_is_supported_version() 5073 * 5074 * Description: 5075 * This routine checks if the major/minor version numbers specified in 5076 * 'ver_msg' are supported. If not, it finds the next version that is 5077 * in the supported version list 'vdc_version[]' and sets the fields in 5078 * 'ver_msg' to those values 5079 * 5080 * Arguments: 5081 * ver_msg - LDC message sent by vDisk server 5082 * 5083 * Return Code: 5084 * B_TRUE - Success 5085 * B_FALSE - Version not supported 5086 */ 5087 static boolean_t 5088 vdc_is_supported_version(vio_ver_msg_t *ver_msg) 5089 { 5090 int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]); 5091 5092 for (int i = 0; i < vdc_num_versions; i++) { 5093 ASSERT(vdc_version[i].major > 0); 5094 ASSERT((i == 0) || 5095 (vdc_version[i].major < vdc_version[i-1].major)); 5096 5097 /* 5098 * If the major versions match, adjust the minor version, if 5099 * necessary, down to the highest value supported by this 5100 * client.
The server should support all minor versions lower 5101 * than the value it sent. 5102 */ 5103 if (ver_msg->ver_major == vdc_version[i].major) { 5104 if (ver_msg->ver_minor > vdc_version[i].minor) { 5105 DMSGX(0, 5106 "Adjusting minor version from %u to %u", 5107 ver_msg->ver_minor, vdc_version[i].minor); 5108 ver_msg->ver_minor = vdc_version[i].minor; 5109 } 5110 return (B_TRUE); 5111 } 5112 5113 /* 5114 * If the message contains a higher major version number, set 5115 * the message's major/minor versions to the current values 5116 * and return false, so this message will get resent with 5117 * these values, and the server will potentially try again 5118 * with the same or a lower version. 5119 */ 5120 if (ver_msg->ver_major > vdc_version[i].major) { 5121 ver_msg->ver_major = vdc_version[i].major; 5122 ver_msg->ver_minor = vdc_version[i].minor; 5123 DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n", 5124 ver_msg->ver_major, ver_msg->ver_minor); 5125 5126 return (B_FALSE); 5127 } 5128 5129 /* 5130 * Otherwise, the message's major version is less than the 5131 * current major version, so continue the loop to the next 5132 * (lower) supported version. 5133 */ 5134 } 5135 5136 /* 5137 * No common version was found; "ground" the version pair in the 5138 * message to terminate negotiation. 5139 */ 5140 ver_msg->ver_major = 0; 5141 ver_msg->ver_minor = 0; 5142 5143 return (B_FALSE); 5144 } 5145 /* -------------------------------------------------------------------------- */ 5146 5147 /* 5148 * DKIO(7I) support 5149 */ 5150 5151 typedef struct vdc_dk_arg { 5152 struct dk_callback dkc; 5153 int mode; 5154 dev_t dev; 5155 vdc_t *vdc; 5156 } vdc_dk_arg_t; 5157 5158 /* 5159 * Function: 5160 * vdc_dkio_flush_cb() 5161 * 5162 * Description: 5163 * This routine is a callback for DKIOCFLUSHWRITECACHE which can be called 5164 * by kernel code. 5165 * 5166 * Arguments: 5167 * arg - a pointer to a vdc_dk_arg_t structure. 5168 */ 5169 void 5170 vdc_dkio_flush_cb(void *arg) 5171 { 5172 struct vdc_dk_arg *dk_arg = (struct vdc_dk_arg *)arg; 5173 struct dk_callback *dkc = NULL; 5174 vdc_t *vdc = NULL; 5175 int rv; 5176 5177 if (dk_arg == NULL) { 5178 cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n"); 5179 return; 5180 } 5181 dkc = &dk_arg->dkc; 5182 vdc = dk_arg->vdc; 5183 ASSERT(vdc != NULL); 5184 5185 rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0, 5186 VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 5187 if (rv != 0) { 5188 DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n", 5189 vdc->instance, rv, 5190 ddi_model_convert_from(dk_arg->mode & FMODELS)); 5191 } 5192 5193 /* 5194 * Trigger the callback to notify the caller that the ioctl call has 5195 * completed. 5196 */ 5197 if ((dk_arg->mode & FKIOCTL) && 5198 (dkc != NULL) && 5199 (dkc->dkc_callback != NULL)) { 5200 ASSERT(dkc->dkc_cookie != NULL); 5201 (*dkc->dkc_callback)(dkc->dkc_cookie, rv); 5202 } 5203 5204 /* Indicate that one less DKIO write flush is outstanding */ 5205 mutex_enter(&vdc->lock); 5206 vdc->dkio_flush_pending--; 5207 ASSERT(vdc->dkio_flush_pending >= 0); 5208 mutex_exit(&vdc->lock); 5209 5210 /* free the mem that was allocated when the callback was dispatched */ 5211 kmem_free(arg, sizeof (vdc_dk_arg_t)); 5212 } 5213 5214 /* 5215 * Function: 5216 * vdc_dkio_gapart() 5217 * 5218 * Description: 5219 * This function implements the DKIOCGAPART ioctl.
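 * For illustration only, a hypothetical userland caller could invoke
 * it as follows (fd and error handling are assumed):
 *
 *	struct dk_map map[NDKMAP];
 *
 *	if (ioctl(fd, DKIOCGAPART, map) == 0)
 *		(void) printf("slice 0 is %ld blocks\n",
 *		    (long)map[0].dkl_nblk);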
5220 * 5221 * Arguments: 5222 * vdc - soft state pointer 5223 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure 5224 * flag - ioctl flags 5225 */ 5226 static int 5227 vdc_dkio_gapart(vdc_t *vdc, caddr_t arg, int flag) 5228 { 5229 struct dk_geom *geom; 5230 struct extvtoc *vtoc; 5231 union { 5232 struct dk_map map[NDKMAP]; 5233 struct dk_map32 map32[NDKMAP]; 5234 } data; 5235 int i, rv, size; 5236 5237 mutex_enter(&vdc->lock); 5238 5239 if ((rv = vdc_validate_geometry(vdc)) != 0) { 5240 mutex_exit(&vdc->lock); 5241 return (rv); 5242 } 5243 5244 if (vdc->vdisk_size > VD_OLDVTOC_LIMIT) { 5245 mutex_exit(&vdc->lock); 5246 return (EOVERFLOW); 5247 } 5248 5249 vtoc = vdc->vtoc; 5250 geom = vdc->geom; 5251 5252 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5253 5254 for (i = 0; i < vtoc->v_nparts; i++) { 5255 data.map32[i].dkl_cylno = vtoc->v_part[i].p_start / 5256 (geom->dkg_nhead * geom->dkg_nsect); 5257 data.map32[i].dkl_nblk = vtoc->v_part[i].p_size; 5258 } 5259 size = NDKMAP * sizeof (struct dk_map32); 5260 5261 } else { 5262 5263 for (i = 0; i < vtoc->v_nparts; i++) { 5264 data.map[i].dkl_cylno = vtoc->v_part[i].p_start / 5265 (geom->dkg_nhead * geom->dkg_nsect); 5266 data.map[i].dkl_nblk = vtoc->v_part[i].p_size; 5267 } 5268 size = NDKMAP * sizeof (struct dk_map); 5269 5270 } 5271 5272 mutex_exit(&vdc->lock); 5273 5274 if (ddi_copyout(&data, arg, size, flag) != 0) 5275 return (EFAULT); 5276 5277 return (0); 5278 } 5279 5280 /* 5281 * Function: 5282 * vdc_dkio_partition() 5283 * 5284 * Description: 5285 * This function implements the DKIOCPARTITION ioctl. 5286 * 5287 * Arguments: 5288 * vdc - soft state pointer 5289 * arg - a pointer to a struct partition64 structure 5290 * flag - ioctl flags 5291 */ 5292 static int 5293 vdc_dkio_partition(vdc_t *vdc, caddr_t arg, int flag) 5294 { 5295 struct partition64 p64; 5296 efi_gpt_t *gpt; 5297 efi_gpe_t *gpe; 5298 vd_efi_dev_t edev; 5299 uint_t partno; 5300 int rv; 5301 5302 if (ddi_copyin(arg, &p64, sizeof (struct partition64), flag)) { 5303 return (EFAULT); 5304 } 5305 5306 VDC_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 5307 5308 if ((rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe)) != 0) { 5309 return (rv); 5310 } 5311 5312 partno = p64.p_partno; 5313 5314 if (partno >= gpt->efi_gpt_NumberOfPartitionEntries) { 5315 vd_efi_free(&edev, gpt, gpe); 5316 return (ESRCH); 5317 } 5318 5319 bcopy(&gpe[partno].efi_gpe_PartitionTypeGUID, &p64.p_type, 5320 sizeof (struct uuid)); 5321 p64.p_start = gpe[partno].efi_gpe_StartingLBA; 5322 p64.p_size = gpe[partno].efi_gpe_EndingLBA - p64.p_start + 1; 5323 5324 if (ddi_copyout(&p64, arg, sizeof (struct partition64), flag)) { 5325 vd_efi_free(&edev, gpt, gpe); 5326 return (EFAULT); 5327 } 5328 5329 vd_efi_free(&edev, gpt, gpe); 5330 return (0); 5331 } 5332 5333 /* 5334 * Function: 5335 * vdc_dioctl_rwcmd() 5336 * 5337 * Description: 5338 * This function implements the DIOCTL_RWCMD ioctl. This ioctl is used 5339 * for DKC_DIRECT disks to read or write at an absolute disk offset. 
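 * For illustration only, a hypothetical caller reading one block at
 * absolute block address 0 might build the request as follows (fd,
 * buf and error handling are assumed):
 *
 *	struct dadkio_rwcmd rwcmd;
 *
 *	bzero(&rwcmd, sizeof (rwcmd));
 *	rwcmd.cmd = DADKIO_RWCMD_READ;
 *	rwcmd.blkaddr = 0;
 *	rwcmd.buflen = DEV_BSIZE;
 *	rwcmd.bufaddr = buf;
 *	(void) ioctl(fd, DIOCTL_RWCMD, &rwcmd);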
5340 * 5341 * Arguments: 5342 * vdc - soft state pointer 5343 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure 5344 * flag - ioctl flags 5345 */ 5346 static int 5347 vdc_dioctl_rwcmd(vdc_t *vdc, caddr_t arg, int flag) 5348 { 5349 struct dadkio_rwcmd32 rwcmd32; 5350 struct dadkio_rwcmd rwcmd; 5351 struct iovec aiov; 5352 struct uio auio; 5353 int rw, status; 5354 struct buf *buf; 5355 5356 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5357 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32, 5358 sizeof (struct dadkio_rwcmd32), flag)) { 5359 return (EFAULT); 5360 } 5361 rwcmd.cmd = rwcmd32.cmd; 5362 rwcmd.flags = rwcmd32.flags; 5363 rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr; 5364 rwcmd.buflen = rwcmd32.buflen; 5365 rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr; 5366 } else { 5367 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd, 5368 sizeof (struct dadkio_rwcmd), flag)) { 5369 return (EFAULT); 5370 } 5371 } 5372 5373 switch (rwcmd.cmd) { 5374 case DADKIO_RWCMD_READ: 5375 rw = B_READ; 5376 break; 5377 case DADKIO_RWCMD_WRITE: 5378 rw = B_WRITE; 5379 break; 5380 default: 5381 return (EINVAL); 5382 } 5383 5384 bzero((caddr_t)&aiov, sizeof (struct iovec)); 5385 aiov.iov_base = rwcmd.bufaddr; 5386 aiov.iov_len = rwcmd.buflen; 5387 5388 bzero((caddr_t)&auio, sizeof (struct uio)); 5389 auio.uio_iov = &aiov; 5390 auio.uio_iovcnt = 1; 5391 auio.uio_loffset = rwcmd.blkaddr * vdc->vdisk_bsize; 5392 auio.uio_resid = rwcmd.buflen; 5393 auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE; 5394 5395 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 5396 bioinit(buf); 5397 /* 5398 * We use the private field of buf to specify that this is an 5399 * I/O using an absolute offset. 5400 */ 5401 buf->b_private = (void *)VD_SLICE_NONE; 5402 5403 status = physio(vdc_strategy, buf, VD_MAKE_DEV(vdc->instance, 0), 5404 rw, vdc_min, &auio); 5405 5406 biofini(buf); 5407 kmem_free(buf, sizeof (buf_t)); 5408 5409 return (status); 5410 } 5411 5412 /* 5413 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated 5414 * buffer is returned in alloc_len. 5415 */ 5416 static vd_scsi_t * 5417 vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len, 5418 int *alloc_len) 5419 { 5420 vd_scsi_t *vd_scsi; 5421 int vd_scsi_len = VD_SCSI_SIZE; 5422 5423 vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t)); 5424 vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t)); 5425 vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t)); 5426 vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t)); 5427 5428 ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); 5429 5430 vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP); 5431 5432 vd_scsi->cdb_len = cdb_len; 5433 vd_scsi->sense_len = sense_len; 5434 vd_scsi->datain_len = datain_len; 5435 vd_scsi->dataout_len = dataout_len; 5436 5437 *alloc_len = vd_scsi_len; 5438 5439 return (vd_scsi); 5440 } 5441 5442 /* 5443 * Convert the status of a SCSI command to a Solaris return code. 5444 * 5445 * Arguments: 5446 * vd_scsi - The SCSI operation buffer. 5447 * log_error - indicate if an error message should be logged. 5448 * 5449 * Note that our SCSI error messages are rather primitive for the moment 5450 * and could be improved by decoding some data like the SCSI command and 5451 * the sense key. 5452 * 5453 * Return value: 5454 * 0 - Status is good. 5455 * EACCES - Status reports a reservation conflict. 5456 * ENOTSUP - Status reports a check condition and sense key 5457 * reports an illegal request. 5458 * EIO - Any other status.
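 *
 * For example, a command completing with STATUS_CHECK and a sense key
 * of KEY_ILLEGAL_REQUEST maps to ENOTSUP, while a command completing
 * with STATUS_RESERVATION_CONFLICT maps to EACCES (or panics the
 * system when failfast is enabled, see below).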
5459 */ 5460 static int 5461 vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error) 5462 { 5463 int rv; 5464 char path_str[MAXPATHLEN]; 5465 char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN]; 5466 union scsi_cdb *cdb; 5467 struct scsi_extended_sense *sense; 5468 5469 if (vd_scsi->cmd_status == STATUS_GOOD) 5470 /* no error */ 5471 return (0); 5472 5473 /* when the tunable vdc_scsi_log_error is true we log all errors */ 5474 if (vdc_scsi_log_error) 5475 log_error = B_TRUE; 5476 5477 if (log_error) { 5478 cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x\n", 5479 ddi_pathname(vdc->dip, path_str), vdc->instance, 5480 GETCMD(VD_SCSI_DATA_CDB(vd_scsi))); 5481 } 5482 5483 /* default returned value */ 5484 rv = EIO; 5485 5486 switch (vd_scsi->cmd_status) { 5487 5488 case STATUS_CHECK: 5489 case STATUS_TERMINATED: 5490 if (log_error) 5491 cmn_err(CE_CONT, "\tCheck Condition Error\n"); 5492 5493 /* check sense buffer */ 5494 if (vd_scsi->sense_len == 0 || 5495 vd_scsi->sense_status != STATUS_GOOD) { 5496 if (log_error) 5497 cmn_err(CE_CONT, "\tNo Sense Data Available\n"); 5498 break; 5499 } 5500 5501 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5502 5503 if (log_error) { 5504 cmn_err(CE_CONT, "\tSense Key: 0x%x\n" 5505 "\tASC: 0x%x, ASCQ: 0x%x\n", 5506 scsi_sense_key((uint8_t *)sense), 5507 scsi_sense_asc((uint8_t *)sense), 5508 scsi_sense_ascq((uint8_t *)sense)); 5509 } 5510 5511 if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST) 5512 rv = ENOTSUP; 5513 break; 5514 5515 case STATUS_BUSY: 5516 if (log_error) 5517 cmn_err(CE_NOTE, "\tDevice Busy\n"); 5518 break; 5519 5520 case STATUS_RESERVATION_CONFLICT: 5521 /* 5522 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then the 5523 * reservation conflict could be due to various reasons, such as 5524 * incorrect keys, or the host not being registered or not holding 5525 * the reservation. So we should not panic in that case. 5526 */ 5527 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5528 if (vdc->failfast_interval != 0 && 5529 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN && 5530 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) { 5531 /* failfast is enabled so we have to panic */ 5532 (void) snprintf(panic_str, sizeof (panic_str), 5533 VDC_RESV_CONFLICT_FMT_STR "%s", 5534 ddi_pathname(vdc->dip, path_str)); 5535 panic(panic_str); 5536 } 5537 if (log_error) 5538 cmn_err(CE_NOTE, "\tReservation Conflict\n"); 5539 rv = EACCES; 5540 break; 5541 5542 case STATUS_QFULL: 5543 if (log_error) 5544 cmn_err(CE_NOTE, "\tQueue Full\n"); 5545 break; 5546 5547 case STATUS_MET: 5548 case STATUS_INTERMEDIATE: 5549 case STATUS_SCSI2: 5550 case STATUS_INTERMEDIATE_MET: 5551 case STATUS_ACA_ACTIVE: 5552 if (log_error) 5553 cmn_err(CE_CONT, 5554 "\tUnexpected SCSI status received: 0x%x\n", 5555 vd_scsi->cmd_status); 5556 break; 5557 5558 default: 5559 if (log_error) 5560 cmn_err(CE_CONT, 5561 "\tInvalid SCSI status received: 0x%x\n", 5562 vd_scsi->cmd_status); 5563 break; 5564 } 5565 5566 return (rv); 5567 } 5568 5569 /* 5570 * Implement the USCSICMD uscsi(7I) ioctl. This ioctl is converted to 5571 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI 5572 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is 5573 * converted to a VD_OP_RESET operation.
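 *
 * For illustration only, a hypothetical userland caller issuing a
 * TEST UNIT READY through this path might do (fd and error handling
 * are assumed):
 *
 *	struct uscsi_cmd ucmd;
 *	union scsi_cdb cdb;
 *
 *	bzero(&ucmd, sizeof (ucmd));
 *	bzero(&cdb, sizeof (cdb));
 *	cdb.scc_cmd = SCMD_TEST_UNIT_READY;
 *	ucmd.uscsi_cdb = (caddr_t)&cdb;
 *	ucmd.uscsi_cdblen = CDB_GROUP0;
 *	(void) ioctl(fd, USCSICMD, &ucmd);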
5574 */ 5575 static int 5576 vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode) 5577 { 5578 struct uscsi_cmd uscsi; 5579 struct uscsi_cmd32 uscsi32; 5580 vd_scsi_t *vd_scsi; 5581 int vd_scsi_len; 5582 union scsi_cdb *cdb; 5583 struct scsi_extended_sense *sense; 5584 char *datain, *dataout; 5585 size_t cdb_len, datain_len, dataout_len, sense_len; 5586 int rv; 5587 5588 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5589 if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32), 5590 mode) != 0) 5591 return (EFAULT); 5592 uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi)); 5593 } else { 5594 if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd), 5595 mode) != 0) 5596 return (EFAULT); 5597 } 5598 5599 /* a uscsi reset is converted to a VD_OP_RESET operation */ 5600 if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN | 5601 USCSI_RESET_ALL)) { 5602 rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC, 5603 (void *)(uint64_t)mode, VIO_both_dir, B_TRUE); 5604 return (rv); 5605 } 5606 5607 /* cdb buffer length */ 5608 cdb_len = uscsi.uscsi_cdblen; 5609 5610 /* data in and out buffers length */ 5611 if (uscsi.uscsi_flags & USCSI_READ) { 5612 datain_len = uscsi.uscsi_buflen; 5613 dataout_len = 0; 5614 } else { 5615 datain_len = 0; 5616 dataout_len = uscsi.uscsi_buflen; 5617 } 5618 5619 /* sense buffer length */ 5620 if (uscsi.uscsi_flags & USCSI_RQENABLE) 5621 sense_len = uscsi.uscsi_rqlen; 5622 else 5623 sense_len = 0; 5624 5625 /* allocate buffer for the VD_SCSICMD_OP operation */ 5626 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5627 &vd_scsi_len); 5628 5629 /* 5630 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague, 5631 * but basically they prevent a SCSI command from being retried in case 5632 * of an error. 
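 * Either flag is therefore mapped to VD_SCSI_OPT_NORETRY below,
 * presumably so that the vDisk server does not retry the command
 * either.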
5633 */ 5634 if ((uscsi.uscsi_flags & USCSI_ISOLATE) || 5635 (uscsi.uscsi_flags & USCSI_DIAGNOSE)) 5636 vd_scsi->options |= VD_SCSI_OPT_NORETRY; 5637 5638 /* set task attribute */ 5639 if (uscsi.uscsi_flags & USCSI_NOTAG) { 5640 vd_scsi->task_attribute = 0; 5641 } else { 5642 if (uscsi.uscsi_flags & USCSI_HEAD) 5643 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5644 else if (uscsi.uscsi_flags & USCSI_HTAG) 5645 vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE; 5646 else if (uscsi.uscsi_flags & USCSI_OTAG) 5647 vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED; 5648 else 5649 vd_scsi->task_attribute = 0; 5650 } 5651 5652 /* set timeout */ 5653 vd_scsi->timeout = uscsi.uscsi_timeout; 5654 5655 /* copy-in cdb data */ 5656 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5657 if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) { 5658 rv = EFAULT; 5659 goto done; 5660 } 5661 5662 /* keep a pointer to the sense buffer */ 5663 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5664 5665 /* keep a pointer to the data-in buffer */ 5666 datain = (char *)VD_SCSI_DATA_IN(vd_scsi); 5667 5668 /* copy-in request data to the data-out buffer */ 5669 dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi); 5670 if (!(uscsi.uscsi_flags & USCSI_READ)) { 5671 if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len, 5672 mode)) { 5673 rv = EFAULT; 5674 goto done; 5675 } 5676 } 5677 5678 /* submit the request */ 5679 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5680 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5681 5682 if (rv != 0) 5683 goto done; 5684 5685 /* update scsi status */ 5686 uscsi.uscsi_status = vd_scsi->cmd_status; 5687 5688 /* update sense data */ 5689 if ((uscsi.uscsi_flags & USCSI_RQENABLE) && 5690 (uscsi.uscsi_status == STATUS_CHECK || 5691 uscsi.uscsi_status == STATUS_TERMINATED)) { 5692 5693 uscsi.uscsi_rqstatus = vd_scsi->sense_status; 5694 5695 if (uscsi.uscsi_rqstatus == STATUS_GOOD) { 5696 uscsi.uscsi_rqresid = uscsi.uscsi_rqlen - 5697 vd_scsi->sense_len; 5698 if (ddi_copyout(sense, uscsi.uscsi_rqbuf, 5699 vd_scsi->sense_len, mode) != 0) { 5700 rv = EFAULT; 5701 goto done; 5702 } 5703 } 5704 } 5705 5706 /* update request data */ 5707 if (uscsi.uscsi_status == STATUS_GOOD) { 5708 if (uscsi.uscsi_flags & USCSI_READ) { 5709 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5710 vd_scsi->datain_len; 5711 if (ddi_copyout(datain, uscsi.uscsi_bufaddr, 5712 vd_scsi->datain_len, mode) != 0) { 5713 rv = EFAULT; 5714 goto done; 5715 } 5716 } else { 5717 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5718 vd_scsi->dataout_len; 5719 } 5720 } 5721 5722 /* copy-out result */ 5723 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5724 uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32)); 5725 if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32), 5726 mode) != 0) { 5727 rv = EFAULT; 5728 goto done; 5729 } 5730 } else { 5731 if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd), 5732 mode) != 0) { 5733 rv = EFAULT; 5734 goto done; 5735 } 5736 } 5737 5738 /* get the return code from the SCSI command status */ 5739 rv = vdc_scsi_status(vdc, vd_scsi, 5740 !(uscsi.uscsi_flags & USCSI_SILENT)); 5741 5742 done: 5743 kmem_free(vd_scsi, vd_scsi_len); 5744 return (rv); 5745 } 5746 5747 /* 5748 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command. 5749 * 5750 * Arguments: 5751 * cmd - SCSI PERSISTENT IN command 5752 * len - length of the SCSI input buffer 5753 * vd_scsi_len - return the length of the allocated buffer 5754 * 5755 * Returned Value: 5756 * a pointer to the allocated VD_OP_SCSICMD buffer. 
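 *
 * The buffer is laid out by vdc_scsi_alloc(): the vd_scsi_t header is
 * followed by the CDB, sense, data-in and data-out areas, each rounded
 * up to an 8-byte boundary.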
5757 */ 5758 static vd_scsi_t * 5759 vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len) 5760 { 5761 int cdb_len, sense_len, datain_len, dataout_len; 5762 vd_scsi_t *vd_scsi; 5763 union scsi_cdb *cdb; 5764 5765 cdb_len = CDB_GROUP1; 5766 sense_len = sizeof (struct scsi_extended_sense); 5767 datain_len = len; 5768 dataout_len = 0; 5769 5770 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5771 vd_scsi_len); 5772 5773 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5774 5775 /* set cdb */ 5776 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN; 5777 cdb->cdb_opaque[1] = cmd; 5778 FORMG1COUNT(cdb, datain_len); 5779 5780 vd_scsi->timeout = vdc_scsi_timeout; 5781 5782 return (vd_scsi); 5783 } 5784 5785 /* 5786 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command. 5787 * 5788 * Arguments: 5789 * cmd - SCSI PERSISTENT OUT command 5790 * len - length of the SCSI output buffer 5791 * vd_scsi_len - return the length of the allocated buffer 5792 * 5793 * Returned Value: 5794 * a pointer to the allocated VD_OP_SCSICMD buffer. 5795 */ 5796 static vd_scsi_t * 5797 vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len) 5798 { 5799 int cdb_len, sense_len, datain_len, dataout_len; 5800 vd_scsi_t *vd_scsi; 5801 union scsi_cdb *cdb; 5802 5803 cdb_len = CDB_GROUP1; 5804 sense_len = sizeof (struct scsi_extended_sense); 5805 datain_len = 0; 5806 dataout_len = len; 5807 5808 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5809 vd_scsi_len); 5810 5811 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5812 5813 /* set cdb */ 5814 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT; 5815 cdb->cdb_opaque[1] = cmd; 5816 FORMG1COUNT(cdb, dataout_len); 5817 5818 vd_scsi->timeout = vdc_scsi_timeout; 5819 5820 return (vd_scsi); 5821 } 5822 5823 /* 5824 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted 5825 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk 5826 * server with a VD_OP_SCSICMD operation.
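 *
 * For illustration only, a hypothetical userland caller might invoke
 * the ioctl as follows (fd, NKEYS and error handling are assumed):
 *
 *	mhioc_inkeys_t inkeys;
 *	mhioc_key_list_t klist;
 *	mhioc_resv_key_t keys[NKEYS];
 *
 *	klist.listsize = NKEYS;
 *	klist.list = keys;
 *	inkeys.li = &klist;
 *	(void) ioctl(fd, MHIOCGRP_INKEYS, &inkeys);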
5827 */ 5828 static int 5829 vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) 5830 { 5831 vd_scsi_t *vd_scsi; 5832 mhioc_inkeys_t inkeys; 5833 mhioc_key_list_t klist; 5834 struct mhioc_inkeys32 inkeys32; 5835 struct mhioc_key_list32 klist32; 5836 sd_prin_readkeys_t *scsi_keys; 5837 void *user_keys; 5838 int vd_scsi_len; 5839 int listsize, listlen, rv; 5840 5841 /* copyin arguments */ 5842 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5843 rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); 5844 if (rv != 0) 5845 return (EFAULT); 5846 5847 rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, 5848 sizeof (klist32), mode); 5849 if (rv != 0) 5850 return (EFAULT); 5851 5852 listsize = klist32.listsize; 5853 } else { 5854 rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); 5855 if (rv != 0) 5856 return (EFAULT); 5857 5858 rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); 5859 if (rv != 0) 5860 return (EFAULT); 5861 5862 listsize = klist.listsize; 5863 } 5864 5865 /* build SCSI VD_OP request */ 5866 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, 5867 sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + 5868 (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); 5869 5870 scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); 5871 5872 /* submit the request */ 5873 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5874 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5875 5876 if (rv != 0) 5877 goto done; 5878 5879 listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; 5880 5881 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5882 inkeys32.generation = scsi_keys->generation; 5883 rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); 5884 if (rv != 0) { 5885 rv = EFAULT; 5886 goto done; 5887 } 5888 5889 klist32.listlen = listlen; 5890 rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, 5891 sizeof (klist32), mode); 5892 if (rv != 0) { 5893 rv = EFAULT; 5894 goto done; 5895 } 5896 5897 user_keys = (caddr_t)(uintptr_t)klist32.list; 5898 } else { 5899 inkeys.generation = scsi_keys->generation; 5900 rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); 5901 if (rv != 0) { 5902 rv = EFAULT; 5903 goto done; 5904 } 5905 5906 klist.listlen = listlen; 5907 rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); 5908 if (rv != 0) { 5909 rv = EFAULT; 5910 goto done; 5911 } 5912 5913 user_keys = klist.list; 5914 } 5915 5916 /* copy out keys */ 5917 if (listlen > 0 && listsize > 0) { 5918 if (listsize < listlen) 5919 listlen = listsize; 5920 rv = ddi_copyout(&scsi_keys->keylist, user_keys, 5921 listlen * MHIOC_RESV_KEY_SIZE, mode); 5922 if (rv != 0) 5923 rv = EFAULT; 5924 } 5925 5926 if (rv == 0) 5927 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5928 5929 done: 5930 kmem_free(vd_scsi, vd_scsi_len); 5931 5932 return (rv); 5933 } 5934 5935 /* 5936 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted 5937 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to 5938 * the vdisk server with a VD_OP_SCSICMD operation. 
5939 */ 5940 static int 5941 vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) 5942 { 5943 vd_scsi_t *vd_scsi; 5944 mhioc_inresvs_t inresv; 5945 mhioc_resv_desc_list_t rlist; 5946 struct mhioc_inresvs32 inresv32; 5947 struct mhioc_resv_desc_list32 rlist32; 5948 mhioc_resv_desc_t mhd_resv; 5949 sd_prin_readresv_t *scsi_resv; 5950 sd_readresv_desc_t *resv; 5951 mhioc_resv_desc_t *user_resv; 5952 int vd_scsi_len; 5953 int listsize, listlen, i, rv; 5954 5955 /* copyin arguments */ 5956 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5957 rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); 5958 if (rv != 0) 5959 return (EFAULT); 5960 5961 rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, 5962 sizeof (rlist32), mode); 5963 if (rv != 0) 5964 return (EFAULT); 5965 5966 listsize = rlist32.listsize; 5967 } else { 5968 rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); 5969 if (rv != 0) 5970 return (EFAULT); 5971 5972 rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); 5973 if (rv != 0) 5974 return (EFAULT); 5975 5976 listsize = rlist.listsize; 5977 } 5978 5979 /* build SCSI VD_OP request */ 5980 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, 5981 sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + 5982 (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); 5983 5984 scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); 5985 5986 /* submit the request */ 5987 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5988 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5989 5990 if (rv != 0) 5991 goto done; 5992 5993 listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; 5994 5995 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5996 inresv32.generation = scsi_resv->generation; 5997 rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); 5998 if (rv != 0) { 5999 rv = EFAULT; 6000 goto done; 6001 } 6002 6003 rlist32.listlen = listlen; 6004 rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li, 6005 sizeof (rlist32), mode); 6006 if (rv != 0) { 6007 rv = EFAULT; 6008 goto done; 6009 } 6010 6011 user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; 6012 } else { 6013 inresv.generation = scsi_resv->generation; 6014 rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); 6015 if (rv != 0) { 6016 rv = EFAULT; 6017 goto done; 6018 } 6019 6020 rlist.listlen = listlen; 6021 rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); 6022 if (rv != 0) { 6023 rv = EFAULT; 6024 goto done; 6025 } 6026 6027 user_resv = rlist.list; 6028 } 6029 6030 /* copy out reservations */ 6031 if (listsize > 0 && listlen > 0) { 6032 if (listsize < listlen) 6033 listlen = listsize; 6034 resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; 6035 6036 for (i = 0; i < listlen; i++) { 6037 mhd_resv.type = resv->type; 6038 mhd_resv.scope = resv->scope; 6039 mhd_resv.scope_specific_addr = 6040 BE_32(resv->scope_specific_addr); 6041 bcopy(&resv->resvkey, &mhd_resv.key, 6042 MHIOC_RESV_KEY_SIZE); 6043 6044 rv = ddi_copyout(&mhd_resv, user_resv, 6045 sizeof (mhd_resv), mode); 6046 if (rv != 0) { 6047 rv = EFAULT; 6048 goto done; 6049 } 6050 resv++; 6051 user_resv++; 6052 } 6053 } 6054 6055 if (rv == 0) 6056 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6057 6058 done: 6059 kmem_free(vd_scsi, vd_scsi_len); 6060 return (rv); 6061 } 6062 6063 /* 6064 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted 6065 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk 6066 * server with a VD_OP_SCSICMD operation. 
6067 */ 6068 static int 6069 vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) 6070 { 6071 vd_scsi_t *vd_scsi; 6072 sd_prout_t *scsi_prout; 6073 mhioc_register_t mhd_reg; 6074 int vd_scsi_len, rv; 6075 6076 /* copyin arguments */ 6077 rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); 6078 if (rv != 0) 6079 return (EFAULT); 6080 6081 /* build SCSI VD_OP request */ 6082 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, 6083 sizeof (sd_prout_t), &vd_scsi_len); 6084 6085 /* set parameters */ 6086 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6087 bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6088 bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); 6089 scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; 6090 6091 /* submit the request */ 6092 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6093 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6094 6095 if (rv == 0) 6096 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6097 6098 kmem_free(vd_scsi, vd_scsi_len); 6099 return (rv); 6100 } 6101 6102 /* 6103 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted 6104 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk 6105 * server with a VD_OP_SCSICMD operation. 6106 */ 6107 static int 6108 vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) 6109 { 6110 union scsi_cdb *cdb; 6111 vd_scsi_t *vd_scsi; 6112 sd_prout_t *scsi_prout; 6113 mhioc_resv_desc_t mhd_resv; 6114 int vd_scsi_len, rv; 6115 6116 /* copyin arguments */ 6117 rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); 6118 if (rv != 0) 6119 return (EFAULT); 6120 6121 /* build SCSI VD_OP request */ 6122 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, 6123 sizeof (sd_prout_t), &vd_scsi_len); 6124 6125 /* set parameters */ 6126 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6127 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6128 bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6129 scsi_prout->scope_address = mhd_resv.scope_specific_addr; 6130 cdb->cdb_opaque[2] = mhd_resv.type; 6131 6132 /* submit the request */ 6133 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6134 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6135 6136 if (rv == 0) 6137 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6138 6139 kmem_free(vd_scsi, vd_scsi_len); 6140 return (rv); 6141 } 6142 6143 /* 6144 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is 6145 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which 6146 * is sent to the vdisk server with a VD_OP_SCSICMD operation. 
6147 */ 6148 static int 6149 vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode) 6150 { 6151 union scsi_cdb *cdb; 6152 vd_scsi_t *vd_scsi; 6153 sd_prout_t *scsi_prout; 6154 mhioc_preemptandabort_t mhd_preempt; 6155 int vd_scsi_len, rv; 6156 6157 /* copyin arguments */ 6158 rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode); 6159 if (rv != 0) 6160 return (EFAULT); 6161 6162 /* build SCSI VD_OP request */ 6163 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT, 6164 sizeof (sd_prout_t), &vd_scsi_len); 6165 6166 /* set parameters */ 6167 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 6168 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6169 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6170 bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key, 6171 MHIOC_RESV_KEY_SIZE); 6172 bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key, 6173 MHIOC_RESV_KEY_SIZE); 6174 scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr; 6175 cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type; 6176 6177 /* submit the request */ 6178 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6179 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6180 6181 if (rv == 0) 6182 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6183 6184 kmem_free(vd_scsi, vd_scsi_len); 6185 return (rv); 6186 } 6187 6188 /* 6189 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl 6190 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY 6191 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation. 6192 */ 6193 static int 6194 vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode) 6195 { 6196 vd_scsi_t *vd_scsi; 6197 sd_prout_t *scsi_prout; 6198 mhioc_registerandignorekey_t mhd_regi; 6199 int vd_scsi_len, rv; 6200 6201 /* copyin arguments */ 6202 rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode); 6203 if (rv != 0) 6204 return (EFAULT); 6205 6206 /* build SCSI VD_OP request */ 6207 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY, 6208 sizeof (sd_prout_t), &vd_scsi_len); 6209 6210 /* set parameters */ 6211 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6212 bcopy(mhd_regi.newkey.key, scsi_prout->service_key, 6213 MHIOC_RESV_KEY_SIZE); 6214 scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl; 6215 6216 /* submit the request */ 6217 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6218 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6219 6220 if (rv == 0) 6221 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6222 6223 kmem_free(vd_scsi, vd_scsi_len); 6224 return (rv); 6225 } 6226 6227 /* 6228 * This function is used by the failfast mechanism to send a SCSI command 6229 * to check for reservation conflict. 6230 */ 6231 static int 6232 vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd) 6233 { 6234 int cdb_len, sense_len, vd_scsi_len; 6235 vd_scsi_t *vd_scsi; 6236 union scsi_cdb *cdb; 6237 int rv; 6238 6239 ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1); 6240 6241 if (scmd == SCMD_WRITE_G1) 6242 cdb_len = CDB_GROUP1; 6243 else 6244 cdb_len = CDB_GROUP0; 6245 6246 sense_len = sizeof (struct scsi_extended_sense); 6247 6248 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len); 6249 6250 /* set cdb */ 6251 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6252 cdb->scc_cmd = scmd; 6253 6254 vd_scsi->timeout = vdc_scsi_timeout; 6255 6256 /* 6257 * Submit the request. 
The last argument has to be B_FALSE so that 6258 * vdc_do_sync_op does not loop checking for reservation conflict if 6259 * the operation returns an error. 6260 */ 6261 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6262 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE); 6263 6264 if (rv == 0) 6265 (void) vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6266 6267 kmem_free(vd_scsi, vd_scsi_len); 6268 return (rv); 6269 } 6270 6271 /* 6272 * This function is used by the failfast mechanism to check for reservation 6273 * conflict. It sends SCSI commands which will fail with a reservation 6274 * conflict error if the system does not have access to the disk, and this 6275 * will panic the system. 6276 * 6277 * Return Code: 6278 * 0 - disk is accessible without reservation conflict error 6279 * != 0 - unable to check if disk is accessible 6280 */ 6281 int 6282 vdc_failfast_check_resv(vdc_t *vdc) 6283 { 6284 int failure = 0; 6285 6286 /* 6287 * Send a TEST UNIT READY command. The command will panic 6288 * the system if it fails with a reservation conflict. 6289 */ 6290 if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0) 6291 failure++; 6292 6293 /* 6294 * With SPC-3 compliant devices TEST UNIT READY will succeed on 6295 * a reserved device, so we also do a zero-byte WRITE(10) in 6296 * order to provoke a Reservation Conflict status on those newer 6297 * devices. 6298 */ 6299 if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0) 6300 failure++; 6301 6302 return (failure); 6303 } 6304 6305 /* 6306 * Add a pending I/O to the failfast I/O queue. An I/O is added to this 6307 * queue when it has failed and failfast is enabled. Then we have to check 6308 * whether it failed because of a reservation conflict, in which case we have 6309 * to panic the system. 6310 * 6311 * Async I/O should be queued with their block I/O data transfer structure 6312 * (buf). Sync I/O should be queued with buf = NULL. 6313 */ 6314 static vdc_io_t * 6315 vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf) 6316 { 6317 vdc_io_t *vio; 6318 6319 ASSERT(MUTEX_HELD(&vdc->lock)); 6320 6321 vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP); 6322 vio->vio_next = vdc->failfast_io_queue; 6323 vio->vio_buf = buf; 6324 vio->vio_qtime = ddi_get_lbolt(); 6325 6326 vdc->failfast_io_queue = vio; 6327 6328 /* notify the failfast thread that a new I/O is queued */ 6329 cv_signal(&vdc->failfast_cv); 6330 6331 return (vio); 6332 } 6333 6334 /* 6335 * Remove and complete I/O in the failfast I/O queue which have been 6336 * added after the indicated deadline. A deadline of 0 means that all 6337 * I/O have to be unqueued and marked as completed. 6338 */ 6339 static void 6340 vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline) 6341 { 6342 vdc_io_t *vio, *vio_tmp; 6343 6344 ASSERT(MUTEX_HELD(&vdc->lock)); 6345 6346 vio_tmp = NULL; 6347 vio = vdc->failfast_io_queue; 6348 6349 if (deadline != 0) { 6350 /* 6351 * Skip any I/O queued after the deadline. The failfast 6352 * I/O queue is ordered starting with the last I/O added 6353 * to the queue. 6354 */ 6355 while (vio != NULL && vio->vio_qtime > deadline) { 6356 vio_tmp = vio; 6357 vio = vio->vio_next; 6358 } 6359 } 6360 6361 if (vio == NULL) 6362 /* nothing to unqueue */ 6363 return; 6364 6365 /* update the queue */ 6366 if (vio_tmp == NULL) 6367 vdc->failfast_io_queue = NULL; 6368 else 6369 vio_tmp->vio_next = NULL; 6370 6371 /* 6372 * Complete unqueued I/O.
Async I/O have a block I/O data transfer 6373 * structure (buf) and they are completed by calling biodone(). Sync 6374 * I/O do not have a buf and they are completed by setting the 6375 * vio_qtime to zero and signaling failfast_io_cv. In that case, the 6376 * thread waiting for the I/O to complete is responsible for freeing 6377 * the vio structure. 6378 */ 6379 while (vio != NULL) { 6380 vio_tmp = vio->vio_next; 6381 if (vio->vio_buf != NULL) { 6382 VD_KSTAT_RUNQ_EXIT(vdc); 6383 DTRACE_IO1(done, buf_t *, vio->vio_buf); 6384 biodone(vio->vio_buf); 6385 kmem_free(vio, sizeof (vdc_io_t)); 6386 } else { 6387 vio->vio_qtime = 0; 6388 } 6389 vio = vio_tmp; 6390 } 6391 6392 cv_broadcast(&vdc->failfast_io_cv); 6393 } 6394 6395 /* 6396 * Failfast Thread. 6397 * 6398 * While failfast is enabled, the failfast thread regularly sends a TEST 6399 * UNIT READY and a zero-byte WRITE(10) SCSI command to check that 6400 * we still have access to the disk. If a command fails with a RESERVATION 6401 * CONFLICT error then the system will immediately panic. 6402 * 6403 * The failfast thread is also woken up when an I/O has failed. It then checks 6404 * the access to the disk to ensure that the I/O failure was not due to a 6405 * reservation conflict. 6406 * 6407 * There is one failfast thread for each virtual disk for which failfast is 6408 * enabled. We could have only one thread sending requests for all disks but 6409 * this would need vdc to send asynchronous requests and to have callbacks to 6410 * process replies. 6411 */ 6412 static void 6413 vdc_failfast_thread(void *arg) 6414 { 6415 int status; 6416 vdc_t *vdc = (vdc_t *)arg; 6417 clock_t timeout, starttime; 6418 6419 mutex_enter(&vdc->lock); 6420 6421 while (vdc->failfast_interval != 0) { 6422 6423 starttime = ddi_get_lbolt(); 6424 6425 mutex_exit(&vdc->lock); 6426 6427 /* check for reservation conflict */ 6428 status = vdc_failfast_check_resv(vdc); 6429 6430 mutex_enter(&vdc->lock); 6431 /* 6432 * We have dropped the lock to send the SCSI command so we have 6433 * to check that failfast is still enabled. 6434 */ 6435 if (vdc->failfast_interval == 0) 6436 break; 6437 6438 /* 6439 * If we have successfully checked the disk access and there was 6440 * no reservation conflict then we can complete any I/O queued 6441 * before the last check. 6442 */ 6443 if (status == 0) 6444 vdc_failfast_io_unqueue(vdc, starttime); 6445 6446 /* proceed again if some I/O are still in the queue */ 6447 if (vdc->failfast_io_queue != NULL) 6448 continue; 6449 6450 timeout = ddi_get_lbolt() + 6451 drv_usectohz(vdc->failfast_interval); 6452 (void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout); 6453 } 6454 6455 /* 6456 * Failfast is being stopped so we can complete any queued I/O. 6457 */ 6458 vdc_failfast_io_unqueue(vdc, 0); 6459 vdc->failfast_thread = NULL; 6460 mutex_exit(&vdc->lock); 6461 thread_exit(); 6462 } 6463 6464 /* 6465 * Implement the MHIOCENFAILFAST mhd(7i) ioctl.
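 *
 * For illustration, a hypothetical caller enables failfast by passing a
 * check interval in milliseconds, and disables it by passing 0 (sketch
 * only):
 *
 *	unsigned int mh_time = 1000;	(i.e. check once per second)
 *
 *	(void) ioctl(fd, MHIOCENFAILFAST, &mh_time);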
6466 */ 6467 static int 6468 vdc_failfast(vdc_t *vdc, caddr_t arg, int mode) 6469 { 6470 unsigned int mh_time; 6471 6472 if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode)) 6473 return (EFAULT); 6474 6475 mutex_enter(&vdc->lock); 6476 if (mh_time != 0 && vdc->failfast_thread == NULL) { 6477 vdc->failfast_thread = thread_create(NULL, 0, 6478 vdc_failfast_thread, vdc, 0, &p0, TS_RUN, 6479 v.v_maxsyspri - 2); 6480 } 6481 6482 vdc->failfast_interval = mh_time * 1000; 6483 cv_signal(&vdc->failfast_cv); 6484 mutex_exit(&vdc->lock); 6485 6486 return (0); 6487 } 6488 6489 /* 6490 * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are 6491 * converted to VD_OP_SET_ACCESS operations. 6492 */ 6493 static int 6494 vdc_access_set(vdc_t *vdc, uint64_t flags, int mode) 6495 { 6496 int rv; 6497 6498 /* submit ownership command request */ 6499 rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags, 6500 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6501 VIO_both_dir, B_TRUE); 6502 6503 return (rv); 6504 } 6505 6506 /* 6507 * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a 6508 * VD_OP_GET_ACCESS operation. 6509 */ 6510 static int 6511 vdc_access_get(vdc_t *vdc, uint64_t *status, int mode) 6512 { 6513 int rv; 6514 6515 /* submit ownership command request */ 6516 rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status, 6517 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6518 VIO_both_dir, B_TRUE); 6519 6520 return (rv); 6521 } 6522 6523 /* 6524 * Disk Ownership Thread. 6525 * 6526 * When we have taken the ownership of a disk, this thread waits to be 6527 * notified when the LDC channel is reset so that it can recover the 6528 * ownership. 6529 * 6530 * Note that the thread handling the LDC reset (vdc_process_msg_thread()) 6531 * cannot be used to do the ownership recovery because it has to be 6532 * running to handle the reply message to the ownership operation. 6533 */ 6534 static void 6535 vdc_ownership_thread(void *arg) 6536 { 6537 vdc_t *vdc = (vdc_t *)arg; 6538 clock_t timeout; 6539 uint64_t status; 6540 6541 mutex_enter(&vdc->ownership_lock); 6542 mutex_enter(&vdc->lock); 6543 6544 while (vdc->ownership & VDC_OWNERSHIP_WANTED) { 6545 6546 if ((vdc->ownership & VDC_OWNERSHIP_RESET) || 6547 !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) { 6548 /* 6549 * There was a reset so the ownership has been lost, 6550 * try to recover. We do this without using the preempt 6551 * option so that we don't steal the ownership from 6552 * someone who has preempted us. 6553 */ 6554 DMSG(vdc, 0, "[%d] Ownership lost, recovering", 6555 vdc->instance); 6556 6557 vdc->ownership &= ~(VDC_OWNERSHIP_RESET | 6558 VDC_OWNERSHIP_GRANTED); 6559 6560 mutex_exit(&vdc->lock); 6561 6562 status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6563 VD_ACCESS_SET_PRESERVE, FKIOCTL); 6564 6565 mutex_enter(&vdc->lock); 6566 6567 if (status == 0) { 6568 DMSG(vdc, 0, "[%d] Ownership recovered", 6569 vdc->instance); 6570 vdc->ownership |= VDC_OWNERSHIP_GRANTED; 6571 } else { 6572 DMSG(vdc, 0, "[%d] Failed to recover ownership", 6573 vdc->instance); 6574 } 6575 6576 } 6577 6578 /* 6579 * If we have the ownership then we just wait for an event 6580 * to happen (LDC reset), otherwise we retry the recovery 6581 * after a delay.
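 *
 * For example, if the LDC channel is reset while we own the disk, the
 * ownership is flagged with VDC_OWNERSHIP_RESET and ownership_cv is
 * signaled; the loop above then reissues VD_ACCESS_SET_EXCLUSIVE
 * together with VD_ACCESS_SET_PRESERVE (but without
 * VD_ACCESS_SET_PREEMPT) so that the ownership is recovered without
 * stealing it from a host which has preempted us.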
6582 */ 6583 if (vdc->ownership & VDC_OWNERSHIP_GRANTED) 6584 timeout = 0; 6585 else 6586 timeout = ddi_get_lbolt() + 6587 drv_usectohz(vdc_ownership_delay); 6588 6589 /* Release the ownership_lock and wait on the vdc lock */ 6590 mutex_exit(&vdc->ownership_lock); 6591 6592 if (timeout == 0) 6593 (void) cv_wait(&vdc->ownership_cv, &vdc->lock); 6594 else 6595 (void) cv_timedwait(&vdc->ownership_cv, 6596 &vdc->lock, timeout); 6597 6598 mutex_exit(&vdc->lock); 6599 6600 mutex_enter(&vdc->ownership_lock); 6601 mutex_enter(&vdc->lock); 6602 } 6603 6604 vdc->ownership_thread = NULL; 6605 mutex_exit(&vdc->lock); 6606 mutex_exit(&vdc->ownership_lock); 6607 6608 thread_exit(); 6609 } 6610 6611 static void 6612 vdc_ownership_update(vdc_t *vdc, int ownership_flags) 6613 { 6614 ASSERT(MUTEX_HELD(&vdc->ownership_lock)); 6615 6616 mutex_enter(&vdc->lock); 6617 vdc->ownership = ownership_flags; 6618 if ((vdc->ownership & VDC_OWNERSHIP_WANTED) && 6619 vdc->ownership_thread == NULL) { 6620 /* start ownership thread */ 6621 vdc->ownership_thread = thread_create(NULL, 0, 6622 vdc_ownership_thread, vdc, 0, &p0, TS_RUN, 6623 v.v_maxsyspri - 2); 6624 } else { 6625 /* notify the ownership thread */ 6626 cv_signal(&vdc->ownership_cv); 6627 } 6628 mutex_exit(&vdc->lock); 6629 } 6630 6631 /* 6632 * Get the size and the block size of a virtual disk from the vdisk server. 6633 */ 6634 static int 6635 vdc_get_capacity(vdc_t *vdc, size_t *dsk_size, size_t *blk_size) 6636 { 6637 int rv = 0; 6638 size_t alloc_len; 6639 vd_capacity_t *vd_cap; 6640 6641 ASSERT(MUTEX_NOT_HELD(&vdc->lock)); 6642 6643 alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t)); 6644 6645 vd_cap = kmem_zalloc(alloc_len, KM_SLEEP); 6646 6647 rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len, 6648 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE); 6649 6650 *dsk_size = vd_cap->vdisk_size; 6651 *blk_size = vd_cap->vdisk_block_size; 6652 6653 kmem_free(vd_cap, alloc_len); 6654 return (rv); 6655 } 6656 6657 /* 6658 * Check the disk capacity. Disk size information is updated if size has 6659 * changed. 6660 * 6661 * Return 0 if the disk capacity is available, or non-zero if it is not. 6662 */ 6663 static int 6664 vdc_check_capacity(vdc_t *vdc) 6665 { 6666 size_t dsk_size, blk_size; 6667 int rv; 6668 6669 /* 6670 * If the vdisk does not support the VD_OP_GET_CAPACITY operation 6671 * then the disk capacity has been retrieved during the handshake 6672 * and there's nothing more to do here. 6673 */ 6674 if (!VD_OP_SUPPORTED(vdc->operations, VD_OP_GET_CAPACITY)) 6675 return (0); 6676 6677 if ((rv = vdc_get_capacity(vdc, &dsk_size, &blk_size)) != 0) 6678 return (rv); 6679 6680 if (dsk_size == VD_SIZE_UNKNOWN || dsk_size == 0 || blk_size == 0) 6681 return (EINVAL); 6682 6683 mutex_enter(&vdc->lock); 6684 /* 6685 * First try to update the VIO block size (which is the same as the 6686 * vdisk block size). If this returns an error then that means that 6687 * we cannot use that block size, so the vdisk is unusable and we 6688 * return an error. 6689 */ 6690 rv = vdc_update_vio_bsize(vdc, blk_size); 6691 if (rv == 0) 6692 vdc_update_size(vdc, dsk_size, blk_size, vdc->max_xfer_sz); 6693 6694 mutex_exit(&vdc->lock); 6695 6696 return (rv); 6697 } 6698 6699 /* 6700 * This structure is used in the DKIO(7I) array below.
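 *
 * For example, the DKIOCGGEOM entry below maps that ioctl to the
 * VD_OP_GET_DISKGEOM operation, reserves sizeof (vd_geom_t) bytes for
 * the message payload, and uses vdc_get_geom_convert() to translate the
 * reply back into the struct dk_geom expected by the caller.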
6701 */ 6702 typedef struct vdc_dk_ioctl { 6703 uint8_t op; /* VD_OP_XXX value */ 6704 int cmd; /* Solaris ioctl operation number */ 6705 size_t nbytes; /* size of structure to be copied */ 6706 6707 /* function to convert between vDisk and Solaris structure formats */ 6708 int (*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg, 6709 int mode, int dir); 6710 } vdc_dk_ioctl_t; 6711 6712 /* 6713 * Subset of DKIO(7I) operations currently supported 6714 */ 6715 static vdc_dk_ioctl_t dk_ioctl[] = { 6716 {VD_OP_FLUSH, DKIOCFLUSHWRITECACHE, 0, 6717 vdc_null_copy_func}, 6718 {VD_OP_GET_WCE, DKIOCGETWCE, sizeof (int), 6719 vdc_get_wce_convert}, 6720 {VD_OP_SET_WCE, DKIOCSETWCE, sizeof (int), 6721 vdc_set_wce_convert}, 6722 {VD_OP_GET_VTOC, DKIOCGVTOC, sizeof (vd_vtoc_t), 6723 vdc_get_vtoc_convert}, 6724 {VD_OP_SET_VTOC, DKIOCSVTOC, sizeof (vd_vtoc_t), 6725 vdc_set_vtoc_convert}, 6726 {VD_OP_GET_VTOC, DKIOCGEXTVTOC, sizeof (vd_vtoc_t), 6727 vdc_get_extvtoc_convert}, 6728 {VD_OP_SET_VTOC, DKIOCSEXTVTOC, sizeof (vd_vtoc_t), 6729 vdc_set_extvtoc_convert}, 6730 {VD_OP_GET_DISKGEOM, DKIOCGGEOM, sizeof (vd_geom_t), 6731 vdc_get_geom_convert}, 6732 {VD_OP_GET_DISKGEOM, DKIOCG_PHYGEOM, sizeof (vd_geom_t), 6733 vdc_get_geom_convert}, 6734 {VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t), 6735 vdc_get_geom_convert}, 6736 {VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t), 6737 vdc_set_geom_convert}, 6738 {VD_OP_GET_EFI, DKIOCGETEFI, 0, 6739 vdc_get_efi_convert}, 6740 {VD_OP_SET_EFI, DKIOCSETEFI, 0, 6741 vdc_set_efi_convert}, 6742 6743 /* DIOCTL_RWCMD is converted to a read or a write */ 6744 {0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL}, 6745 6746 /* mhd(7I) non-shared multihost disks ioctls */ 6747 {0, MHIOCTKOWN, 0, vdc_null_copy_func}, 6748 {0, MHIOCRELEASE, 0, vdc_null_copy_func}, 6749 {0, MHIOCSTATUS, 0, vdc_null_copy_func}, 6750 {0, MHIOCQRESERVE, 0, vdc_null_copy_func}, 6751 6752 /* mhd(7I) shared multihost disks ioctls */ 6753 {0, MHIOCGRP_INKEYS, 0, vdc_null_copy_func}, 6754 {0, MHIOCGRP_INRESV, 0, vdc_null_copy_func}, 6755 {0, MHIOCGRP_REGISTER, 0, vdc_null_copy_func}, 6756 {0, MHIOCGRP_RESERVE, 0, vdc_null_copy_func}, 6757 {0, MHIOCGRP_PREEMPTANDABORT, 0, vdc_null_copy_func}, 6758 {0, MHIOCGRP_REGISTERANDIGNOREKEY, 0, vdc_null_copy_func}, 6759 6760 /* mhd(7I) failfast ioctl */ 6761 {0, MHIOCENFAILFAST, 0, vdc_null_copy_func}, 6762 6763 /* 6764 * These particular ioctls are not sent to the server - vdc fakes up 6765 * the necessary info. 6766 */ 6767 {0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func}, 6768 {0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func}, 6769 {0, USCSICMD, sizeof (struct uscsi_cmd), vdc_null_copy_func}, 6770 {0, DKIOCPARTITION, 0, vdc_null_copy_func }, 6771 {0, DKIOCGAPART, 0, vdc_null_copy_func }, 6772 {0, DKIOCREMOVABLE, 0, vdc_null_copy_func}, 6773 {0, CDROMREADOFFSET, 0, vdc_null_copy_func} 6774 }; 6775 6776 /* 6777 * This function handles ioctl requests from the vd_efi_alloc_and_read() 6778 * function and forwards them to the vdisk.
6779 */ 6780 static int 6781 vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg) 6782 { 6783 vdc_t *vdc = (vdc_t *)vdisk; 6784 dev_t dev; 6785 int rval; 6786 6787 dev = makedevice(ddi_driver_major(vdc->dip), 6788 VD_MAKE_DEV(vdc->instance, 0)); 6789 6790 return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval)); 6791 } 6792 6793 /* 6794 * Function: 6795 * vd_process_ioctl() 6796 * 6797 * Description: 6798 * This routine processes disk specific ioctl calls 6799 * 6800 * Arguments: 6801 * dev - the device number 6802 * cmd - the operation [dkio(7I)] to be processed 6803 * arg - pointer to user provided structure 6804 * (contains data to be set or reference parameter for get) 6805 * mode - bit flag, indicating open settings, 32/64 bit type, etc 6806 * rvalp - pointer to return value for calling process. 6807 * 6808 * Return Code: 6809 * 0 6810 * EFAULT 6811 * ENXIO 6812 * EIO 6813 * ENOTSUP 6814 */ 6815 static int 6816 vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp) 6817 { 6818 int instance = VDCUNIT(dev); 6819 vdc_t *vdc = NULL; 6820 int rv = -1; 6821 int idx = 0; /* index into dk_ioctl[] */ 6822 size_t len = 0; /* #bytes to send to vds */ 6823 size_t alloc_len = 0; /* #bytes to allocate mem for */ 6824 caddr_t mem_p = NULL; 6825 size_t nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0])); 6826 vdc_dk_ioctl_t *iop; 6827 6828 vdc = ddi_get_soft_state(vdc_state, instance); 6829 if (vdc == NULL) { 6830 cmn_err(CE_NOTE, "![%d] Could not get soft state structure", 6831 instance); 6832 return (ENXIO); 6833 } 6834 6835 DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n", 6836 instance, cmd, dev, ddi_model_convert_from(mode & FMODELS)); 6837 6838 if (rvalp != NULL) { 6839 /* the return value of the ioctl is 0 by default */ 6840 *rvalp = 0; 6841 } 6842 6843 /* 6844 * Validate the ioctl operation to be performed. 6845 * 6846 * If we have looped through the array without finding a match then we 6847 * don't support this ioctl. 
6848 */ 6849 for (idx = 0; idx < nioctls; idx++) { 6850 if (cmd == dk_ioctl[idx].cmd) 6851 break; 6852 } 6853 6854 if (idx >= nioctls) { 6855 DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n", 6856 vdc->instance, cmd); 6857 return (ENOTSUP); 6858 } 6859 6860 iop = &(dk_ioctl[idx]); 6861 6862 if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) { 6863 /* size is not fixed for EFI ioctls, it depends on ioctl arg */ 6864 dk_efi_t dk_efi; 6865 6866 rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode); 6867 if (rv != 0) 6868 return (EFAULT); 6869 6870 len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length; 6871 } else { 6872 len = iop->nbytes; 6873 } 6874 6875 /* check if the ioctl is applicable */ 6876 switch (cmd) { 6877 case CDROMREADOFFSET: 6878 case DKIOCREMOVABLE: 6879 return (ENOTTY); 6880 6881 case USCSICMD: 6882 case MHIOCTKOWN: 6883 case MHIOCSTATUS: 6884 case MHIOCQRESERVE: 6885 case MHIOCRELEASE: 6886 case MHIOCGRP_INKEYS: 6887 case MHIOCGRP_INRESV: 6888 case MHIOCGRP_REGISTER: 6889 case MHIOCGRP_RESERVE: 6890 case MHIOCGRP_PREEMPTANDABORT: 6891 case MHIOCGRP_REGISTERANDIGNOREKEY: 6892 case MHIOCENFAILFAST: 6893 if (vdc->cinfo == NULL) 6894 return (ENXIO); 6895 if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS) 6896 return (ENOTTY); 6897 break; 6898 6899 case DIOCTL_RWCMD: 6900 if (vdc->cinfo == NULL) 6901 return (ENXIO); 6902 if (vdc->cinfo->dki_ctype != DKC_DIRECT) 6903 return (ENOTTY); 6904 break; 6905 6906 case DKIOCINFO: 6907 if (vdc->cinfo == NULL) 6908 return (ENXIO); 6909 break; 6910 6911 case DKIOCGMEDIAINFO: 6912 if (vdc->minfo == NULL) 6913 return (ENXIO); 6914 if (vdc_check_capacity(vdc) != 0) 6915 /* disk capacity is not available */ 6916 return (EIO); 6917 break; 6918 } 6919 6920 /* 6921 * Deal with ioctls which require processing other than 6922 * converting ioctl arguments and sending a corresponding 6923 * VD operation. 6924 */ 6925 switch (cmd) { 6926 6927 case USCSICMD: 6928 { 6929 return (vdc_uscsi_cmd(vdc, arg, mode)); 6930 } 6931 6932 case MHIOCTKOWN: 6933 { 6934 mutex_enter(&vdc->ownership_lock); 6935 /* 6936 * We have to set VDC_OWNERSHIP_WANTED now so that the ownership 6937 * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset 6938 * while we are processing the ioctl. 6939 */ 6940 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED); 6941 6942 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6943 VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode); 6944 if (rv == 0) { 6945 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED | 6946 VDC_OWNERSHIP_GRANTED); 6947 } else { 6948 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6949 } 6950 mutex_exit(&vdc->ownership_lock); 6951 return (rv); 6952 } 6953 6954 case MHIOCRELEASE: 6955 { 6956 mutex_enter(&vdc->ownership_lock); 6957 rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode); 6958 if (rv == 0) { 6959 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6960 } 6961 mutex_exit(&vdc->ownership_lock); 6962 return (rv); 6963 } 6964 6965 case MHIOCSTATUS: 6966 { 6967 uint64_t status; 6968 6969 rv = vdc_access_get(vdc, &status, mode); 6970 if (rv == 0 && rvalp != NULL) 6971 *rvalp = (status & VD_ACCESS_ALLOWED)?
0 : 1; 6972 return (rv); 6973 } 6974 6975 case MHIOCQRESERVE: 6976 { 6977 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode); 6978 return (rv); 6979 } 6980 6981 case MHIOCGRP_INKEYS: 6982 { 6983 return (vdc_mhd_inkeys(vdc, arg, mode)); 6984 } 6985 6986 case MHIOCGRP_INRESV: 6987 { 6988 return (vdc_mhd_inresv(vdc, arg, mode)); 6989 } 6990 6991 case MHIOCGRP_REGISTER: 6992 { 6993 return (vdc_mhd_register(vdc, arg, mode)); 6994 } 6995 6996 case MHIOCGRP_RESERVE: 6997 { 6998 return (vdc_mhd_reserve(vdc, arg, mode)); 6999 } 7000 7001 case MHIOCGRP_PREEMPTANDABORT: 7002 { 7003 return (vdc_mhd_preemptabort(vdc, arg, mode)); 7004 } 7005 7006 case MHIOCGRP_REGISTERANDIGNOREKEY: 7007 { 7008 return (vdc_mhd_registerignore(vdc, arg, mode)); 7009 } 7010 7011 case MHIOCENFAILFAST: 7012 { 7013 rv = vdc_failfast(vdc, arg, mode); 7014 return (rv); 7015 } 7016 7017 case DIOCTL_RWCMD: 7018 { 7019 return (vdc_dioctl_rwcmd(vdc, arg, mode)); 7020 } 7021 7022 case DKIOCGAPART: 7023 { 7024 return (vdc_dkio_gapart(vdc, arg, mode)); 7025 } 7026 7027 case DKIOCPARTITION: 7028 { 7029 return (vdc_dkio_partition(vdc, arg, mode)); 7030 } 7031 7032 case DKIOCINFO: 7033 { 7034 struct dk_cinfo cinfo; 7035 7036 bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo)); 7037 cinfo.dki_partition = VDCPART(dev); 7038 7039 rv = ddi_copyout(&cinfo, (void *)arg, 7040 sizeof (struct dk_cinfo), mode); 7041 if (rv != 0) 7042 return (EFAULT); 7043 7044 return (0); 7045 } 7046 7047 case DKIOCGMEDIAINFO: 7048 { 7049 ASSERT(vdc->vdisk_size != 0); 7050 ASSERT(vdc->minfo->dki_capacity != 0); 7051 rv = ddi_copyout(vdc->minfo, (void *)arg, 7052 sizeof (struct dk_minfo), mode); 7053 if (rv != 0) 7054 return (EFAULT); 7055 7056 return (0); 7057 } 7058 7059 case DKIOCFLUSHWRITECACHE: 7060 { 7061 struct dk_callback *dkc = 7062 (struct dk_callback *)(uintptr_t)arg; 7063 vdc_dk_arg_t *dkarg = NULL; 7064 7065 DMSG(vdc, 1, "[%d] Flush W$: mode %x\n", 7066 instance, mode); 7067 7068 /* 7069 * If arg is NULL, then there is no callback function 7070 * registered and the call operates synchronously; we 7071 * break and continue with the rest of the function and 7072 * wait for vds to return (i.e. after the request to 7073 * vds returns successfully, all writes completed prior 7074 * to the ioctl will have been flushed from the disk 7075 * write cache to persistent media). 7076 * 7077 * If a callback function is registered, we dispatch 7078 * the request on a task queue and return immediately. 7079 * The callback will deal with informing the calling 7080 * thread that the flush request is completed. 7081 */ 7082 if (dkc == NULL) 7083 break; 7084 7085 /* 7086 * the asynchronous callback is only supported if 7087 * invoked from within the kernel 7088 */ 7089 if ((mode & FKIOCTL) == 0) 7090 return (ENOTSUP); 7091 7092 dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP); 7093 7094 dkarg->mode = mode; 7095 dkarg->dev = dev; 7096 bcopy(dkc, &dkarg->dkc, sizeof (*dkc)); 7097 7098 mutex_enter(&vdc->lock); 7099 vdc->dkio_flush_pending++; 7100 dkarg->vdc = vdc; 7101 mutex_exit(&vdc->lock); 7102 7103 /* put the request on a task queue */ 7104 rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb, 7105 (void *)dkarg, DDI_SLEEP); 7106 if (rv == NULL) { 7107 /* clean up if dispatch fails */ 7108 mutex_enter(&vdc->lock); 7109 vdc->dkio_flush_pending--; 7110 mutex_exit(&vdc->lock); 7111 kmem_free(dkarg, sizeof (vdc_dk_arg_t)); 7112 } 7113 7114 return (rv == NULL ?
ENOMEM : 0); 7115 } 7116 } 7117 7118 /* catch programming error in vdc - should be a VD_OP_XXX ioctl */ 7119 ASSERT(iop->op != 0); 7120 7121 /* check if the vDisk server handles the operation for this vDisk */ 7122 if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) { 7123 DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n", 7124 vdc->instance, iop->op); 7125 return (ENOTSUP); 7126 } 7127 7128 /* LDC requires that the memory being mapped is 8-byte aligned */ 7129 alloc_len = P2ROUNDUP(len, sizeof (uint64_t)); 7130 DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n", 7131 instance, len, alloc_len); 7132 7133 if (alloc_len > 0) 7134 mem_p = kmem_zalloc(alloc_len, KM_SLEEP); 7135 7136 /* 7137 * Call the conversion function for this ioctl which, if necessary, 7138 * converts from the Solaris format to the format ARC'ed 7139 * as part of the vDisk protocol (FWARC 2006/195) 7140 */ 7141 ASSERT(iop->convert != NULL); 7142 rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN); 7143 if (rv != 0) { 7144 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7145 instance, rv, cmd); 7146 if (mem_p != NULL) 7147 kmem_free(mem_p, alloc_len); 7148 return (rv); 7149 } 7150 7151 /* 7152 * send request to vds to service the ioctl. 7153 */ 7154 rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len, 7155 VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode, 7156 VIO_both_dir, B_TRUE); 7157 7158 if (rv != 0) { 7159 /* 7160 * This is not necessarily an error. The ioctl could 7161 * be returning a value such as ENOTTY to indicate 7162 * that the ioctl is not applicable. 7163 */ 7164 DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n", 7165 instance, rv, cmd); 7166 if (mem_p != NULL) 7167 kmem_free(mem_p, alloc_len); 7168 7169 return (rv); 7170 } 7171 7172 /* 7173 * Call the conversion function (if it exists) for this ioctl 7174 * which converts from the format ARC'ed as part of the vDisk 7175 * protocol (FWARC 2006/195) back to a format understood by 7176 * the rest of Solaris. 
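 *
 * For example, for DKIOCGVTOC the reply buffer holds a vd_vtoc_t at
 * this point, and vdc_get_vtoc_convert() below copies it out to the
 * caller as a struct vtoc (or a struct vtoc32 for an ILP32 caller).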
7177 */ 7178 rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT); 7179 if (rv != 0) { 7180 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7181 instance, rv, cmd); 7182 if (mem_p != NULL) 7183 kmem_free(mem_p, alloc_len); 7184 return (rv); 7185 } 7186 7187 if (mem_p != NULL) 7188 kmem_free(mem_p, alloc_len); 7189 7190 return (rv); 7191 } 7192 7193 /* 7194 * Function: 7195 * vdc_null_copy_func() * 7196 * Description: 7197 * This is an empty conversion function used by ioctl calls which 7198 * do not need to convert the data being passed in/out to userland 7199 */ 7200 static int 7201 vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir) 7202 { 7203 _NOTE(ARGUNUSED(vdc)) 7204 _NOTE(ARGUNUSED(from)) 7205 _NOTE(ARGUNUSED(to)) 7206 _NOTE(ARGUNUSED(mode)) 7207 _NOTE(ARGUNUSED(dir)) 7208 7209 return (0); 7210 } 7211 7212 static int 7213 vdc_get_wce_convert(vdc_t *vdc, void *from, void *to, 7214 int mode, int dir) 7215 { 7216 _NOTE(ARGUNUSED(vdc)) 7217 7218 if (dir == VD_COPYIN) 7219 return (0); /* nothing to do */ 7220 7221 if (ddi_copyout(from, to, sizeof (int), mode) != 0) 7222 return (EFAULT); 7223 7224 return (0); 7225 } 7226 7227 static int 7228 vdc_set_wce_convert(vdc_t *vdc, void *from, void *to, 7229 int mode, int dir) 7230 { 7231 _NOTE(ARGUNUSED(vdc)) 7232 7233 if (dir == VD_COPYOUT) 7234 return (0); /* nothing to do */ 7235 7236 if (ddi_copyin(from, to, sizeof (int), mode) != 0) 7237 return (EFAULT); 7238 7239 return (0); 7240 } 7241 7242 /* 7243 * Function: 7244 * vdc_get_vtoc_convert() 7245 * 7246 * Description: 7247 * This routine performs the necessary conversions from the DKIOCGVTOC 7248 * Solaris structure to the format defined in FWARC 2006/195. 7249 * 7250 * In the struct vtoc definition, the timestamp field is marked as not 7251 * supported so it is not part of vDisk protocol (FWARC 2006/195). 7252 * However SVM uses that field to check that it can write into the VTOC, 7253 * so we fake up the contents of that field. 7254 * 7255 * Arguments: 7256 * vdc - the vDisk client 7257 * from - the buffer containing the data to be copied from 7258 * to - the buffer to be copied to 7259 * mode - flags passed to ioctl() call 7260 * dir - the "direction" of the copy - VD_COPYIN or VD_COPYOUT 7261 * 7262 * Return Code: 7263 * 0 - Success 7264 * ENXIO - incorrect buffer passed in. * EOVERFLOW - disk is too large for a non-extended VTOC. 7265 * EFAULT - ddi_copyout routine encountered an error.
7266 */ 7267 static int 7268 vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7269 { 7270 int i; 7271 struct vtoc vtoc; 7272 struct vtoc32 vtoc32; 7273 struct extvtoc evtoc; 7274 int rv; 7275 7276 if (dir != VD_COPYOUT) 7277 return (0); /* nothing to do */ 7278 7279 if ((from == NULL) || (to == NULL)) 7280 return (ENXIO); 7281 7282 if (vdc->vdisk_size > VD_OLDVTOC_LIMIT) 7283 return (EOVERFLOW); 7284 7285 VD_VTOC2VTOC((vd_vtoc_t *)from, &evtoc); 7286 7287 /* fake the VTOC timestamp field */ 7288 for (i = 0; i < V_NUMPAR; i++) { 7289 evtoc.timestamp[i] = vdc->vtoc->timestamp[i]; 7290 } 7291 7292 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 7293 /* LINTED E_ASSIGN_NARROW_CONV */ 7294 extvtoctovtoc32(evtoc, vtoc32); 7295 rv = ddi_copyout(&vtoc32, to, sizeof (vtoc32), mode); 7296 if (rv != 0) 7297 rv = EFAULT; 7298 } else { 7299 extvtoctovtoc(evtoc, vtoc); 7300 rv = ddi_copyout(&vtoc, to, sizeof (vtoc), mode); 7301 if (rv != 0) 7302 rv = EFAULT; 7303 } 7304 7305 return (rv); 7306 } 7307 7308 /* 7309 * Function: 7310 * vdc_set_vtoc_convert() 7311 * 7312 * Description: 7313 * This routine performs the necessary conversions from the DKIOCSVTOC 7314 * Solaris structure to the format defined in FWARC 2006/195. 7315 * 7316 * Arguments: 7317 * vdc - the vDisk client 7318 * from - Buffer with data 7319 * to - Buffer where data is to be copied to 7320 * mode - flags passed to ioctl 7321 * dir - direction of copy (in or out) 7322 * 7323 * Return Code: 7324 * 0 - Success 7325 * ENXIO - Invalid buffer passed in * EOVERFLOW - disk is too large for a non-extended VTOC 7326 * EFAULT - ddi_copyin of data failed 7327 */ 7328 static int 7329 vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7330 { 7331 void *uvtoc; 7332 struct vtoc vtoc; 7333 struct vtoc32 vtoc32; 7334 struct extvtoc evtoc; 7335 int i, rv; 7336 7337 if ((from == NULL) || (to == NULL)) 7338 return (ENXIO); 7339 7340 if (vdc->vdisk_size > VD_OLDVTOC_LIMIT) 7341 return (EOVERFLOW); 7342 7343 uvtoc = (dir == VD_COPYIN)? from : to; 7344 7345 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 7346 rv = ddi_copyin(uvtoc, &vtoc32, sizeof (vtoc32), mode); 7347 if (rv != 0) 7348 return (EFAULT); 7349 vtoc32toextvtoc(vtoc32, evtoc); 7350 } else { 7351 rv = ddi_copyin(uvtoc, &vtoc, sizeof (vtoc), mode); 7352 if (rv != 0) 7353 return (EFAULT); 7354 vtoctoextvtoc(vtoc, evtoc); 7355 } 7356 7357 if (dir == VD_COPYOUT) { 7358 /* 7359 * The disk label may have changed. Revalidate the disk 7360 * geometry. This will also update the device nodes. 7361 */ 7362 vdc_validate(vdc); 7363 7364 /* 7365 * We also need to keep track of the timestamp fields.
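 *
 * (The timestamps are cached here because the vDisk protocol does not
 * carry them; vdc_get_vtoc_convert() above returns these cached values
 * whenever the VTOC is read back.)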
7366 */ 7367 for (i = 0; i < V_NUMPAR; i++) { 7368 vdc->vtoc->timestamp[i] = evtoc.timestamp[i]; 7369 } 7370 7371 } else { 7372 VTOC2VD_VTOC(&evtoc, (vd_vtoc_t *)to); 7373 } 7374 7375 return (0); 7376 } 7377 7378 static int 7379 vdc_get_extvtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7380 { 7381 int i, rv; 7382 struct extvtoc evtoc; 7383 7384 if (dir != VD_COPYOUT) 7385 return (0); /* nothing to do */ 7386 7387 if ((from == NULL) || (to == NULL)) 7388 return (ENXIO); 7389 7390 VD_VTOC2VTOC((vd_vtoc_t *)from, &evtoc); 7391 7392 /* fake the VTOC timestamp field */ 7393 for (i = 0; i < V_NUMPAR; i++) { 7394 evtoc.timestamp[i] = vdc->vtoc->timestamp[i]; 7395 } 7396 7397 rv = ddi_copyout(&evtoc, to, sizeof (struct extvtoc), mode); 7398 if (rv != 0) 7399 rv = EFAULT; 7400 7401 return (rv); 7402 } 7403 7404 static int 7405 vdc_set_extvtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7406 { 7407 void *uvtoc; 7408 struct extvtoc evtoc; 7409 int i, rv; 7410 7411 if ((from == NULL) || (to == NULL)) 7412 return (ENXIO); 7413 7414 uvtoc = (dir == VD_COPYIN)? from : to; 7415 7416 rv = ddi_copyin(uvtoc, &evtoc, sizeof (struct extvtoc), mode); 7417 if (rv != 0) 7418 return (EFAULT); 7419 7420 if (dir == VD_COPYOUT) { 7421 /* 7422 * The disk label may have changed. Revalidate the disk 7423 * geometry. This will also update the device nodes. 7424 */ 7425 vdc_validate(vdc); 7426 7427 /* 7428 * We also need to keep track of the timestamp fields. 7429 */ 7430 for (i = 0; i < V_NUMPAR; i++) { 7431 vdc->vtoc->timestamp[i] = evtoc.timestamp[i]; 7432 } 7433 7434 } else { 7435 VTOC2VD_VTOC(&evtoc, (vd_vtoc_t *)to); 7436 } 7437 7438 return (0); 7439 } 7440 7441 /* 7442 * Function: 7443 * vdc_get_geom_convert() 7444 * 7445 * Description: 7446 * This routine performs the necessary conversions from the DKIOCGGEOM, 7447 * DKIOCG_PHYGEOM and DKIOCG_VIRTGEOM Solaris structures to the format 7448 * defined in FWARC 2006/195 7449 * 7450 * Arguments: 7451 * vdc - the vDisk client 7452 * from - Buffer with data 7453 * to - Buffer where data is to be copied to 7454 * mode - flags passed to ioctl 7455 * dir - direction of copy (in or out) 7456 * 7457 * Return Code: 7458 * 0 - Success 7459 * ENXIO - Invalid buffer passed in 7460 * EFAULT - ddi_copyout of data failed 7461 */ 7462 static int 7463 vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7464 { 7465 _NOTE(ARGUNUSED(vdc)) 7466 7467 struct dk_geom geom; 7468 int copy_len = sizeof (struct dk_geom); 7469 int rv = 0; 7470 7471 if (dir != VD_COPYOUT) 7472 return (0); /* nothing to do */ 7473 7474 if ((from == NULL) || (to == NULL)) 7475 return (ENXIO); 7476 7477 VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom); 7478 rv = ddi_copyout(&geom, to, copy_len, mode); 7479 if (rv != 0) 7480 rv = EFAULT; 7481 7482 return (rv); 7483 } 7484 7485 /* 7486 * Function: 7487 * vdc_set_geom_convert() 7488 * 7489 * Description: 7490 * This routine performs the necessary conversions from the DKIOCSGEOM 7491 * Solaris structure to the format defined in FWARC 2006/195.
7492 * 7493 * Arguments: 7494 * vdc - the vDisk client 7495 * from - Buffer with data 7496 * to - Buffer where data is to be copied to 7497 * mode - flags passed to ioctl 7498 * dir - direction of copy (in or out) 7499 * 7500 * Return Code: 7501 * 0 - Success 7502 * ENXIO - Invalid buffer passed in 7503 * EFAULT - ddi_copyin of data failed 7504 */ 7505 static int 7506 vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7507 { 7508 _NOTE(ARGUNUSED(vdc)) 7509 7510 vd_geom_t vdgeom; 7511 void *tmp_mem = NULL; 7512 int copy_len = sizeof (struct dk_geom); 7513 int rv = 0; 7514 7515 if (dir != VD_COPYIN) 7516 return (0); /* nothing to do */ 7517 7518 if ((from == NULL) || (to == NULL)) 7519 return (ENXIO); 7520 7521 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 7522 7523 rv = ddi_copyin(from, tmp_mem, copy_len, mode); 7524 if (rv != 0) { 7525 kmem_free(tmp_mem, copy_len); 7526 return (EFAULT); 7527 } 7528 DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom); 7529 bcopy(&vdgeom, to, sizeof (vdgeom)); 7530 kmem_free(tmp_mem, copy_len); 7531 7532 return (0); 7533 } 7534 7535 static int 7536 vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7537 { 7538 _NOTE(ARGUNUSED(vdc)) 7539 7540 vd_efi_t *vd_efi; 7541 dk_efi_t dk_efi; 7542 int rv = 0; 7543 void *uaddr; 7544 7545 if ((from == NULL) || (to == NULL)) 7546 return (ENXIO); 7547 7548 if (dir == VD_COPYIN) { 7549 7550 vd_efi = (vd_efi_t *)to; 7551 7552 rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode); 7553 if (rv != 0) 7554 return (EFAULT); 7555 7556 vd_efi->lba = dk_efi.dki_lba; 7557 vd_efi->length = dk_efi.dki_length; 7558 bzero(vd_efi->data, vd_efi->length); 7559 7560 } else { 7561 7562 rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode); 7563 if (rv != 0) 7564 return (EFAULT); 7565 7566 uaddr = dk_efi.dki_data; 7567 7568 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP); 7569 7570 VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi); 7571 7572 rv = ddi_copyout(dk_efi.dki_data, uaddr, dk_efi.dki_length, 7573 mode); 7574 if (rv != 0) { /* free the kernel buffer to avoid leaking it */ 7575 kmem_free(dk_efi.dki_data, dk_efi.dki_length); return (EFAULT); } 7576 7577 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7578 } 7579 7580 return (0); 7581 } 7582 7583 static int 7584 vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7585 { 7586 _NOTE(ARGUNUSED(vdc)) 7587 7588 dk_efi_t dk_efi; 7589 void *uaddr; 7590 7591 if (dir == VD_COPYOUT) { 7592 /* 7593 * The disk label may have changed. Revalidate the disk 7594 * geometry. This will also update the device nodes.
7595 */ 7596 vdc_validate(vdc); 7597 return (0); 7598 } 7599 7600 if ((from == NULL) || (to == NULL)) 7601 return (ENXIO); 7602 7603 if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0) 7604 return (EFAULT); 7605 7606 uaddr = dk_efi.dki_data; 7607 7608 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP); 7609 7610 if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) { 7611 /* free the kernel buffer to avoid leaking it */ kmem_free(dk_efi.dki_data, dk_efi.dki_length); return (EFAULT); } 7612 7613 DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to); 7614 7615 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7616 7617 return (0); 7618 } 7619 7620 7621 /* -------------------------------------------------------------------------- */ 7622 7623 /* 7624 * Function: 7625 * vdc_create_fake_geometry() 7626 * 7627 * Description: 7628 * This routine fakes up the disk info needed for some DKIO ioctls such 7629 * as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do] 7630 * 7631 * Note: This function must not be called until the vDisk attributes have 7632 * been exchanged as part of the handshake with the vDisk server. 7633 * 7634 * Arguments: 7635 * vdc - soft state pointer for this instance of the device driver. 7636 * 7637 * Return Code: 7638 * none. 7639 */ 7640 static void 7641 vdc_create_fake_geometry(vdc_t *vdc) 7642 { 7643 ASSERT(vdc != NULL); 7644 ASSERT(vdc->max_xfer_sz != 0); 7645 7646 /* 7647 * DKIOCINFO support 7648 */ 7649 if (vdc->cinfo == NULL) 7650 vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP); 7651 7652 (void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME); 7653 (void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME); 7654 /* max_xfer_sz is #blocks so we don't need to divide by vdisk_bsize */ 7655 vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz; 7656 7657 /* 7658 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD 7659 * operation is supported, otherwise the controller type is DKC_DIRECT. 7660 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the 7661 * controller type is always DKC_DIRECT in that case. 7662 * 7663 * If the virtual disk is backed by a physical CD/DVD device or 7664 * an ISO image, modify the controller type to indicate this. 7665 */ 7666 switch (vdc->vdisk_media) { 7667 case VD_MEDIA_CD: 7668 case VD_MEDIA_DVD: 7669 vdc->cinfo->dki_ctype = DKC_CDROM; 7670 break; 7671 case VD_MEDIA_FIXED: 7672 if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD)) 7673 vdc->cinfo->dki_ctype = DKC_SCSI_CCS; 7674 else 7675 vdc->cinfo->dki_ctype = DKC_DIRECT; 7676 break; 7677 default: 7678 /* in the case of v1.0 we default to a fixed disk */ 7679 vdc->cinfo->dki_ctype = DKC_DIRECT; 7680 break; 7681 } 7682 vdc->cinfo->dki_flags = DKI_FMTVOL; 7683 vdc->cinfo->dki_cnum = 0; 7684 vdc->cinfo->dki_addr = 0; 7685 vdc->cinfo->dki_space = 0; 7686 vdc->cinfo->dki_prio = 0; 7687 vdc->cinfo->dki_vec = 0; 7688 vdc->cinfo->dki_unit = vdc->instance; 7689 vdc->cinfo->dki_slave = 0; 7690 /* 7691 * The partition number will be created on the fly depending on the 7692 * actual slice (i.e. minor node) that is used to request the data.
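 *
 * For example, an ioctl issued through the minor node for slice 2 will
 * report dki_partition = 2: the DKIOCINFO case in vd_process_ioctl()
 * fills that field in with VDCPART(dev) for the device actually opened.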
7693 */ 7694 vdc->cinfo->dki_partition = 0; 7695 7696 /* 7697 * DKIOCGMEDIAINFO support 7698 */ 7699 if (vdc->minfo == NULL) 7700 vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP); 7701 7702 if (vio_ver_is_supported(vdc->ver, 1, 1)) { 7703 vdc->minfo->dki_media_type = 7704 VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media); 7705 } else { 7706 vdc->minfo->dki_media_type = DK_FIXED_DISK; 7707 } 7708 7709 vdc->minfo->dki_capacity = vdc->vdisk_size; 7710 vdc->minfo->dki_lbsize = vdc->vdisk_bsize; 7711 } 7712 7713 static ushort_t 7714 vdc_lbl2cksum(struct dk_label *label) 7715 { 7716 int count; 7717 ushort_t sum, *sp; 7718 7719 count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; 7720 sp = (ushort_t *)label; 7721 sum = 0; 7722 while (count--) { 7723 sum ^= *sp++; 7724 } 7725 7726 return (sum); 7727 } 7728 7729 static void 7730 vdc_update_size(vdc_t *vdc, size_t dsk_size, size_t blk_size, size_t xfr_size) 7731 { 7732 vd_err_stats_t *stp; 7733 7734 ASSERT(MUTEX_HELD(&vdc->lock)); 7735 ASSERT(xfr_size != 0); 7736 7737 /* 7738 * If the disk size is unknown or sizes are unchanged then don't 7739 * update anything. 7740 */ 7741 if (dsk_size == VD_SIZE_UNKNOWN || dsk_size == 0 || 7742 (blk_size == vdc->vdisk_bsize && dsk_size == vdc->vdisk_size && 7743 xfr_size == vdc->max_xfer_sz)) 7744 return; 7745 7746 /* 7747 * We don't know at compile time what the vDisk server will think 7748 * are good values but we apply a large (arbitrary) upper bound to 7749 * prevent memory exhaustion in vdc if it was allocating a DRing 7750 * based on huge values sent by the server. We will probably never 7751 * exceed this unless the message was garbage. 7752 */ 7753 if ((xfr_size * blk_size) > (PAGESIZE * DEV_BSIZE)) { 7754 DMSG(vdc, 0, "[%d] vds block transfer size too big;" 7755 " using max supported by vdc", vdc->instance); 7756 xfr_size = maxphys / blk_size; 7757 } 7758 7759 vdc->max_xfer_sz = xfr_size; 7760 vdc->vdisk_bsize = blk_size; 7761 vdc->vdisk_size = dsk_size; 7762 7763 stp = (vd_err_stats_t *)vdc->err_stats->ks_data; 7764 stp->vd_capacity.value.ui64 = dsk_size * blk_size; 7765 7766 vdc->minfo->dki_capacity = dsk_size; 7767 vdc->minfo->dki_lbsize = (uint_t)blk_size; 7768 } 7769 7770 /* 7771 * Update information about the VIO block size. The VIO block size is the 7772 * same as the vdisk block size which is stored in vdc->vdisk_bsize so we 7773 * do not store that information again. 7774 * 7775 * However, buf structures will always use a logical block size of 512 bytes 7776 * (DEV_BSIZE) and we will need to convert logical block numbers to VIO block 7777 * numbers for each read or write operation using vdc_strategy(). To speed up 7778 * this conversion, we expect the VIO block size to be a power of 2 and a 7779 * multiple of 512 bytes (DEV_BSIZE), and we cache some useful information. 7780 * 7781 * The function returns EINVAL if the new VIO block size (blk_size) is not a 7782 * power of 2 or not a multiple of 512 bytes, otherwise it returns 0.
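 *
 * For example, with a VIO block size of 4096 bytes this function
 * computes ratio = 8, vio_bshift = 3 and vio_bmask = 7. A 512-byte
 * logical block number lbn then maps to VIO block (lbn >> vio_bshift),
 * and (lbn & vio_bmask) is the 512-byte offset of the logical block
 * within that VIO block.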
7783 */ 7784 static int 7785 vdc_update_vio_bsize(vdc_t *vdc, uint32_t blk_size) 7786 { 7787 uint32_t ratio, n; 7788 int nshift = 0; 7789 7790 vdc->vio_bmask = 0; 7791 vdc->vio_bshift = 0; 7792 7793 ASSERT(blk_size > 0); 7794 7795 if ((blk_size % DEV_BSIZE) != 0) 7796 return (EINVAL); 7797 7798 ratio = blk_size / DEV_BSIZE; 7799 7800 for (n = ratio; n > 1; n >>= 1) { 7801 if ((n & 0x1) != 0) { 7802 /* blk_size is not a power of 2 */ 7803 return (EINVAL); 7804 } 7805 nshift++; 7806 } 7807 7808 vdc->vio_bshift = nshift; 7809 vdc->vio_bmask = ratio - 1; 7810 7811 return (0); 7812 } 7813 7814 /* 7815 * Function: 7816 * vdc_validate_geometry 7817 * 7818 * Description: 7819 * This routine discovers the label and geometry of the disk. It stores 7820 * the disk label and related information in the vdc structure. If it 7821 * fails to validate the geometry or to discover the disk label then 7822 * the label is marked as unknown (VD_DISK_LABEL_UNK). 7823 * 7824 * Arguments: 7825 * vdc - soft state pointer for this instance of the device driver. 7826 * 7827 * Return Code: 7828 * 0 - success. 7829 * EINVAL - unknown disk label. 7830 * ENOTSUP - geometry not applicable (EFI label). 7831 * EIO - error accessing the disk. 7832 */ 7833 static int 7834 vdc_validate_geometry(vdc_t *vdc) 7835 { 7836 buf_t *buf; /* BREAD requests need to be in a buf_t structure */ 7837 dev_t dev; 7838 int rv, rval; 7839 struct dk_label *label; 7840 struct dk_geom geom; 7841 struct extvtoc vtoc; 7842 efi_gpt_t *gpt; 7843 efi_gpe_t *gpe; 7844 vd_efi_dev_t edev; 7845 7846 ASSERT(vdc != NULL); 7847 ASSERT(vdc->vtoc != NULL && vdc->geom != NULL); 7848 ASSERT(MUTEX_HELD(&vdc->lock)); 7849 7850 mutex_exit(&vdc->lock); 7851 /* 7852 * Check the disk capacity in case it has changed. If that fails then 7853 * we proceed using the disk size we currently have. 7854 */ 7855 (void) vdc_check_capacity(vdc); 7856 dev = makedevice(ddi_driver_major(vdc->dip), 7857 VD_MAKE_DEV(vdc->instance, 0)); 7858 7859 rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval); 7860 if (rv == 0) 7861 rv = vd_process_ioctl(dev, DKIOCGEXTVTOC, (caddr_t)&vtoc, 7862 FKIOCTL, &rval); 7863 7864 if (rv == ENOTSUP) { 7865 /* 7866 * If the device does not support VTOC then we try 7867 * to read an EFI label. 7868 * 7869 * We need to know the block size and the disk size to 7870 * be able to read an EFI label. 7871 */ 7872 if (vdc->vdisk_size == 0) { 7873 mutex_enter(&vdc->lock); 7874 vdc_store_label_unk(vdc); 7875 return (EIO); 7876 } 7877 7878 VDC_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 7879 7880 rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe); 7881 7882 if (rv) { 7883 DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)", 7884 vdc->instance, rv); 7885 mutex_enter(&vdc->lock); 7886 vdc_store_label_unk(vdc); 7887 return (EIO); 7888 } 7889 7890 mutex_enter(&vdc->lock); 7891 vdc_store_label_efi(vdc, gpt, gpe); 7892 vd_efi_free(&edev, gpt, gpe); 7893 return (ENOTSUP); 7894 } 7895 7896 if (rv != 0) { 7897 DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)", 7898 vdc->instance, rv); 7899 mutex_enter(&vdc->lock); 7900 vdc_store_label_unk(vdc); 7901 if (rv != EINVAL) 7902 rv = EIO; 7903 return (rv); 7904 } 7905 7906 /* check that geometry and vtoc are valid */ 7907 if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 || 7908 vtoc.v_sanity != VTOC_SANE) { 7909 mutex_enter(&vdc->lock); 7910 vdc_store_label_unk(vdc); 7911 return (EINVAL); 7912 } 7913 7914 /* 7915 * We have a disk and a valid VTOC.
However this does not mean 7916 * that the disk currently has a VTOC label. The returned VTOC may 7917 * be a default VTOC to be used for configuring the disk (this is 7918 * what is done for disk images). So we read the label from the 7919 * beginning of the disk to ensure we really have a VTOC label. 7920 * 7921 * FUTURE: This could be the default way for reading the VTOC 7922 * from the disk as opposed to sending the VD_OP_GET_VTOC 7923 * to the server. This will be the default if vdc is implemented 7924 * on top of cmlb. 7925 */ 7926 7927 /* 7928 * A single-slice disk does not support reads using an absolute disk 7929 * offset, so we just rely on the DKIOCGVTOC ioctl in that case. 7930 */ 7931 if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) { 7932 mutex_enter(&vdc->lock); 7933 if (vtoc.v_nparts != 1) { 7934 vdc_store_label_unk(vdc); 7935 return (EINVAL); 7936 } 7937 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7938 return (0); 7939 } 7940 7941 if (vtoc.v_nparts != V_NUMPAR) { 7942 mutex_enter(&vdc->lock); 7943 vdc_store_label_unk(vdc); 7944 return (EINVAL); 7945 } 7946 7947 /* 7948 * Most CD/DVDs do not have a disk label and the label is 7949 * generated by the disk driver, so the on-disk label check 7950 * below may fail. We return now to avoid this problem. 7951 */ 7952 if (vdc->vdisk_media == VD_MEDIA_CD || 7953 vdc->vdisk_media == VD_MEDIA_DVD) { 7954 mutex_enter(&vdc->lock); 7955 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7956 return (0); 7957 } 7958 7959 /* 7960 * Read disk label from start of disk 7961 */ 7962 label = kmem_alloc(vdc->vdisk_bsize, KM_SLEEP); 7963 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 7964 bioinit(buf); 7965 buf->b_un.b_addr = (caddr_t)label; 7966 buf->b_bcount = vdc->vdisk_bsize; 7967 buf->b_flags = B_BUSY | B_READ; 7968 buf->b_dev = cmpdev(dev); 7969 rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)label, 7970 vdc->vdisk_bsize, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir); 7971 if (rv) { 7972 DMSG(vdc, 1, "[%d] Failed to read disk block 0\n", 7973 vdc->instance); 7974 } else if (ddi_in_panic()) { 7975 rv = vdc_drain_response(vdc, CB_STRATEGY, buf); 7976 if (rv == 0) { 7977 rv = geterror(buf); 7978 } 7979 } else { 7980 rv = biowait(buf); 7981 } 7982 biofini(buf); 7983 kmem_free(buf, sizeof (buf_t)); 7984 7985 if (rv != 0 || label->dkl_magic != DKL_MAGIC || 7986 label->dkl_cksum != vdc_lbl2cksum(label)) { 7987 DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n", 7988 vdc->instance); 7989 kmem_free(label, vdc->vdisk_bsize); 7990 mutex_enter(&vdc->lock); 7991 vdc_store_label_unk(vdc); 7992 return (EINVAL); 7993 } 7994 7995 kmem_free(label, vdc->vdisk_bsize); 7996 mutex_enter(&vdc->lock); 7997 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7998 return (0); 7999 } 8000 8001 /* 8002 * Function: 8003 * vdc_validate 8004 * 8005 * Description: 8006 * This routine discovers the label of the disk and creates the 8007 * appropriate device nodes if the label has changed. 8008 * 8009 * Arguments: 8010 * vdc - soft state pointer for this instance of the device driver. 8011 * 8012 * Return Code: 8013 * none.
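 *
 * For example, vdc_set_vtoc_convert() and vdc_set_efi_convert() call
 * this routine after a label has been written, so that the cached
 * geometry, the slice table and the minor nodes all match the new
 * label.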
8014 */ 8015 static void 8016 vdc_validate(vdc_t *vdc) 8017 { 8018 vd_disk_label_t old_label; 8019 vd_slice_t old_slice[V_NUMPAR]; 8020 int rv; 8021 8022 ASSERT(!MUTEX_HELD(&vdc->lock)); 8023 8024 mutex_enter(&vdc->lock); 8025 8026 /* save the current label and vtoc */ 8027 old_label = vdc->vdisk_label; 8028 bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR); 8029 8030 /* check the geometry */ 8031 (void) vdc_validate_geometry(vdc); 8032 8033 /* if the disk label has changed, update device nodes */ 8034 if (vdc->vdisk_label != old_label) { 8035 8036 if (vdc->vdisk_label == VD_DISK_LABEL_EFI) 8037 rv = vdc_create_device_nodes_efi(vdc); 8038 else 8039 rv = vdc_create_device_nodes_vtoc(vdc); 8040 8041 if (rv != 0) { 8042 DMSG(vdc, 0, "![%d] Failed to update device nodes", 8043 vdc->instance); 8044 } 8045 } 8046 8047 mutex_exit(&vdc->lock); 8048 } 8049 8050 static void 8051 vdc_validate_task(void *arg) 8052 { 8053 vdc_t *vdc = (vdc_t *)arg; 8054 8055 vdc_validate(vdc); 8056 8057 mutex_enter(&vdc->lock); 8058 ASSERT(vdc->validate_pending > 0); 8059 vdc->validate_pending--; 8060 mutex_exit(&vdc->lock); 8061 } 8062 8063 /* 8064 * Function: 8065 * vdc_setup_devid() 8066 * 8067 * Description: 8068 * This routine discovers the devid of a vDisk. It requests the devid of 8069 * the underlying device from the vDisk server, builds an encapsulated 8070 * devid based on the retrieved devid and registers that new devid to 8071 * the vDisk. 8072 * 8073 * Arguments: 8074 * vdc - soft state pointer for this instance of the device driver. 8075 * 8076 * Return Code: 8077 * 0 - A devid was successfully registered for the vDisk 8078 */ 8079 static int 8080 vdc_setup_devid(vdc_t *vdc) 8081 { 8082 int rv; 8083 vd_devid_t *vd_devid; 8084 size_t bufsize, bufid_len; 8085 8086 /* 8087 * We don't know in advance the size of the devid that the server 8088 * will return; this size will be encoded into the reply. So we do 8089 * a first request using a default size, then we check whether this 8090 * size was large enough. If not, we do a second request with the 8091 * correct size returned by the server. Note that ldc requires size 8092 * to be 8-byte aligned. 8093 */ 8094 bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN), 8095 sizeof (uint64_t)); 8096 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 8097 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 8098 8099 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid, 8100 bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 8101 8102 DMSG(vdc, 2, "sync_op returned %d\n", rv); 8103 8104 if (rv) { 8105 kmem_free(vd_devid, bufsize); 8106 return (rv); 8107 } 8108 8109 if (vd_devid->length > bufid_len) { 8110 /* 8111 * The returned devid is larger than the buffer used. Try again 8112 * with a buffer of the right size. Note that we must read the 8113 * required length before freeing the old buffer. */ size_t new_bufsize = P2ROUNDUP(VD_DEVID_SIZE(vd_devid->length), sizeof (uint64_t)); 8114 kmem_free(vd_devid, bufsize); 8115 bufsize = new_bufsize; 8116 8117 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 8118 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 8119 8120 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, 8121 (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0, 8122 VIO_both_dir, B_TRUE); 8123 8124 if (rv) { 8125 kmem_free(vd_devid, bufsize); 8126 return (rv); 8127 } 8128 } 8129 8130 /* 8131 * The virtual disk should have the same device id as the one associated 8132 * with the physical disk it is mapped on, otherwise sharing a disk 8133 * between an LDom and a non-LDom may not work (for example for a shared 8134 * SVM disk set).
8135 * 8136 * The DDI framework does not allow creating a device id with any 8137 * type so we first create a device id of type DEVID_ENCAP and then 8138 * we restore the original type of the physical device. 8139 */ 8140 8141 DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length); 8142 8143 /* build an encapsulated devid based on the returned devid */ 8144 if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length, 8145 vd_devid->id, &vdc->devid) != DDI_SUCCESS) { 8146 DMSG(vdc, 1, "[%d] Failed to create devid\n", vdc->instance); 8147 kmem_free(vd_devid, bufsize); 8148 return (1); 8149 } 8150 8151 DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type); 8152 8153 ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS); 8154 8155 kmem_free(vd_devid, bufsize); 8156 8157 if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) { 8158 DMSG(vdc, 1, "[%d] Failed to register devid\n", vdc->instance); 8159 return (1); 8160 } 8161 8162 return (0); 8163 } 8164 8165 static void 8166 vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe) 8167 { 8168 int i, nparts; 8169 8170 ASSERT(MUTEX_HELD(&vdc->lock)); 8171 8172 vdc->vdisk_label = VD_DISK_LABEL_EFI; 8173 bzero(vdc->vtoc, sizeof (struct extvtoc)); 8174 bzero(vdc->geom, sizeof (struct dk_geom)); 8175 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 8176 8177 nparts = gpt->efi_gpt_NumberOfPartitionEntries; 8178 8179 for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) { 8180 8181 if (gpe[i].efi_gpe_StartingLBA == 0 && 8182 gpe[i].efi_gpe_EndingLBA == 0) { 8183 continue; 8184 } 8185 8186 vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA; 8187 vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA - 8188 gpe[i].efi_gpe_StartingLBA + 1; 8189 } 8190 8191 ASSERT(vdc->vdisk_size != 0); 8192 vdc->slice[VD_EFI_WD_SLICE].start = 0; 8193 vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size; 8194 8195 } 8196 8197 static void 8198 vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct extvtoc *vtoc) 8199 { 8200 int i; 8201 8202 ASSERT(MUTEX_HELD(&vdc->lock)); 8203 ASSERT(vdc->vdisk_bsize == vtoc->v_sectorsz); 8204 8205 vdc->vdisk_label = VD_DISK_LABEL_VTOC; 8206 bcopy(vtoc, vdc->vtoc, sizeof (struct extvtoc)); 8207 bcopy(geom, vdc->geom, sizeof (struct dk_geom)); 8208 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 8209 8210 for (i = 0; i < vtoc->v_nparts; i++) { 8211 vdc->slice[i].start = vtoc->v_part[i].p_start; 8212 vdc->slice[i].nblocks = vtoc->v_part[i].p_size; 8213 } 8214 } 8215 8216 static void 8217 vdc_store_label_unk(vdc_t *vdc) 8218 { 8219 ASSERT(MUTEX_HELD(&vdc->lock)); 8220 8221 vdc->vdisk_label = VD_DISK_LABEL_UNK; 8222 bzero(vdc->vtoc, sizeof (struct extvtoc)); 8223 bzero(vdc->geom, sizeof (struct dk_geom)); 8224 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 8225 } 8226