/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strlog.h>
#include <sys/strsubr.h>
#include <sys/cmn_err.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/kstat.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <net/if.h>
#include <sys/varargs.h>
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/mac.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
#include <sys/vsw.h>
#include <sys/vio_mailbox.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/vio_util.h>
#include <sys/sdt.h>

/*
 * Function prototypes.
 */
static int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
static int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
static int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static void vsw_get_md_properties(vsw_t *vswp);
static int vsw_get_physaddr(vsw_t *);
static int vsw_setup_layer2(vsw_t *);
static int vsw_setup_layer3(vsw_t *);

/* MAC layer routines */
static int vsw_mac_attach(vsw_t *vswp);
static void vsw_mac_detach(vsw_t *vswp);
static int vsw_get_hw_maddr(vsw_t *);
static int vsw_set_hw(vsw_t *, vsw_port_t *);
static int vsw_set_hw_promisc(vsw_t *, vsw_port_t *);
static int vsw_unset_hw(vsw_t *, vsw_port_t *);
static int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *);
static int vsw_reconfig_hw(vsw_t *);
static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
static int vsw_mac_register(vsw_t *);
static int vsw_mac_unregister(vsw_t *);
static int vsw_m_stat(void *, uint_t, uint64_t *);
static void vsw_m_stop(void *arg);
static int vsw_m_start(void *arg);
static int vsw_m_unicst(void *arg, const uint8_t *);
static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
static int vsw_m_promisc(void *arg, boolean_t);
static mblk_t *vsw_m_tx(void *arg, mblk_t *);

/* MDEG routines */
static void vsw_mdeg_register(vsw_t *vswp);
static void vsw_mdeg_unregister(vsw_t *vswp);
static int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);

/* Port add/deletion routines */
static int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
static int vsw_port_attach(vsw_t *vswp, int p_instance,
    uint64_t *ldcids, int nids, struct ether_addr *macaddr);
static int vsw_detach_ports(vsw_t *vswp);
static int vsw_port_detach(vsw_t *vswp, int p_instance);
static int vsw_port_delete(vsw_port_t *port);
static int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
static int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
static int vsw_init_ldcs(vsw_port_t *port);
static int vsw_uninit_ldcs(vsw_port_t *port);
static int vsw_ldc_init(vsw_ldc_t *ldcp);
static int vsw_ldc_uninit(vsw_ldc_t *ldcp);
static int vsw_drain_ldcs(vsw_port_t *port);
static int vsw_drain_port_taskq(vsw_port_t *port);
static void vsw_marker_task(void *);
static vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
static int vsw_plist_del_node(vsw_t *, vsw_port_t *port);

/* Interrupt routines */
static uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);

/* Handshake routines */
static void vsw_restart_handshake(vsw_ldc_t *);
static int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
static void vsw_next_milestone(vsw_ldc_t *);
static int vsw_supported_version(vio_ver_msg_t *);

/* Data processing routines */
static void vsw_process_pkt(void *);
static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_ctrl_pkt(void *);
static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);

/* Switching/data transmit routines */
static void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port, mac_resource_handle_t);
static void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port, mac_resource_handle_t);
static int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port);
static int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port);
static int vsw_portsend(vsw_port_t *, mblk_t *);
static int vsw_dringsend(vsw_ldc_t *, mblk_t *);
static int vsw_descrsend(vsw_ldc_t *, mblk_t *);

/* Packet creation routines */
static void vsw_send_ver(vsw_ldc_t *);
static void vsw_send_attr(vsw_ldc_t *);
static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
static void vsw_send_dring_info(vsw_ldc_t *);
static void vsw_send_rdx(vsw_ldc_t *);

static void vsw_send_msg(vsw_ldc_t *, void *, int);

/* Forwarding database (FDB) routines */
static int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
static int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
static vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
static int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
static int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
static int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
static void vsw_del_addr(uint8_t, void *, uint64_t);
static void vsw_del_mcst_port(vsw_port_t *);
static void vsw_del_mcst_vsw(vsw_t *);

/* Dring routines */
static dring_info_t *vsw_create_dring(vsw_ldc_t *);
static void vsw_create_privring(vsw_ldc_t *);
static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
    int *);
static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);

static void vsw_set_lane_attr(vsw_t *, lane_t *);
static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
static int vsw_check_dring_info(vio_dring_reg_msg_t *);

/* Misc support routines */
static caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
static int vsw_free_ring(dring_info_t *);

/* Debugging routines */
static void dump_flags(uint64_t);
static void display_state(void);
static void display_lane(lane_t *);
static void display_ring(dring_info_t *);

int vsw_num_handshakes = 3;	/* # of handshake attempts */
int vsw_wretries = 100;		/* # of write attempts */
int vsw_chain_len = 150;	/* max # of mblks in msg chain */
int vsw_desc_delay = 0;		/* delay in us */
int vsw_read_attempts = 5;	/* # of reads of descriptor */

uint32_t vsw_mblk_size = VSW_MBLK_SIZE;
uint32_t vsw_num_mblks = VSW_NUM_MBLKS;

/*
 * mode specific frame switching function
 */
void (*vsw_switch_frame)(vsw_t *, mblk_t *, int, vsw_port_t *,
    mac_resource_handle_t);

static mac_callbacks_t vsw_m_callbacks = {
    0,
    vsw_m_stat,
    vsw_m_start,
    vsw_m_stop,
    vsw_m_promisc,
    vsw_m_multicst,
    vsw_m_unicst,
    vsw_m_tx,
    NULL,
    NULL,
    NULL
};
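/*
 * A note on the initializer above: the leading 0 is most likely the
 * mc_callbacks flags word (no optional callbacks advertised), and the
 * three trailing NULLs the optional resource, ioctl and capability
 * entry points that vsw does not implement.  The exact field names
 * depend on the mac_callbacks_t revision this driver is built against.
 */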
static struct cb_ops vsw_cb_ops = {
    nulldev,			/* cb_open */
    nulldev,			/* cb_close */
    nodev,			/* cb_strategy */
    nodev,			/* cb_print */
    nodev,			/* cb_dump */
    nodev,			/* cb_read */
    nodev,			/* cb_write */
    nodev,			/* cb_ioctl */
    nodev,			/* cb_devmap */
    nodev,			/* cb_mmap */
    nodev,			/* cb_segmap */
    nochpoll,			/* cb_chpoll */
    ddi_prop_op,		/* cb_prop_op */
    NULL,			/* cb_stream */
    D_MP,			/* cb_flag */
    CB_REV,			/* rev */
    nodev,			/* int (*cb_aread)() */
    nodev			/* int (*cb_awrite)() */
};

static struct dev_ops vsw_ops = {
    DEVO_REV,			/* devo_rev */
    0,				/* devo_refcnt */
    vsw_getinfo,		/* devo_getinfo */
    nulldev,			/* devo_identify */
    nulldev,			/* devo_probe */
    vsw_attach,			/* devo_attach */
    vsw_detach,			/* devo_detach */
    nodev,			/* devo_reset */
    &vsw_cb_ops,		/* devo_cb_ops */
    (struct bus_ops *)NULL,	/* devo_bus_ops */
    ddi_power			/* devo_power */
};

extern struct mod_ops mod_driverops;
static struct modldrv vswmodldrv = {
    &mod_driverops,
    "sun4v Virtual Switch Driver %I%",
    &vsw_ops,
};

/*
 * The callback lock (ldc_cblock) is always taken before the transmit
 * lock (ldc_txlock), and the two are dropped in the reverse order.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
    mutex_enter(&((ldcp)->ldc_cblock));\
    mutex_enter(&((ldcp)->ldc_txlock));
#define	LDC_EXIT_LOCK(ldcp)	\
    mutex_exit(&((ldcp)->ldc_txlock));\
    mutex_exit(&((ldcp)->ldc_cblock));

/* Driver soft state ptr */
static void *vsw_state;

/*
 * Linked list of "vsw_t" structures - one per instance.
 */
vsw_t *vsw_head = NULL;
krwlock_t vsw_rw;

/*
 * Property names
 */
static char vdev_propname[] = "virtual-device";
static char vsw_propname[] = "virtual-network-switch";
static char physdev_propname[] = "vsw-phys-dev";
static char smode_propname[] = "vsw-switch-mode";
static char macaddr_propname[] = "local-mac-address";
static char remaddr_propname[] = "remote-mac-address";
static char ldcids_propname[] = "ldc-ids";
static char chan_propname[] = "channel-endpoint";
static char id_propname[] = "id";
static char reg_propname[] = "reg";

/* supported versions */
static ver_sup_t vsw_versions[] = { {1, 0} };

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t vport_prop_match[] = {
    { MDET_PROP_VAL,	"id" },
    { MDET_LIST_END,	NULL }
};

static mdeg_node_match_t vport_match = { "virtual-device-port",
    vport_prop_match };

/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vsw instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t vsw_prop_template[] = {
    { MDET_PROP_STR,	"name",		vsw_propname },
    { MDET_PROP_VAL,	"cfg-handle",	NULL },
    { MDET_LIST_END,	NULL,		NULL }
};

#define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);
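/*
 * For example, vsw_mdeg_register() below copies vsw_prop_template into
 * a per-instance array and then uses VSW_SET_MDEG_PROP_INST to patch
 * the 'cfg-handle' entry (index 1) with the instance's 'reg' property
 * value, so the MDEG only delivers port nodes belonging to this vsw.
 */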
/*
 * Print debug messages - set to 0x1f to enable all msgs
 * or 0x0 to turn all off.
 */
int vswdbg = 0x0;

/*
 * debug levels:
 * 0x01:	Function entry/exit tracing
 * 0x02:	Internal function messages
 * 0x04:	Verbose internal messages
 * 0x08:	Warning messages
 * 0x10:	Error messages
 */

static void
vswdebug(vsw_t *vswp, const char *fmt, ...)
{
    char buf[512];
    va_list ap;

    va_start(ap, fmt);
    (void) vsprintf(buf, fmt, ap);
    va_end(ap);

    if (vswp == NULL)
        cmn_err(CE_CONT, "%s\n", buf);
    else
        cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
}

/*
 * For the moment the state dump routines have their own
 * private flag.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

#define	DUMP_TAG(tag) \
{ \
    D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
    D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype); \
    D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env); \
}

#define	DUMP_TAG_PTR(tag) \
{ \
    D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
    D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype); \
    D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env); \
}

#define	DUMP_FLAGS(flags)	dump_flags(flags);
#define	DISPLAY_STATE()		display_state()

#else

#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */

#ifdef DEBUG

#define	D1		\
if (vswdbg & 0x01)	\
	vswdebug

#define	D2		\
if (vswdbg & 0x02)	\
	vswdebug

#define	D3		\
if (vswdbg & 0x04)	\
	vswdebug

#define	DWARN		\
if (vswdbg & 0x08)	\
	vswdebug

#define	DERR		\
if (vswdbg & 0x10)	\
	vswdebug

#else

#define	DERR	if (0)	vswdebug
#define	DWARN	if (0)	vswdebug
#define	D1	if (0)	vswdebug
#define	D2	if (0)	vswdebug
#define	D3	if (0)	vswdebug

#endif	/* DEBUG */

static struct modlinkage modlinkage = {
    MODREV_1,
    &vswmodldrv,
    NULL
};

int
_init(void)
{
    int status;

    rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);

    status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
    if (status != 0) {
        return (status);
    }

    mac_init_ops(&vsw_ops, "vsw");
    status = mod_install(&modlinkage);
    if (status != 0) {
        ddi_soft_state_fini(&vsw_state);
    }
    return (status);
}

int
_fini(void)
{
    int status;

    status = mod_remove(&modlinkage);
    if (status != 0)
        return (status);
    mac_fini_ops(&vsw_ops);
    ddi_soft_state_fini(&vsw_state);

    rw_destroy(&vsw_rw);

    return (status);
}

int
_info(struct modinfo *modinfop)
{
    return (mod_info(&modlinkage, modinfop));
}

static int
vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
    vsw_t	*vswp;
    int		instance, i;
    char	hashname[MAXNAMELEN];
    char	qname[TASKQ_NAMELEN];
    int		rv = 1;
    enum	{ PROG_init = 0x0, PROG_if_lock = 0x1,
		    PROG_fdb = 0x2, PROG_mfdb = 0x4,
		    PROG_report_dev = 0x8, PROG_plist = 0x10,
		    PROG_taskq = 0x20}
		progress;

    progress = PROG_init;

    switch (cmd) {
    case DDI_ATTACH:
        break;
    case DDI_RESUME:
        /* nothing to do for this non-device */
        return (DDI_SUCCESS);
    case DDI_PM_RESUME:
    default:
        return (DDI_FAILURE);
    }

    instance = ddi_get_instance(dip);
    if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
        DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
        return (DDI_FAILURE);
    }
    vswp = ddi_get_soft_state(vsw_state, instance);

    if (vswp == NULL) {
        DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
        goto vsw_attach_fail;
    }

    vswp->dip = dip;
    vswp->instance = instance;
    ddi_set_driver_private(dip, (caddr_t)vswp);

    rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);

    progress |= PROG_if_lock;

    /*
     * Get the various properties such as physical device name
     * (vsw-phys-dev), switch mode etc from the MD.
     */
    vsw_get_md_properties(vswp);

    /* setup the unicast forwarding database */
    (void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
        vswp->instance);
    D2(vswp, "creating unicast hash table (%s)...", hashname);
    vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
        mod_hash_null_valdtor, sizeof (void *));

    progress |= PROG_fdb;

    /* setup the multicast forwarding database */
    (void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
        vswp->instance);
    D2(vswp, "creating multicast hash table (%s)...", hashname);
    rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
    vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
        mod_hash_null_valdtor, sizeof (void *));

    progress |= PROG_mfdb;

    /*
     * create lock protecting list of multicast addresses
     * which could come via m_multicst() entry point when plumbed.
     */
    mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
    vswp->mcap = NULL;

    ddi_report_dev(vswp->dip);

    progress |= PROG_report_dev;

    WRITE_ENTER(&vsw_rw);
    vswp->next = vsw_head;
    vsw_head = vswp;
    RW_EXIT(&vsw_rw);

    /* setup the port list */
    rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
    vswp->plist.head = NULL;

    progress |= PROG_plist;

    /*
     * Create the taskq which will process all the VIO
     * control messages.
     */
    (void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
    if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
        TASKQ_DEFAULTPRI, 0)) == NULL) {
        cmn_err(CE_WARN, "Unable to create task queue");
        goto vsw_attach_fail;
    }

    progress |= PROG_taskq;

    /* select best switching mode */
    for (i = 0; i < vswp->smode_num; i++) {
        vswp->smode_idx = i;
        switch (vswp->smode[i]) {
        case VSW_LAYER2:
        case VSW_LAYER2_PROMISC:
            rv = vsw_setup_layer2(vswp);
            break;

        case VSW_LAYER3:
            rv = vsw_setup_layer3(vswp);
            break;

        default:
            DERR(vswp, "unknown switch mode");
            rv = 1;
            break;
        }

        if (rv == 0)
            break;
    }

    if (rv == 1) {
        cmn_err(CE_WARN, "Unable to setup switching mode");
        goto vsw_attach_fail;
    }

    D2(vswp, "Operating in mode %d", vswp->smode[vswp->smode_idx]);

    /*
     * Register with the MAC layer as a network device so
     * we can be plumbed if desired.
     *
     * Do this in both layer 2 and layer 3 mode.
     */
    vswp->if_state &= ~VSW_IF_UP;
    if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
        if (vsw_mac_register(vswp) != 0) {
            cmn_err(CE_WARN, "Unable to register as provider "
                "with MAC layer, continuing with attach");
        }
    }

    /* prevent auto-detaching */
    if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
        DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
        cmn_err(CE_NOTE, "Unable to set \"%s\" property for "
            "instance %u", DDI_NO_AUTODETACH, instance);
    }

    /*
     * Now we have everything setup, register for MD change
     * events.
     */
    vsw_mdeg_register(vswp);

    return (DDI_SUCCESS);

vsw_attach_fail:
    DERR(NULL, "vsw_attach: failed");

    if (progress & PROG_taskq)
        ddi_taskq_destroy(vswp->taskq_p);

    if (progress & PROG_plist)
        rw_destroy(&vswp->plist.lockrw);

    if (progress & PROG_report_dev) {
        ddi_remove_minor_node(dip, NULL);
        mutex_destroy(&vswp->mca_lock);
    }

    if (progress & PROG_mfdb) {
        mod_hash_destroy_hash(vswp->mfdb);
        vswp->mfdb = NULL;
        rw_destroy(&vswp->mfdbrw);
    }

    if (progress & PROG_fdb) {
        mod_hash_destroy_hash(vswp->fdb);
        vswp->fdb = NULL;
    }

    if (progress & PROG_if_lock)
        rw_destroy(&vswp->if_lockrw);

    ddi_soft_state_free(vsw_state, instance);
    return (DDI_FAILURE);
}

static int
vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
    vio_mblk_pool_t	*poolp, *npoolp;
    vsw_t		**vswpp, *vswp;
    int 		instance;

    instance = ddi_get_instance(dip);
    vswp = ddi_get_soft_state(vsw_state, instance);

    if (vswp == NULL) {
        return (DDI_FAILURE);
    }

    switch (cmd) {
    case DDI_DETACH:
        break;
    case DDI_SUSPEND:
    case DDI_PM_SUSPEND:
    default:
        return (DDI_FAILURE);
    }

    D2(vswp, "detaching instance %d", instance);

    if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
        if (vsw_mac_unregister(vswp) != 0) {
            cmn_err(CE_WARN, "Unable to detach from MAC layer");
            return (DDI_FAILURE);
        }
        rw_destroy(&vswp->if_lockrw);
    }

    vsw_mdeg_unregister(vswp);

    /* remove mac layer callback */
    if ((vswp->mh != NULL) && (vswp->mrh != NULL)) {
        mac_rx_remove(vswp->mh, vswp->mrh);
        vswp->mrh = NULL;
    }

    if (vsw_detach_ports(vswp) != 0) {
        cmn_err(CE_WARN, "Unable to detach ports");
        return (DDI_FAILURE);
    }

    /*
     * Now that the ports have been deleted, stop and close
     * the physical device.
     */
    if (vswp->mh != NULL) {
        mac_stop(vswp->mh);
        mac_close(vswp->mh);

        vswp->mh = NULL;
        vswp->txinfo = NULL;
    }

    /*
     * Destroy any free pools that may still exist.
     */
    poolp = vswp->rxh;
    while (poolp != NULL) {
        npoolp = vswp->rxh = poolp->nextp;
        if (vio_destroy_mblks(poolp) != 0) {
            vswp->rxh = poolp;
            return (DDI_FAILURE);
        }
        poolp = npoolp;
    }

    /*
     * Remove this instance from any entries it may be on in
     * the hash table by using the list of addresses maintained
     * in the vsw_t structure.
     */
    vsw_del_mcst_vsw(vswp);

    vswp->mcap = NULL;
    mutex_destroy(&vswp->mca_lock);

    /*
     * By now any pending tasks have finished and the underlying
     * ldc's have been destroyed, so it's safe to delete the control
     * message taskq.
     */
    if (vswp->taskq_p != NULL)
        ddi_taskq_destroy(vswp->taskq_p);

    /*
     * At this stage all the data pointers in the hash table
     * should be NULL, as all the ports have been removed and will
     * have deleted themselves from the port lists which the data
     * pointers point to. Hence we can destroy the table using the
     * default destructors.
     */
    D2(vswp, "vsw_detach: destroying hash tables..");
    mod_hash_destroy_hash(vswp->fdb);
    vswp->fdb = NULL;

    WRITE_ENTER(&vswp->mfdbrw);
    mod_hash_destroy_hash(vswp->mfdb);
    vswp->mfdb = NULL;
    RW_EXIT(&vswp->mfdbrw);
    rw_destroy(&vswp->mfdbrw);

    ddi_remove_minor_node(dip, NULL);

    rw_destroy(&vswp->plist.lockrw);
    WRITE_ENTER(&vsw_rw);
    for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
        if (*vswpp == vswp) {
            *vswpp = vswp->next;
            break;
        }
    }
    RW_EXIT(&vsw_rw);
    ddi_soft_state_free(vsw_state, instance);

    return (DDI_SUCCESS);
}

static int
vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
    _NOTE(ARGUNUSED(dip))

    vsw_t	*vswp = NULL;
    dev_t	dev = (dev_t)arg;
    int		instance;

    instance = getminor(dev);

    switch (infocmd) {
    case DDI_INFO_DEVT2DEVINFO:
        if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
            *result = NULL;
            return (DDI_FAILURE);
        }
        *result = vswp->dip;
        return (DDI_SUCCESS);

    case DDI_INFO_DEVT2INSTANCE:
        *result = (void *)(uintptr_t)instance;
        return (DDI_SUCCESS);

    default:
        *result = NULL;
        return (DDI_FAILURE);
    }
}

/*
 * Get the properties from our MD node.
 */
static void
vsw_get_md_properties(vsw_t *vswp)
{
    md_t		*mdp = NULL;
    int			num_nodes = 0;
    int			len = 0, listsz = 0;
    int			num_vdev = 0;
    int			i, idx;
    boolean_t		found_node = B_FALSE;
    char		*smode = NULL;
    char		*curr_mode = NULL;
    char		*physname = NULL;
    char		*node_name = NULL;
    char		*dev;
    uint64_t 		macaddr = 0;
    uint64_t		md_inst, obp_inst;
    mde_cookie_t	*listp = NULL;
    mde_cookie_t	rootnode;

    D1(vswp, "%s: enter", __func__);

    /*
     * Further down we compare the obp 'reg' property to the
     * 'cfg-handle' property in the vsw MD node to determine
     * if the node refers to this particular instance. So if
     * we can't read the obp value then there is no point
     * in proceeding further.
     */
    if (ddi_prop_exists(DDI_DEV_T_ANY, vswp->dip,
        DDI_PROP_DONTPASS, reg_propname) != 1) {
        cmn_err(CE_WARN, "Unable to read %s property "
            "from OBP device node", reg_propname);
        return;
    }

    obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
        DDI_PROP_DONTPASS, reg_propname, 0);

    D2(vswp, "%s: obp_inst 0x%llx", __func__, obp_inst);

    if ((mdp = md_get_handle()) == NULL) {
        DERR(vswp, "%s: unable to init MD", __func__);
        return;
    }

    if ((num_nodes = md_node_count(mdp)) <= 0) {
        DERR(vswp, "%s: invalid number of nodes found %d",
            __func__, num_nodes);
        (void) md_fini_handle(mdp);
        return;
    }

    D2(vswp, "%s: %d nodes in total in MD", __func__, num_nodes);

    /* allocate enough space for node list */
    listsz = num_nodes * sizeof (mde_cookie_t);
    listp = kmem_zalloc(listsz, KM_SLEEP);

    rootnode = md_root_node(mdp);

    /* Get the list of virtual devices */
    num_vdev = md_scan_dag(mdp, rootnode,
        md_find_name(mdp, vdev_propname),
        md_find_name(mdp, "fwd"), listp);

    if (num_vdev <= 0) {
        DERR(vswp, "%s: didn't find any virtual-device nodes in MD",
            __func__);
        goto md_prop_exit;
    }

    D2(vswp, "%s: %d virtual-device nodes found", __func__, num_vdev);

    /* Look for the virtual switch nodes in the list */
    for (idx = 0; idx < num_vdev; idx++) {
        if (md_get_prop_str(mdp, listp[idx],
            "name", &node_name) != 0) {
            DERR(vswp, "%s: unable to get node name", __func__);
            continue;
        }

        if (strcmp(node_name, vsw_propname) == 0) {
            /* Virtual switch node */
            if (md_get_prop_val(mdp, listp[idx],
                "cfg-handle", &md_inst) != 0) {
                DERR(vswp, "%s: unable to get cfg-handle from"
                    " node %d", __func__, idx);
                goto md_prop_exit;
            } else if (md_inst == obp_inst) {
                D2(vswp, "%s: found matching node (%d)"
                    " 0x%llx == 0x%llx", __func__, idx,
                    md_inst, obp_inst);
                found_node = B_TRUE;
                break;
            }
        }
    }

    if (!found_node) {
        DWARN(vswp, "%s: couldn't find correct vsw node", __func__);
        goto md_prop_exit;
    }

    /*
     * Now, having found the correct node, get the various properties.
     */

    if (md_get_prop_data(mdp, listp[idx], physdev_propname,
        (uint8_t **)(&physname), &len) != 0) {
        cmn_err(CE_WARN, "%s: unable to get name(s) of physical "
            "device(s) from MD", __func__);
    } else if ((strlen(physname) + 1) > LIFNAMSIZ) {
        cmn_err(CE_WARN, "%s is too long a device name", physname);
    } else {
        (void) strncpy(vswp->physname, physname, strlen(physname) + 1);
        vswp->mdprops |= VSW_MD_PHYSNAME;
        D2(vswp, "%s: using first device specified (%s)",
            __func__, vswp->physname);
    }

#ifdef DEBUG
    /*
     * As a temporary measure to aid testing we check to see if there
     * is a vsw.conf file present. If there is we use the value of the
     * vsw_physname property in the file as the name of the physical
     * device, overriding the value from the MD.
     *
     * There may be multiple devices listed, but for the moment
     * we just use the first one.
     */
    if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
        "vsw_physname", &dev) == DDI_PROP_SUCCESS) {
        if ((strlen(dev) + 1) > LIFNAMSIZ) {
            cmn_err(CE_WARN, "%s is too long a device name", dev);
        } else {
            cmn_err(CE_NOTE, "%s: using device name (%s) from "
                "config file", __func__, dev);

            (void) strncpy(vswp->physname, dev, strlen(dev) + 1);
            vswp->mdprops |= VSW_MD_PHYSNAME;
        }

        ddi_prop_free(dev);
    }
#endif

    /* mac address for vswitch device itself */
    if (md_get_prop_val(mdp, listp[idx],
        macaddr_propname, &macaddr) != 0) {
        cmn_err(CE_WARN, "!Unable to get MAC address from MD");

        /*
         * Fallback to using the mac address of the physical
         * device.
         */
        if (vsw_get_physaddr(vswp) == 0) {
            cmn_err(CE_NOTE, "!Using MAC address from physical "
                "device (%s)", vswp->physname);
        }
    } else {
        READ_ENTER(&vswp->if_lockrw);
        for (i = ETHERADDRL - 1; i >= 0; i--) {
            vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
            macaddr >>= 8;
        }
        RW_EXIT(&vswp->if_lockrw);
        vswp->mdprops |= VSW_MD_MACADDR;
    }

    /*
     * Get the switch-mode property. The modes are listed in
     * decreasing order of preference, i.e. the preferred mode is
     * the first item in the list.
     */
    len = 0;
    vswp->smode_num = 0;
    if (md_get_prop_data(mdp, listp[idx], smode_propname,
        (uint8_t **)(&smode), &len) != 0) {
        /*
         * Unable to get switch-mode property from MD, nothing
         * more we can do.
         */
        cmn_err(CE_WARN, "!unable to get switch mode property");
        goto md_prop_exit;
    }

    curr_mode = smode;
    /*
     * Modes of operation:
     * 'switched'	 - layer 2 switching, underlying HW in
     *			programmed mode.
     * 'promiscuous' - layer 2 switching, underlying HW in
     *			promiscuous mode.
     * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
     *			in non-promiscuous mode.
     */
    while ((curr_mode < (smode + len)) && (vswp->smode_num < NUM_SMODES)) {
        D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
        if (strcmp(curr_mode, "switched") == 0) {
            vswp->smode[vswp->smode_num++] = VSW_LAYER2;
        } else if (strcmp(curr_mode, "promiscuous") == 0) {
            vswp->smode[vswp->smode_num++] = VSW_LAYER2_PROMISC;
        } else if (strcmp(curr_mode, "routed") == 0) {
            vswp->smode[vswp->smode_num++] = VSW_LAYER3;
        } else {
            cmn_err(CE_WARN, "Unknown switch mode %s, setting to"
                " default switched mode", curr_mode);
            vswp->smode[vswp->smode_num++] = VSW_LAYER2;
        }
        curr_mode += strlen(curr_mode) + 1;
    }

    D2(vswp, "%d switching modes specified", vswp->smode_num);

    if (vswp->smode_num > 0)
        vswp->mdprops |= VSW_MD_SMODE;

md_prop_exit:
    (void) md_fini_handle(mdp);

    kmem_free(listp, listsz);

    D1(vswp, "%s: exit", __func__);
}
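/*
 * For reference, the 'vsw-switch-mode' property scanned above is a
 * packed list of NUL-terminated strings, e.g. "switched\0routed\0",
 * which the loop above would translate into
 * smode[] = { VSW_LAYER2, VSW_LAYER3 } with smode_num == 2.
 */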
/*
 * Get the mac address of the physical device.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_get_physaddr(vsw_t *vswp)
{
    mac_handle_t	mh;
    char		drv[LIFNAMSIZ];
    uint_t		ddi_instance;

    D1(vswp, "%s: enter", __func__);

    if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS)
        return (1);

    if (mac_open(vswp->physname, ddi_instance, &mh) != 0) {
        cmn_err(CE_WARN, "!mac_open %s failed", vswp->physname);
        return (1);
    }

    READ_ENTER(&vswp->if_lockrw);
    mac_unicst_get(mh, vswp->if_addr.ether_addr_octet);
    RW_EXIT(&vswp->if_lockrw);

    mac_close(mh);

    vswp->mdprops |= VSW_DEV_MACADDR;

    D1(vswp, "%s: exit", __func__);

    return (0);
}

/*
 * Check to see if the card supports the setting of multiple unicst
 * addresses.
 *
 * Returns 0 if card supports the programming of multiple unicast addresses
 * and there are free address slots available, otherwise returns 1.
 */
static int
vsw_get_hw_maddr(vsw_t *vswp)
{
    D1(vswp, "%s: enter", __func__);

    if (vswp->mh == NULL) {
        return (1);
    }

    if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
        DWARN(vswp, "Unable to get capabilities of"
            " underlying device (%s)", vswp->physname);
        return (1);
    }

    if (vswp->maddr.maddr_naddrfree == 0) {
        cmn_err(CE_WARN,
            "!device %s has no free unicast address slots",
            vswp->physname);
        return (1);
    }

    D2(vswp, "%s: %d addrs : %d free", __func__,
        vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);

    D1(vswp, "%s: exit", __func__);

    return (0);
}

/*
 * Setup for layer 2 switching.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_setup_layer2(vsw_t *vswp)
{
    D1(vswp, "%s: enter", __func__);

    vsw_switch_frame = vsw_switch_l2_frame;

    /*
     * Attempt to link into the MAC layer so we can get
     * and send packets out over the physical adapter.
     */
    if (vswp->mdprops & VSW_MD_PHYSNAME) {
        if (vsw_mac_attach(vswp) != 0) {
            /*
             * Registration with the MAC layer has failed,
             * so return 1 so that we can fall back to the
             * next preferred switching method.
             */
            cmn_err(CE_WARN, "!Unable to join as MAC layer "
                "client");
            return (1);
        }

        if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
            /*
             * Verify that underlying device can support multiple
             * unicast mac addresses, and has free capacity.
             */
            if (vsw_get_hw_maddr(vswp) != 0) {
                cmn_err(CE_WARN, "!unable to setup switching");
                vsw_mac_detach(vswp);
                return (1);
            }
        }

    } else {
        /*
         * No physical device name found in MD which is
         * required for layer 2.
         */
        cmn_err(CE_WARN, "!no physical device name specified");
        return (1);
    }

    D1(vswp, "%s: exit", __func__);

    return (0);
}

static int
vsw_setup_layer3(vsw_t *vswp)
{
    D1(vswp, "%s: enter", __func__);

    D2(vswp, "%s: operating in layer 3 mode", __func__);
    vsw_switch_frame = vsw_switch_l3_frame;

    D1(vswp, "%s: exit", __func__);

    return (0);
}

/*
 * Link into the MAC layer to gain access to the services provided by
 * the underlying physical device driver (which should also have
 * registered with the MAC layer).
 *
 * Only when in layer 2 mode.
 */
static int
vsw_mac_attach(vsw_t *vswp)
{
    char	drv[LIFNAMSIZ];
    uint_t	ddi_instance;

    D1(vswp, "vsw_mac_attach: enter");

    vswp->mh = NULL;
    vswp->mrh = NULL;

    ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);

    if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
        cmn_err(CE_WARN, "invalid device name: %s", vswp->physname);
        goto mac_fail_exit;
    }
    if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) {
        cmn_err(CE_WARN, "mac_open %s failed", vswp->physname);
        goto mac_fail_exit;
    }

    D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);

    /* register our rx callback function */
    vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);

    /* get the MAC tx fn */
    vswp->txinfo = mac_tx_get(vswp->mh);

    /* start the interface */
    if (mac_start(vswp->mh) != 0) {
        cmn_err(CE_WARN, "could not start mac interface");
        goto mac_fail_exit;
    }

    D1(vswp, "vsw_mac_attach: exit");
    return (0);

mac_fail_exit:
    if (vswp->mh != NULL) {
        if (vswp->mrh != NULL)
            mac_rx_remove(vswp->mh, vswp->mrh);

        mac_close(vswp->mh);
    }

    vswp->mrh = NULL;
    vswp->mh = NULL;
    vswp->txinfo = NULL;

    D1(vswp, "vsw_mac_attach: fail exit");
    return (1);
}

static void
vsw_mac_detach(vsw_t *vswp)
{
    D1(vswp, "vsw_mac_detach: enter");

    if (vswp->mh != NULL) {
        if (vswp->mrh != NULL)
            mac_rx_remove(vswp->mh, vswp->mrh);

        mac_stop(vswp->mh);
        mac_close(vswp->mh);
    }

    vswp->mrh = NULL;
    vswp->mh = NULL;
    vswp->txinfo = NULL;

    D1(vswp, "vsw_mac_detach: exit");
}

/*
 * Set up the physical device, depending on the mode specified and on
 * the capabilities and capacity of the underlying device.
 *
 * If in layer 3 mode, then do nothing.
 *
 * If in layer 2 programmed mode attempt to program the unicast address
 * associated with the port into the physical device. If this is not
 * possible due to resource exhaustion or simply because the device does
 * not support multiple unicast addresses then, if required, fall back
 * onto putting the card into promisc mode.
 *
 * If in promisc mode then simply set the card into promisc mode.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_set_hw(vsw_t *vswp, vsw_port_t *port)
{
    mac_multi_addr_t	mac_addr;
    void		*mah;
    int			err;

    D1(vswp, "%s: enter", __func__);

    if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
        return (0);

    if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
        return (vsw_set_hw_promisc(vswp, port));
    }

    if (vswp->maddr.maddr_handle == NULL)
        return (1);

    mah = vswp->maddr.maddr_handle;

    /*
     * Attempt to program the unicast address into the HW.
     */
    mac_addr.mma_addrlen = ETHERADDRL;
    ether_copy(&port->p_macaddr, &mac_addr.mma_addr);

    err = vswp->maddr.maddr_add(mah, &mac_addr);
    if (err != 0) {
        cmn_err(CE_WARN, "!failed to program addr "
            "%x:%x:%x:%x:%x:%x for port %d into device %s "
            ": err %d", port->p_macaddr.ether_addr_octet[0],
            port->p_macaddr.ether_addr_octet[1],
            port->p_macaddr.ether_addr_octet[2],
            port->p_macaddr.ether_addr_octet[3],
            port->p_macaddr.ether_addr_octet[4],
            port->p_macaddr.ether_addr_octet[5],
            port->p_instance, vswp->physname, err);

        /*
         * Mark that attempt should be made to re-config sometime
         * in future if a port is deleted.
         */
        vswp->recfg_reqd = B_TRUE;

        /*
         * Only 1 mode specified, nothing more to do.
         */
        if (vswp->smode_num == 1)
            return (err);

        /*
         * If promiscuous was the next mode specified try to
         * set the card into that mode.
         */
        if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
            (vswp->smode[vswp->smode_idx + 1]
            == VSW_LAYER2_PROMISC)) {
            vswp->smode_idx += 1;
            return (vsw_set_hw_promisc(vswp, port));
        }
        return (err);
    }

    port->addr_slot = mac_addr.mma_slot;
    port->addr_set = VSW_ADDR_HW;

    D2(vswp, "programmed addr %x:%x:%x:%x:%x:%x for port %d "
        "into slot %d of device %s",
        port->p_macaddr.ether_addr_octet[0],
        port->p_macaddr.ether_addr_octet[1],
        port->p_macaddr.ether_addr_octet[2],
        port->p_macaddr.ether_addr_octet[3],
        port->p_macaddr.ether_addr_octet[4],
        port->p_macaddr.ether_addr_octet[5],
        port->p_instance, port->addr_slot, vswp->physname);

    D1(vswp, "%s: exit", __func__);

    return (0);
}
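/*
 * To summarise the flow above: a port's addr_set field tracks how its
 * address is currently claimed on the physical device, moving from
 * VSW_ADDR_UNSET to either VSW_ADDR_HW (address programmed into a
 * unicast slot) or VSW_ADDR_PROMISC (device carries the traffic only
 * because it was placed in promiscuous mode as a fallback).
 * vsw_unset_hw() below undoes whichever of the two was done.
 */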
/*
 * If in layer 3 mode do nothing.
 *
 * If in layer 2 switched mode remove the address from the physical
 * device.
 *
 * If in layer 2 promiscuous mode disable promisc mode.
 *
 * Returns 0 on success.
 */
static int
vsw_unset_hw(vsw_t *vswp, vsw_port_t *port)
{
    int		err;
    void	*mah;

    D1(vswp, "%s: enter", __func__);

    if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
        return (0);

    if (port->addr_set == VSW_ADDR_PROMISC) {
        return (vsw_unset_hw_promisc(vswp, port));
    }

    if (port->addr_set == VSW_ADDR_HW) {
        if (vswp->mh == NULL)
            return (1);

        if (vswp->maddr.maddr_handle == NULL)
            return (1);

        mah = vswp->maddr.maddr_handle;

        err = vswp->maddr.maddr_remove(mah, port->addr_slot);
        if (err != 0) {
            cmn_err(CE_WARN, "!Unable to remove addr "
                "%x:%x:%x:%x:%x:%x for port %d from device %s"
                " : (err %d)",
                port->p_macaddr.ether_addr_octet[0],
                port->p_macaddr.ether_addr_octet[1],
                port->p_macaddr.ether_addr_octet[2],
                port->p_macaddr.ether_addr_octet[3],
                port->p_macaddr.ether_addr_octet[4],
                port->p_macaddr.ether_addr_octet[5],
                port->p_instance, vswp->physname, err);
            return (err);
        }

        port->addr_set = VSW_ADDR_UNSET;

        D2(vswp, "removed addr %x:%x:%x:%x:%x:%x for "
            "port %d from device %s",
            port->p_macaddr.ether_addr_octet[0],
            port->p_macaddr.ether_addr_octet[1],
            port->p_macaddr.ether_addr_octet[2],
            port->p_macaddr.ether_addr_octet[3],
            port->p_macaddr.ether_addr_octet[4],
            port->p_macaddr.ether_addr_octet[5],
            port->p_instance, vswp->physname);
    }

    D1(vswp, "%s: exit", __func__);
    return (0);
}

/*
 * Set network card into promisc mode.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port)
{
    D1(vswp, "%s: enter", __func__);

    if (vswp->mh == NULL)
        return (1);

    if (vswp->promisc_cnt++ == 0) {
        if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
            vswp->promisc_cnt--;
            return (1);
        }
        cmn_err(CE_NOTE, "!switching device %s into promiscuous mode",
            vswp->physname);
    }
    port->addr_set = VSW_ADDR_PROMISC;

    D1(vswp, "%s: exit", __func__);

    return (0);
}
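/*
 * Note that promisc_cnt acts as a reference count across all ports:
 * the first port that needs promiscuous mode switches the device on,
 * and only when the last such port goes away (the count drops back to
 * zero in vsw_unset_hw_promisc() below) is the device taken out of
 * promiscuous mode again.
 */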
/*
 * Turn off promiscuous mode on network card.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port)
{
    vsw_port_list_t 	*plist = &vswp->plist;

    D1(vswp, "%s: enter", __func__);

    if (vswp->mh == NULL)
        return (1);

    ASSERT(port->addr_set == VSW_ADDR_PROMISC);

    if (--vswp->promisc_cnt == 0) {
        if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
            vswp->promisc_cnt++;
            return (1);
        }

        /*
         * We are exiting promisc mode either because we were
         * only in promisc mode because we had failed over from
         * switched mode due to HW resource issues, or the user
         * wanted the card in promisc mode for all the ports and
         * the last port is now being deleted. Tweak the message
         * accordingly.
         */
        if (plist->num_ports != 0) {
            cmn_err(CE_NOTE, "!switching device %s back to "
                "programmed mode", vswp->physname);
        } else {
            cmn_err(CE_NOTE, "!switching device %s out of "
                "promiscuous mode", vswp->physname);
        }
    }
    port->addr_set = VSW_ADDR_UNSET;

    D1(vswp, "%s: exit", __func__);
    return (0);
}

/*
 * Determine whether or not we are operating in our preferred
 * mode and, if not, whether the physical resources now allow us
 * to operate in it.
 *
 * Should only be invoked after the port which is being deleted has
 * been removed from the port list.
 */
static int
vsw_reconfig_hw(vsw_t *vswp)
{
    vsw_port_list_t 	*plist = &vswp->plist;
    mac_multi_addr_t	mac_addr;
    vsw_port_t		*tp;
    void		*mah;
    int			rv = 0;
    int			s_idx;

    D1(vswp, "%s: enter", __func__);

    if (vswp->maddr.maddr_handle == NULL)
        return (1);

    /*
     * Check if there are now sufficient HW resources to
     * attempt a re-config.
     */
    if (plist->num_ports > vswp->maddr.maddr_naddrfree)
        return (1);

    /*
     * If we are in layer 2 (i.e. switched) or would like to be
     * in layer 2 then check if any ports need to be programmed
     * into the HW.
     *
     * This can happen in two cases - switched was specified as
     * the preferred mode of operation but we exhausted the HW
     * resources and so failed over to the next specified mode,
     * or switched was the only mode specified so after HW
     * resources were exhausted there was nothing more we
     * could do.
     */
    if (vswp->smode_idx > 0)
        s_idx = vswp->smode_idx - 1;
    else
        s_idx = vswp->smode_idx;

    if (vswp->smode[s_idx] == VSW_LAYER2) {
        mah = vswp->maddr.maddr_handle;

        D2(vswp, "%s: attempting reconfig..", __func__);

        /*
         * Scan the port list for any port whose address has not
         * been programmed in HW - there should be a max of one.
         */
        for (tp = plist->head; tp != NULL; tp = tp->p_next) {
            if (tp->addr_set != VSW_ADDR_HW) {
                mac_addr.mma_addrlen = ETHERADDRL;
                ether_copy(&tp->p_macaddr, &mac_addr.mma_addr);

                rv = vswp->maddr.maddr_add(mah, &mac_addr);
                if (rv != 0) {
                    DWARN(vswp, "Error setting addr in "
                        "HW for port %d err %d",
                        tp->p_instance, rv);
                    goto reconfig_err_exit;
                }
                tp->addr_slot = mac_addr.mma_slot;

                D2(vswp, "re-programmed port %d "
                    "addr %x:%x:%x:%x:%x:%x into slot %d"
                    " of device %s", tp->p_instance,
                    tp->p_macaddr.ether_addr_octet[0],
                    tp->p_macaddr.ether_addr_octet[1],
                    tp->p_macaddr.ether_addr_octet[2],
                    tp->p_macaddr.ether_addr_octet[3],
                    tp->p_macaddr.ether_addr_octet[4],
                    tp->p_macaddr.ether_addr_octet[5],
                    tp->addr_slot, vswp->physname);

                /*
                 * If up to now we had to put the card into
                 * promisc mode to see this address, we
                 * can now safely disable promisc mode.
                 */
                if (tp->addr_set == VSW_ADDR_PROMISC)
                    (void) vsw_unset_hw_promisc(vswp, tp);

                tp->addr_set = VSW_ADDR_HW;
            }
        }

        /* no further re-config needed */
        vswp->recfg_reqd = B_FALSE;

        vswp->smode_idx = s_idx;

        return (0);
    }

reconfig_err_exit:
    return (rv);
}
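/*
 * The s_idx dance above is worth spelling out: if we previously fell
 * back from a more preferred mode (smode_idx > 0), the reconfig is
 * attempted against the previous entry in the smode[] list, and on
 * success smode_idx is wound back so the instance is once again
 * recorded as running in that preferred mode.
 */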
1636 * 1637 * PERF: It may be more efficient when the card is in promisc 1638 * mode to check the dest address of the pkts here (against 1639 * the FDB) rather than checking later. Needs to be investigated. 1640 */ 1641 static void 1642 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) 1643 { 1644 _NOTE(ARGUNUSED(mrh)) 1645 1646 vsw_t *vswp = (vsw_t *)arg; 1647 1648 ASSERT(vswp != NULL); 1649 1650 D1(vswp, "vsw_rx_cb: enter"); 1651 1652 /* switch the chain of packets received */ 1653 vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL); 1654 1655 D1(vswp, "vsw_rx_cb: exit"); 1656 } 1657 1658 /* 1659 * Send a message out over the physical device via the MAC layer. 1660 * 1661 * Returns any mblks that it was unable to transmit. 1662 */ 1663 static mblk_t * 1664 vsw_tx_msg(vsw_t *vswp, mblk_t *mp) 1665 { 1666 const mac_txinfo_t *mtp; 1667 mblk_t *nextp; 1668 1669 if (vswp->mh == NULL) { 1670 DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail"); 1671 return (mp); 1672 } else { 1673 for (;;) { 1674 nextp = mp->b_next; 1675 mp->b_next = NULL; 1676 1677 mtp = vswp->txinfo; 1678 if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) { 1679 mp->b_next = nextp; 1680 break; 1681 } 1682 1683 if ((mp = nextp) == NULL) 1684 break; 1685 1686 } 1687 1688 } 1689 1690 return (mp); 1691 } 1692 1693 /* 1694 * Register with the MAC layer as a network device, so we 1695 * can be plumbed if necessary. 1696 */ 1697 static int 1698 vsw_mac_register(vsw_t *vswp) 1699 { 1700 mac_register_t *macp; 1701 int rv; 1702 1703 D1(vswp, "%s: enter", __func__); 1704 1705 if ((macp = mac_alloc(MAC_VERSION)) == NULL) 1706 return (EINVAL); 1707 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1708 macp->m_driver = vswp; 1709 macp->m_dip = vswp->dip; 1710 macp->m_src_addr = (uint8_t *)&vswp->if_addr; 1711 macp->m_callbacks = &vsw_m_callbacks; 1712 macp->m_min_sdu = 0; 1713 macp->m_max_sdu = ETHERMTU; 1714 rv = mac_register(macp, &vswp->if_mh); 1715 mac_free(macp); 1716 if (rv == 0) 1717 vswp->if_state |= VSW_IF_REG; 1718 1719 D1(vswp, "%s: exit", __func__); 1720 1721 return (rv); 1722 } 1723 1724 static int 1725 vsw_mac_unregister(vsw_t *vswp) 1726 { 1727 int rv = 0; 1728 1729 D1(vswp, "%s: enter", __func__); 1730 1731 WRITE_ENTER(&vswp->if_lockrw); 1732 1733 if (vswp->if_state & VSW_IF_REG) { 1734 rv = mac_unregister(vswp->if_mh); 1735 if (rv != 0) { 1736 DWARN(vswp, "%s: unable to unregister from MAC " 1737 "framework", __func__); 1738 1739 RW_EXIT(&vswp->if_lockrw); 1740 D1(vswp, "%s: fail exit", __func__); 1741 return (rv); 1742 } 1743 1744 /* mark i/f as down and unregistered */ 1745 vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG); 1746 } 1747 RW_EXIT(&vswp->if_lockrw); 1748 1749 vswp->mdprops &= ~(VSW_MD_MACADDR | VSW_DEV_MACADDR); 1750 1751 D1(vswp, "%s: exit", __func__); 1752 1753 return (rv); 1754 } 1755 1756 static int 1757 vsw_m_stat(void *arg, uint_t stat, uint64_t *val) 1758 { 1759 vsw_t *vswp = (vsw_t *)arg; 1760 1761 D1(vswp, "%s: enter", __func__); 1762 1763 if (vswp->mh == NULL) 1764 return (EINVAL); 1765 1766 /* return stats from underlying device */ 1767 *val = mac_stat_get(vswp->mh, stat); 1768 return (0); 1769 } 1770 1771 static void 1772 vsw_m_stop(void *arg) 1773 { 1774 vsw_t *vswp = (vsw_t *)arg; 1775 1776 D1(vswp, "%s: enter", __func__); 1777 1778 WRITE_ENTER(&vswp->if_lockrw); 1779 vswp->if_state &= ~VSW_IF_UP; 1780 RW_EXIT(&vswp->if_lockrw); 1781 1782 D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state); 1783 } 1784 1785 static int 1786 vsw_m_start(void *arg) 1787 { 1788 vsw_t *vswp = (vsw_t 
    D1(vswp, "%s: enter", __func__);

    WRITE_ENTER(&vswp->if_lockrw);
    vswp->if_state |= VSW_IF_UP;
    RW_EXIT(&vswp->if_lockrw);

    D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
    return (0);
}

/*
 * Change the local interface address.
 */
static int
vsw_m_unicst(void *arg, const uint8_t *macaddr)
{
    vsw_t	*vswp = (vsw_t *)arg;

    D1(vswp, "%s: enter", __func__);

    WRITE_ENTER(&vswp->if_lockrw);
    ether_copy(macaddr, &vswp->if_addr);
    RW_EXIT(&vswp->if_lockrw);

    D1(vswp, "%s: exit", __func__);

    return (0);
}

static int
vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
{
    vsw_t	*vswp = (vsw_t *)arg;
    mcst_addr_t	*mcst_p = NULL;
    uint64_t	addr = 0x0;
    int		i, ret = 0;

    D1(vswp, "%s: enter", __func__);

    /*
     * Convert address into form that can be used
     * as hash table key.
     */
    for (i = 0; i < ETHERADDRL; i++) {
        addr = (addr << 8) | mca[i];
    }

    D2(vswp, "%s: addr = 0x%llx", __func__, addr);

    if (add) {
        D2(vswp, "%s: adding multicast", __func__);
        if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
            /*
             * Update the list of multicast addresses
             * contained within the vsw_t structure to
             * include this new one.
             */
            mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
            if (mcst_p == NULL) {
                DERR(vswp, "%s unable to alloc mem", __func__);
                return (1);
            }
            mcst_p->addr = addr;

            mutex_enter(&vswp->mca_lock);
            mcst_p->nextp = vswp->mcap;
            vswp->mcap = mcst_p;
            mutex_exit(&vswp->mca_lock);

            /*
             * Call into the underlying driver to program the
             * address into HW.
             */
            if (vswp->mh != NULL) {
                ret = mac_multicst_add(vswp->mh, mca);
                if (ret != 0) {
                    cmn_err(CE_WARN, "!unable to add "
                        "multicast address");
                    goto vsw_remove_addr;
                }
            }
        } else {
            cmn_err(CE_WARN, "!unable to add multicast address");
        }
        return (ret);
    }

vsw_remove_addr:

    D2(vswp, "%s: removing multicast", __func__);
    /*
     * Remove the address from the hash table..
     */
    if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {

        /*
         * ..and then from the list maintained in the
         * vsw_t structure.
         */
        vsw_del_addr(VSW_LOCALDEV, vswp, addr);

        if (vswp->mh != NULL)
            (void) mac_multicst_remove(vswp->mh, mca);
    }

    D1(vswp, "%s: exit", __func__);

    return (0);
}

static int
vsw_m_promisc(void *arg, boolean_t on)
{
    vsw_t	*vswp = (vsw_t *)arg;

    D1(vswp, "%s: enter", __func__);

    WRITE_ENTER(&vswp->if_lockrw);
    if (on)
        vswp->if_state |= VSW_IF_PROMISC;
    else
        vswp->if_state &= ~VSW_IF_PROMISC;
    RW_EXIT(&vswp->if_lockrw);

    D1(vswp, "%s: exit", __func__);

    return (0);
}

static mblk_t *
vsw_m_tx(void *arg, mblk_t *mp)
{
    vsw_t	*vswp = (vsw_t *)arg;

    D1(vswp, "%s: enter", __func__);

    vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);

    D1(vswp, "%s: exit", __func__);

    return (NULL);
}
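/*
 * Two details of the entry points above are easy to miss.  First,
 * vsw_m_tx() always returns NULL because the switch takes ownership
 * of every frame handed down from the local interface; a non-NULL
 * return would tell the MAC layer the chain could not be accepted.
 * Second, vsw_m_multicst() packs the six address octets into a
 * uint64_t hash key in big-endian order, so 01:00:5e:00:00:01
 * becomes the key 0x01005e000001.
 */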
/*
 * Register for machine description (MD) updates.
 */
static void
vsw_mdeg_register(vsw_t *vswp)
{
    mdeg_prop_spec_t	*pspecp;
    mdeg_node_spec_t	*inst_specp;
    mdeg_handle_t	mdeg_hdl;
    size_t		templatesz;
    int			inst, rv;

    D1(vswp, "%s: enter", __func__);

    inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
        DDI_PROP_DONTPASS, reg_propname, -1);
    if (inst == -1) {
        DERR(vswp, "%s: unable to get %s property",
            __func__, reg_propname);
        return;
    }

    D2(vswp, "%s: instance %d registering with mdeg", __func__, inst);

    /*
     * Allocate and initialize a per-instance copy
     * of the global property spec array that will
     * uniquely identify this vsw instance.
     */
    templatesz = sizeof (vsw_prop_template);
    pspecp = kmem_zalloc(templatesz, KM_SLEEP);

    bcopy(vsw_prop_template, pspecp, templatesz);

    VSW_SET_MDEG_PROP_INST(pspecp, inst);

    /* initialize the complete prop spec structure */
    inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
    inst_specp->namep = "virtual-device";
    inst_specp->specp = pspecp;

    /* perform the registration */
    rv = mdeg_register(inst_specp, &vport_match, vsw_mdeg_cb,
        (void *)vswp, &mdeg_hdl);

    if (rv != MDEG_SUCCESS) {
        DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
        kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
        kmem_free(pspecp, templatesz);
        return;
    }

    /* save off data that will be needed later */
    vswp->inst_spec = inst_specp;
    vswp->mdeg_hdl = mdeg_hdl;

    D1(vswp, "%s: exit", __func__);
}

static void
vsw_mdeg_unregister(vsw_t *vswp)
{
    D1(vswp, "vsw_mdeg_unregister: enter");

    (void) mdeg_unregister(vswp->mdeg_hdl);

    if (vswp->inst_spec->specp != NULL) {
        (void) kmem_free(vswp->inst_spec->specp,
            sizeof (vsw_prop_template));
        vswp->inst_spec->specp = NULL;
    }

    if (vswp->inst_spec != NULL) {
        (void) kmem_free(vswp->inst_spec,
            sizeof (mdeg_node_spec_t));
        vswp->inst_spec = NULL;
    }

    D1(vswp, "vsw_mdeg_unregister: exit");
}

static int
vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
{
    vsw_t		*vswp;
    int			idx;
    md_t		*mdp;
    mde_cookie_t	node;
    uint64_t		inst;

    if (resp == NULL)
        return (MDEG_FAILURE);

    vswp = (vsw_t *)cb_argp;

    D1(vswp, "%s: added %d : removed %d : matched %d",
        __func__, resp->added.nelem, resp->removed.nelem,
        resp->match_prev.nelem);

    /* process added ports */
    for (idx = 0; idx < resp->added.nelem; idx++) {
        mdp = resp->added.mdp;
        node = resp->added.mdep[idx];

        D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);

        if (vsw_port_add(vswp, mdp, &node) != 0) {
            cmn_err(CE_WARN, "Unable to add new port (0x%lx)",
                node);
        }
    }

    /* process removed ports */
    for (idx = 0; idx < resp->removed.nelem; idx++) {
        mdp = resp->removed.mdp;
        node = resp->removed.mdep[idx];

        if (md_get_prop_val(mdp, node, id_propname, &inst)) {
            DERR(vswp, "%s: prop(%s) not found port(%d)",
                __func__, id_propname, idx);
            continue;
        }

        D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);

        if (vsw_port_detach(vswp, inst) != 0) {
            cmn_err(CE_WARN, "Unable to remove port %ld", inst);
        }
    }

    /*
     * Currently no support for updating already active ports.
2065 * So, ignore the match_curr and match_prev arrays for now. 2066 */ 2067 2068 D1(vswp, "%s: exit", __func__); 2069 2070 return (MDEG_SUCCESS); 2071 } 2072 2073 /* 2074 * Add a new port to the system. 2075 * 2076 * Returns 0 on success, 1 on failure. 2077 */ 2078 int 2079 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node) 2080 { 2081 uint64_t ldc_id; 2082 uint8_t *addrp; 2083 int i, addrsz; 2084 int num_nodes = 0, nchan = 0; 2085 int listsz = 0; 2086 mde_cookie_t *listp = NULL; 2087 struct ether_addr ea; 2088 uint64_t macaddr; 2089 uint64_t inst = 0; 2090 vsw_port_t *port; 2091 2092 if (md_get_prop_val(mdp, *node, id_propname, &inst)) { 2093 DWARN(vswp, "%s: prop(%s) not found", __func__, 2094 id_propname); 2095 return (1); 2096 } 2097 2098 /* 2099 * Find the channel endpoint node(s) (which should be under this 2100 * port node) which contain the channel id(s). 2101 */ 2102 if ((num_nodes = md_node_count(mdp)) <= 0) { 2103 DERR(vswp, "%s: invalid number of nodes found (%d)", 2104 __func__, num_nodes); 2105 return (1); 2106 } 2107 2108 /* allocate enough space for node list */ 2109 listsz = num_nodes * sizeof (mde_cookie_t); 2110 listp = kmem_zalloc(listsz, KM_SLEEP); 2111 2112 nchan = md_scan_dag(mdp, *node, 2113 md_find_name(mdp, chan_propname), 2114 md_find_name(mdp, "fwd"), listp); 2115 2116 if (nchan <= 0) { 2117 DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname); 2118 kmem_free(listp, listsz); 2119 return (1); 2120 } 2121 2122 D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname); 2123 2124 /* use property from first node found */ 2125 if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) { 2126 DWARN(vswp, "%s: prop(%s) not found\n", __func__, 2127 id_propname); 2128 kmem_free(listp, listsz); 2129 return (1); 2130 } 2131 2132 /* don't need list any more */ 2133 kmem_free(listp, listsz); 2134 2135 D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id); 2136 2137 /* read mac-address property */ 2138 if (md_get_prop_data(mdp, *node, remaddr_propname, 2139 &addrp, &addrsz)) { 2140 DWARN(vswp, "%s: prop(%s) not found", 2141 __func__, remaddr_propname); 2142 return (1); 2143 } 2144 2145 if (addrsz < ETHERADDRL) { 2146 DWARN(vswp, "%s: invalid address size", __func__); 2147 return (1); 2148 } 2149 2150 macaddr = *((uint64_t *)addrp); 2151 D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr); 2152 2153 for (i = ETHERADDRL - 1; i >= 0; i--) { 2154 ea.ether_addr_octet[i] = macaddr & 0xFF; 2155 macaddr >>= 8; 2156 } 2157 2158 if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) { 2159 DERR(vswp, "%s: failed to attach port", __func__); 2160 return (1); 2161 } 2162 2163 port = vsw_lookup_port(vswp, (int)inst); 2164 2165 /* just successfully created the port, so it should exist */ 2166 ASSERT(port != NULL); 2167 2168 return (0); 2169 } 2170 2171 /* 2172 * Attach the specified port. 2173 * 2174 * Returns 0 on success, 1 on failure. 2175 */ 2176 static int 2177 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids, 2178 struct ether_addr *macaddr) 2179 { 2180 vsw_port_list_t *plist = &vswp->plist; 2181 vsw_port_t *port, **prev_port; 2182 int i; 2183 2184 D1(vswp, "%s: enter : port %d", __func__, p_instance); 2185 2186 /* port already exists? 
*/ 2187 READ_ENTER(&plist->lockrw); 2188 for (port = plist->head; port != NULL; port = port->p_next) { 2189 if (port->p_instance == p_instance) { 2190 DWARN(vswp, "%s: port instance %d already attached", 2191 __func__, p_instance); 2192 RW_EXIT(&plist->lockrw); 2193 return (1); 2194 } 2195 } 2196 RW_EXIT(&plist->lockrw); 2197 2198 port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP); 2199 port->p_vswp = vswp; 2200 port->p_instance = p_instance; 2201 port->p_ldclist.num_ldcs = 0; 2202 port->p_ldclist.head = NULL; 2203 port->addr_set = VSW_ADDR_UNSET; 2204 2205 rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL); 2206 2207 mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL); 2208 mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL); 2209 2210 mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL); 2211 cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL); 2212 2213 mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL); 2214 cv_init(&port->state_cv, NULL, CV_DRIVER, NULL); 2215 port->state = VSW_PORT_INIT; 2216 2217 if (nids > VSW_PORT_MAX_LDCS) { 2218 D2(vswp, "%s: using first of %d ldc ids", 2219 __func__, nids); 2220 nids = VSW_PORT_MAX_LDCS; 2221 } 2222 2223 D2(vswp, "%s: %d nids", __func__, nids); 2224 for (i = 0; i < nids; i++) { 2225 D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]); 2226 if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) { 2227 DERR(vswp, "%s: ldc_attach failed", __func__); 2228 2229 rw_destroy(&port->p_ldclist.lockrw); 2230 2231 cv_destroy(&port->ref_cv); 2232 mutex_destroy(&port->ref_lock); 2233 2234 cv_destroy(&port->state_cv); 2235 mutex_destroy(&port->state_lock); 2236 2237 mutex_destroy(&port->tx_lock); 2238 mutex_destroy(&port->mca_lock); 2239 kmem_free(port, sizeof (vsw_port_t)); 2240 return (1); 2241 } 2242 } 2243 2244 ether_copy(macaddr, &port->p_macaddr); 2245 2246 WRITE_ENTER(&plist->lockrw); 2247 2248 /* create the fdb entry for this port/mac address */ 2249 (void) vsw_add_fdb(vswp, port); 2250 2251 (void) vsw_set_hw(vswp, port); 2252 2253 /* link it into the list of ports for this vsw instance */ 2254 prev_port = (vsw_port_t **)(&plist->head); 2255 port->p_next = *prev_port; 2256 *prev_port = port; 2257 plist->num_ports++; 2258 RW_EXIT(&plist->lockrw); 2259 2260 /* 2261 * Initialise the port and any ldc's under it. 2262 */ 2263 (void) vsw_init_ldcs(port); 2264 2265 D1(vswp, "%s: exit", __func__); 2266 return (0); 2267 } 2268 2269 /* 2270 * Detach the specified port. 2271 * 2272 * Returns 0 on success, 1 on failure. 2273 */ 2274 static int 2275 vsw_port_detach(vsw_t *vswp, int p_instance) 2276 { 2277 vsw_port_t *port = NULL; 2278 vsw_port_list_t *plist = &vswp->plist; 2279 2280 D1(vswp, "%s: enter: port id %d", __func__, p_instance); 2281 2282 WRITE_ENTER(&plist->lockrw); 2283 2284 if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) { 2285 RW_EXIT(&plist->lockrw); 2286 return (1); 2287 } 2288 2289 if (vsw_plist_del_node(vswp, port)) { 2290 RW_EXIT(&plist->lockrw); 2291 return (1); 2292 } 2293 2294 /* Remove address if was programmed into HW. */ 2295 (void) vsw_unset_hw(vswp, port); 2296 2297 /* Remove the fdb entry for this port/mac address */ 2298 (void) vsw_del_fdb(vswp, port); 2299 2300 /* Remove any multicast addresses.. */ 2301 vsw_del_mcst_port(port); 2302 2303 /* 2304 * No longer need to hold writer lock on port list now 2305 * that we have unlinked the target port from the list. 
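 * (The lock is re-taken below as a reader only while checking
 * whether a hardware address reconfiguration pass is required.)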
2306 */ 2307 RW_EXIT(&plist->lockrw); 2308 2309 READ_ENTER(&plist->lockrw); 2310 2311 if (vswp->recfg_reqd) 2312 (void) vsw_reconfig_hw(vswp); 2313 2314 RW_EXIT(&plist->lockrw); 2315 2316 if (vsw_port_delete(port)) { 2317 return (1); 2318 } 2319 2320 D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance); 2321 return (0); 2322 } 2323 2324 /* 2325 * Detach all active ports. 2326 * 2327 * Returns 0 on success, 1 on failure. 2328 */ 2329 static int 2330 vsw_detach_ports(vsw_t *vswp) 2331 { 2332 vsw_port_list_t *plist = &vswp->plist; 2333 vsw_port_t *port = NULL; 2334 2335 D1(vswp, "%s: enter", __func__); 2336 2337 WRITE_ENTER(&plist->lockrw); 2338 2339 while ((port = plist->head) != NULL) { 2340 if (vsw_plist_del_node(vswp, port)) { 2341 DERR(vswp, "%s: Error deleting port %d" 2342 " from port list", __func__, 2343 port->p_instance); 2344 RW_EXIT(&plist->lockrw); 2345 return (1); 2346 } 2347 2348 /* Remove address if was programmed into HW. */ 2349 (void) vsw_unset_hw(vswp, port); 2350 2351 /* Remove the fdb entry for this port/mac address */ 2352 (void) vsw_del_fdb(vswp, port); 2353 2354 /* Remove any multicast addresses.. */ 2355 vsw_del_mcst_port(port); 2356 2357 /* 2358 * No longer need to hold the lock on the port list 2359 * now that we have unlinked the target port from the 2360 * list. 2361 */ 2362 RW_EXIT(&plist->lockrw); 2363 if (vsw_port_delete(port)) { 2364 DERR(vswp, "%s: Error deleting port %d", 2365 __func__, port->p_instance); 2366 return (1); 2367 } 2368 WRITE_ENTER(&plist->lockrw); 2369 } 2370 RW_EXIT(&plist->lockrw); 2371 2372 D1(vswp, "%s: exit", __func__); 2373 2374 return (0); 2375 } 2376 2377 /* 2378 * Delete the specified port. 2379 * 2380 * Returns 0 on success, 1 on failure. 2381 */ 2382 static int 2383 vsw_port_delete(vsw_port_t *port) 2384 { 2385 vsw_ldc_list_t *ldcl; 2386 vsw_t *vswp = port->p_vswp; 2387 2388 D1(vswp, "%s: enter : port id %d", __func__, port->p_instance); 2389 2390 (void) vsw_uninit_ldcs(port); 2391 2392 /* 2393 * Wait for any pending ctrl msg tasks which reference this 2394 * port to finish. 2395 */ 2396 if (vsw_drain_port_taskq(port)) 2397 return (1); 2398 2399 /* 2400 * Wait for port reference count to hit zero. 2401 */ 2402 mutex_enter(&port->ref_lock); 2403 while (port->ref_cnt != 0) 2404 cv_wait(&port->ref_cv, &port->ref_lock); 2405 mutex_exit(&port->ref_lock); 2406 2407 /* 2408 * Wait for any active callbacks to finish 2409 */ 2410 if (vsw_drain_ldcs(port)) 2411 return (1); 2412 2413 ldcl = &port->p_ldclist; 2414 WRITE_ENTER(&ldcl->lockrw); 2415 while (ldcl->num_ldcs > 0) { 2416 if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) { 2417 cmn_err(CE_WARN, "unable to detach ldc %ld", 2418 ldcl->head->ldc_id); 2419 RW_EXIT(&ldcl->lockrw); 2420 return (1); 2421 } 2422 } 2423 RW_EXIT(&ldcl->lockrw); 2424 2425 rw_destroy(&port->p_ldclist.lockrw); 2426 2427 mutex_destroy(&port->mca_lock); 2428 mutex_destroy(&port->tx_lock); 2429 cv_destroy(&port->ref_cv); 2430 mutex_destroy(&port->ref_lock); 2431 2432 cv_destroy(&port->state_cv); 2433 mutex_destroy(&port->state_lock); 2434 2435 kmem_free(port, sizeof (vsw_port_t)); 2436 2437 D1(vswp, "%s: exit", __func__); 2438 2439 return (0); 2440 } 2441 2442 /* 2443 * Attach a logical domain channel (ldc) under a specified port. 2444 * 2445 * Returns 0 on success, 1 on failure. 
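 *
 * In outline: allocate the per-channel state and a pool of receive
 * mblks, initialise the channel via ldc_init()/ldc_reg_callback(),
 * and finally link the new channel onto the port's channel list.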
2446 */ 2447 static int 2448 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id) 2449 { 2450 vsw_t *vswp = port->p_vswp; 2451 vsw_ldc_list_t *ldcl = &port->p_ldclist; 2452 vsw_ldc_t *ldcp = NULL; 2453 ldc_attr_t attr; 2454 ldc_status_t istatus; 2455 int status = DDI_FAILURE; 2456 int rv; 2457 2458 D1(vswp, "%s: enter", __func__); 2459 2460 ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP); 2461 if (ldcp == NULL) { 2462 DERR(vswp, "%s: kmem_zalloc failed", __func__); 2463 return (1); 2464 } 2465 ldcp->ldc_id = ldc_id; 2466 2467 /* allocate pool of receive mblks */ 2468 rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh)); 2469 if (rv) { 2470 DWARN(vswp, "%s: unable to create free mblk pool for" 2471 " channel %ld (rv %d)", __func__, ldc_id, rv); 2472 kmem_free(ldcp, sizeof (vsw_ldc_t)); 2473 return (1); 2474 } 2475 2476 mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL); 2477 mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL); 2478 mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL); 2479 cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL); 2480 2481 /* required for handshake with peer */ 2482 ldcp->local_session = (uint64_t)ddi_get_lbolt(); 2483 ldcp->peer_session = 0; 2484 ldcp->session_status = 0; 2485 2486 mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL); 2487 ldcp->hss_id = 1; /* Initial handshake session id */ 2488 2489 /* only set for outbound lane, inbound set by peer */ 2490 mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL); 2491 mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL); 2492 vsw_set_lane_attr(vswp, &ldcp->lane_out); 2493 2494 attr.devclass = LDC_DEV_NT_SVC; 2495 attr.instance = ddi_get_instance(vswp->dip); 2496 attr.mode = LDC_MODE_UNRELIABLE; 2497 attr.mtu = VSW_LDC_MTU; 2498 status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle); 2499 if (status != 0) { 2500 DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)", 2501 __func__, ldc_id, status); 2502 goto ldc_attach_fail; 2503 } 2504 2505 status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp); 2506 if (status != 0) { 2507 DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)", 2508 __func__, ldc_id, status); 2509 (void) ldc_fini(ldcp->ldc_handle); 2510 goto ldc_attach_fail; 2511 } 2512 2513 2514 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 2515 DERR(vswp, "%s: ldc_status failed", __func__); 2516 (void) ldc_unreg_callback(ldcp->ldc_handle); (void) ldc_fini(ldcp->ldc_handle); goto ldc_attach_fail; 2517 } 2518 2519 ldcp->ldc_status = istatus; 2520 ldcp->ldc_port = port; 2521 ldcp->ldc_vswp = vswp; 2522 2523 /* link it into the list of channels for this port */ 2524 WRITE_ENTER(&ldcl->lockrw); 2525 ldcp->ldc_next = ldcl->head; 2526 ldcl->head = ldcp; 2527 ldcl->num_ldcs++; 2528 RW_EXIT(&ldcl->lockrw); 2529 2530 D1(vswp, "%s: exit", __func__); 2531 return (0); 2532 2533 ldc_attach_fail: 2534 mutex_destroy(&ldcp->ldc_txlock); 2535 mutex_destroy(&ldcp->ldc_cblock); 2536 2537 cv_destroy(&ldcp->drain_cv); 2538 2539 if (ldcp->rxh != NULL) { 2540 if (vio_destroy_mblks(ldcp->rxh) != 0) { 2541 /* 2542 * Something odd has happened, as the destroy 2543 * will only fail if some mblks have been allocated 2544 * from the pool already (which shouldn't happen) 2545 * and have not been returned. 2546 * 2547 * Add the pool pointer to a list maintained in 2548 * the device instance. Another attempt will be made 2549 * to free the pool when the device itself detaches. 
2550 */ 2551 cmn_err(CE_WARN, "Creation of ldc channel %ld failed" 2552 " and cannot destroy associated mblk pool", 2553 ldc_id); 2554 ldcp->rxh->nextp = vswp->rxh; 2555 vswp->rxh = ldcp->rxh; 2556 } 2557 } 2558 mutex_destroy(&ldcp->drain_cv_lock); 2559 mutex_destroy(&ldcp->hss_lock); 2560 2561 mutex_destroy(&ldcp->lane_in.seq_lock); 2562 mutex_destroy(&ldcp->lane_out.seq_lock); 2563 kmem_free(ldcp, sizeof (vsw_ldc_t)); 2564 2565 return (1); 2566 } 2567 2568 /* 2569 * Detach a logical domain channel (ldc) belonging to a 2570 * particular port. 2571 * 2572 * Returns 0 on success, 1 on failure. 2573 */ 2574 static int 2575 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id) 2576 { 2577 vsw_t *vswp = port->p_vswp; 2578 vsw_ldc_t *ldcp, **prev_ldcp; 2579 vsw_ldc_list_t *ldcl = &port->p_ldclist; 2580 int rv; 2581 2582 prev_ldcp = &ldcl->head; 2583 for (; (ldcp = *prev_ldcp) != NULL; prev_ldcp = &ldcp->ldc_next) { 2584 if (ldcp->ldc_id == ldc_id) { 2585 break; 2586 } 2587 } 2588 2589 /* specified ldc id not found */ 2590 if (ldcp == NULL) { 2591 DERR(vswp, "%s: ldcp = NULL", __func__); 2592 return (1); 2593 } 2594 2595 D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id); 2596 2597 /* 2598 * Before we can close the channel we must release any mapped 2599 * resources (e.g. drings). 2600 */ 2601 vsw_free_lane_resources(ldcp, INBOUND); 2602 vsw_free_lane_resources(ldcp, OUTBOUND); 2603 2604 /* 2605 * If the close fails we are in serious trouble, as we won't 2606 * be able to delete the parent port. 2607 */ 2608 if ((rv = ldc_close(ldcp->ldc_handle)) != 0) { 2609 DERR(vswp, "%s: error %d closing channel %lld", 2610 __func__, rv, ldcp->ldc_id); 2611 return (1); 2612 } 2613 2614 (void) ldc_fini(ldcp->ldc_handle); 2615 2616 ldcp->ldc_status = LDC_INIT; 2617 ldcp->ldc_handle = NULL; 2618 ldcp->ldc_vswp = NULL; 2619 2620 if (ldcp->rxh != NULL) { 2621 if (vio_destroy_mblks(ldcp->rxh)) { 2622 /* 2623 * Most likely some mblks are still in use and 2624 * have not been returned to the pool. Add the pool 2625 * to the list maintained in the device instance. 2626 * Another attempt will be made to destroy the pool 2627 * when the device detaches. 2628 */ 2629 ldcp->rxh->nextp = vswp->rxh; 2630 vswp->rxh = ldcp->rxh; 2631 } 2632 } 2633 2634 mutex_destroy(&ldcp->ldc_txlock); 2635 mutex_destroy(&ldcp->ldc_cblock); 2636 cv_destroy(&ldcp->drain_cv); 2637 mutex_destroy(&ldcp->drain_cv_lock); 2638 mutex_destroy(&ldcp->hss_lock); 2639 mutex_destroy(&ldcp->lane_in.seq_lock); 2640 mutex_destroy(&ldcp->lane_out.seq_lock); 2641 2642 /* unlink it from the list */ 2643 *prev_ldcp = ldcp->ldc_next; 2644 ldcl->num_ldcs--; 2645 kmem_free(ldcp, sizeof (vsw_ldc_t)); 2646 2647 return (0); 2648 } 2649 2650 /* 2651 * Open and attempt to bring up the channel. Note that the channel 2652 * can only be brought up if the peer has also opened the channel. 2653 * 2654 * Returns 0 if it can open and bring up the channel, otherwise 2655 * returns 1. 
2656 */ 2657 static int 2658 vsw_ldc_init(vsw_ldc_t *ldcp) 2659 { 2660 vsw_t *vswp = ldcp->ldc_vswp; 2661 ldc_status_t istatus = 0; 2662 int rv; 2663 2664 D1(vswp, "%s: enter", __func__); 2665 2666 LDC_ENTER_LOCK(ldcp); 2667 2668 /* don't start at 0 in case clients don't like that */ 2669 ldcp->next_ident = 1; 2670 2671 rv = ldc_open(ldcp->ldc_handle); 2672 if (rv != 0) { 2673 DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)", 2674 __func__, ldcp->ldc_id, rv); 2675 LDC_EXIT_LOCK(ldcp); 2676 return (1); 2677 } 2678 2679 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 2680 DERR(vswp, "%s: unable to get status", __func__); 2681 LDC_EXIT_LOCK(ldcp); 2682 return (1); 2683 2684 } else if (istatus != LDC_OPEN && istatus != LDC_READY) { 2685 DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY", 2686 __func__, ldcp->ldc_id, istatus); 2687 LDC_EXIT_LOCK(ldcp); 2688 return (1); 2689 } 2690 2691 ldcp->ldc_status = istatus; 2692 rv = ldc_up(ldcp->ldc_handle); 2693 if (rv != 0) { 2694 /* 2695 * Not a fatal error for ldc_up() to fail, as peer 2696 * end point may simply not be ready yet. 2697 */ 2698 D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__, 2699 ldcp->ldc_id, rv); 2700 LDC_EXIT_LOCK(ldcp); 2701 return (1); 2702 } 2703 2704 /* 2705 * ldc_up() call is non-blocking so need to explicitly 2706 * check channel status to see if in fact the channel 2707 * is UP. 2708 */ 2709 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 2710 DERR(vswp, "%s: unable to get status", __func__); 2711 LDC_EXIT_LOCK(ldcp); 2712 return (1); 2713 2714 } else if (istatus != LDC_UP) { 2715 DERR(vswp, "%s: id(%lld) status(%d) is not UP", 2716 __func__, ldcp->ldc_id, istatus); 2717 } else { 2718 ldcp->ldc_status = istatus; 2719 } 2720 2721 LDC_EXIT_LOCK(ldcp); 2722 2723 D1(vswp, "%s: exit", __func__); 2724 return (0); 2725 } 2726 2727 /* disable callbacks on the channel */ 2728 static int 2729 vsw_ldc_uninit(vsw_ldc_t *ldcp) 2730 { 2731 vsw_t *vswp = ldcp->ldc_vswp; 2732 int rv; 2733 2734 D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id); 2735 2736 LDC_ENTER_LOCK(ldcp); 2737 2738 rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE); 2739 if (rv != 0) { 2740 DERR(vswp, "vsw_ldc_uninit(%lld): error disabling " 2741 "interrupts (rv = %d)\n", ldcp->ldc_id, rv); 2742 LDC_EXIT_LOCK(ldcp); 2743 return (1); 2744 } 2745 2746 ldcp->ldc_status = LDC_INIT; 2747 2748 LDC_EXIT_LOCK(ldcp); 2749 2750 D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id); 2751 2752 return (0); 2753 } 2754 2755 static int 2756 vsw_init_ldcs(vsw_port_t *port) 2757 { 2758 vsw_ldc_list_t *ldcl = &port->p_ldclist; 2759 vsw_ldc_t *ldcp; 2760 2761 READ_ENTER(&ldcl->lockrw); 2762 ldcp = ldcl->head; 2763 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 2764 (void) vsw_ldc_init(ldcp); 2765 } 2766 RW_EXIT(&ldcl->lockrw); 2767 2768 return (0); 2769 } 2770 2771 static int 2772 vsw_uninit_ldcs(vsw_port_t *port) 2773 { 2774 vsw_ldc_list_t *ldcl = &port->p_ldclist; 2775 vsw_ldc_t *ldcp; 2776 2777 D1(NULL, "vsw_uninit_ldcs: enter\n"); 2778 2779 READ_ENTER(&ldcl->lockrw); 2780 ldcp = ldcl->head; 2781 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 2782 (void) vsw_ldc_uninit(ldcp); 2783 } 2784 RW_EXIT(&ldcl->lockrw); 2785 2786 D1(NULL, "vsw_uninit_ldcs: exit\n"); 2787 2788 return (0); 2789 } 2790 2791 /* 2792 * Wait until the callback(s) associated with the ldcs under the specified 2793 * port have completed. 2794 * 2795 * Prior to this function being invoked each channel under this port 2796 * should have been quiesced via ldc_set_cb_mode(DISABLE). 
* 2798 * A short explanation of what we are doing below.. 2799 * 2800 * The simplest approach would be to have a reference counter in 2801 * the ldc structure which is incremented/decremented by the callbacks as 2802 * they use the channel. The drain function could then simply disable any 2803 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately 2804 * there is a tiny window here - before the callback is able to get the lock 2805 * on the channel it is interrupted and this function gets to execute. It 2806 * sees that the ref count is zero and believes it's free to delete the 2807 * associated data structures. 2808 * 2809 * We get around this by taking advantage of the fact that before the ldc 2810 * framework invokes a callback it sets a flag to indicate that there is a 2811 * callback active (or about to become active). If we attempt to 2812 * unregister a callback while this active flag is set then the unregister 2813 * will fail with EWOULDBLOCK. 2814 * 2815 * If the unregister fails we do a cv_timedwait. We will either be signaled 2816 * by the callback as it is exiting (note we have to wait a short period to 2817 * allow the callback to return fully to the ldc framework and it to clear 2818 * the active flag), or by the timer expiring. In either case we again attempt 2819 * the unregister. We repeat this until we can successfully unregister the 2820 * callback. 2821 * 2822 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch 2823 * the case where the callback has finished but the ldc framework has not yet 2824 * cleared the active flag. In this case we would never get a cv_signal. 2825 */ 2826 static int 2827 vsw_drain_ldcs(vsw_port_t *port) 2828 { 2829 vsw_ldc_list_t *ldcl = &port->p_ldclist; 2830 vsw_ldc_t *ldcp; 2831 vsw_t *vswp = port->p_vswp; 2832 2833 D1(vswp, "%s: enter", __func__); 2834 2835 READ_ENTER(&ldcl->lockrw); 2836 2837 ldcp = ldcl->head; 2838 2839 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 2840 /* 2841 * If we can unregister the channel callback then we 2842 * know that there is no callback either running or 2843 * scheduled to run for this channel so move on to next 2844 * channel in the list. 2845 */ 2846 mutex_enter(&ldcp->drain_cv_lock); 2847 2848 /* prompt active callbacks to quit */ 2849 ldcp->drain_state = VSW_LDC_DRAINING; 2850 2851 if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) { 2852 D2(vswp, "%s: unreg callback for chan %ld", __func__, 2853 ldcp->ldc_id); 2854 mutex_exit(&ldcp->drain_cv_lock); 2855 continue; 2856 } else { 2857 /* 2858 * If we end up here we know that either 1) a callback 2859 * is currently executing, 2) is about to start (i.e. 2860 * the ldc framework has set the active flag but 2861 * has not actually invoked the callback yet), or 3) 2862 * has finished and has returned to the ldc framework 2863 * but the ldc framework has not yet cleared the 2864 * active bit. 2865 * 2866 * Wait for it to finish. 2867 */ 2868 while (ldc_unreg_callback(ldcp->ldc_handle) 2869 == EWOULDBLOCK) 2870 (void) cv_timedwait(&ldcp->drain_cv, 2871 &ldcp->drain_cv_lock, lbolt + hz); 2872 2873 mutex_exit(&ldcp->drain_cv_lock); 2874 D2(vswp, "%s: unreg callback for chan %ld after " 2875 "timeout", __func__, ldcp->ldc_id); 2876 } 2877 } 2878 RW_EXIT(&ldcl->lockrw); 2879 2880 D1(vswp, "%s: exit", __func__); 2881 return (0); 2882 } 2883 2884 /* 2885 * Wait until all tasks which reference this port have completed. 
* 2887 * Prior to this function being invoked each channel under this port 2888 * should have been quiesced via ldc_set_cb_mode(DISABLE). 2889 */ 2890 static int 2891 vsw_drain_port_taskq(vsw_port_t *port) 2892 { 2893 vsw_t *vswp = port->p_vswp; 2894 2895 D1(vswp, "%s: enter", __func__); 2896 2897 /* 2898 * Mark the port as in the process of being detached, and 2899 * dispatch a marker task to the queue so we know when all 2900 * relevant tasks have completed. 2901 */ 2902 mutex_enter(&port->state_lock); 2903 port->state = VSW_PORT_DETACHING; 2904 2905 if ((vswp->taskq_p == NULL) || 2906 (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task, 2907 port, DDI_NOSLEEP) != DDI_SUCCESS)) { 2908 DERR(vswp, "%s: unable to dispatch marker task", 2909 __func__); 2910 mutex_exit(&port->state_lock); 2911 return (1); 2912 } 2913 2914 /* 2915 * Wait for the marker task to finish. 2916 */ 2917 while (port->state != VSW_PORT_DETACHABLE) 2918 cv_wait(&port->state_cv, &port->state_lock); 2919 2920 mutex_exit(&port->state_lock); 2921 2922 D1(vswp, "%s: exit", __func__); 2923 2924 return (0); 2925 } 2926 2927 static void 2928 vsw_marker_task(void *arg) 2929 { 2930 vsw_port_t *port = arg; 2931 vsw_t *vswp = port->p_vswp; 2932 2933 D1(vswp, "%s: enter", __func__); 2934 2935 mutex_enter(&port->state_lock); 2936 2937 /* 2938 * No further tasks should be dispatched which reference 2939 * this port, so it is now safe to mark it as detachable. 2940 */ 2941 port->state = VSW_PORT_DETACHABLE; 2942 2943 cv_signal(&port->state_cv); 2944 2945 mutex_exit(&port->state_lock); 2946 2947 D1(vswp, "%s: exit", __func__); 2948 } 2949 2950 static vsw_port_t * 2951 vsw_lookup_port(vsw_t *vswp, int p_instance) 2952 { 2953 vsw_port_list_t *plist = &vswp->plist; 2954 vsw_port_t *port; 2955 2956 for (port = plist->head; port != NULL; port = port->p_next) { 2957 if (port->p_instance == p_instance) { 2958 D2(vswp, "vsw_lookup_port: found p_instance\n"); 2959 return (port); 2960 } 2961 } 2962 2963 return (NULL); 2964 } 2965 2966 /* 2967 * Search for and remove the specified port from the port 2968 * list. Returns 0 if able to locate and remove port, otherwise 2969 * returns 1. 2970 */ 2971 static int 2972 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port) 2973 { 2974 vsw_port_list_t *plist = &vswp->plist; 2975 vsw_port_t *curr_p, *prev_p; 2976 2977 if (plist->head == NULL) 2978 return (1); 2979 2980 curr_p = prev_p = plist->head; 2981 2982 while (curr_p != NULL) { 2983 if (curr_p == port) { 2984 if (prev_p == curr_p) { 2985 plist->head = curr_p->p_next; 2986 } else { 2987 prev_p->p_next = curr_p->p_next; 2988 } 2989 plist->num_ports--; 2990 return (0); 2991 } else { 2992 prev_p = curr_p; 2993 curr_p = curr_p->p_next; 2994 } 2995 } 2996 return (1); 2997 } 2998 2999 /* 3000 * Interrupt handler for ldc messages. 3001 */ 3002 static uint_t 3003 vsw_ldc_cb(uint64_t event, caddr_t arg) 3004 { 3005 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; 3006 vsw_t *vswp = ldcp->ldc_vswp; 3007 ldc_status_t lstatus; 3008 int rv; 3009 3010 D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); 3011 3012 mutex_enter(&ldcp->ldc_cblock); 3013 3014 if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) { 3015 mutex_exit(&ldcp->ldc_cblock); 3016 return (LDC_SUCCESS); 3017 } 3018 3019 if (event & LDC_EVT_UP) { 3020 /* 3021 * Channel has come up, get the state and then start 3022 * the handshake. 
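 * (An UP event should never be accompanied by a RESET or DOWN
 * event; the ASSERT below verifies this.)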
*/ 3024 rv = ldc_status(ldcp->ldc_handle, &lstatus); 3025 if (rv != 0) { 3026 cmn_err(CE_WARN, "Unable to read channel state"); 3027 } 3028 ldcp->ldc_status = lstatus; 3029 3030 D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)", 3031 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 3032 3033 vsw_restart_handshake(ldcp); 3034 3035 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); 3036 } 3037 3038 if (event & LDC_EVT_READ) { 3039 /* 3040 * Data available for reading. 3041 */ 3042 D2(vswp, "%s: id(%ld) event(%llx) data READ", 3043 __func__, ldcp->ldc_id, event); 3044 3045 vsw_process_pkt(ldcp); 3046 3047 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); 3048 3049 goto vsw_cb_exit; 3050 } 3051 3052 if (event & LDC_EVT_RESET) { 3053 rv = ldc_status(ldcp->ldc_handle, &lstatus); 3054 if (rv != 0) { 3055 cmn_err(CE_WARN, "Unable to read channel state"); 3056 } else { 3057 ldcp->ldc_status = lstatus; 3058 } 3059 D2(vswp, "%s: id(%ld) event(%llx) RESET: status (%ld)", 3060 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 3061 } 3062 3063 if (event & LDC_EVT_DOWN) { 3064 rv = ldc_status(ldcp->ldc_handle, &lstatus); 3065 if (rv != 0) { 3066 cmn_err(CE_WARN, "Unable to read channel state"); 3067 } else { 3068 ldcp->ldc_status = lstatus; 3069 } 3070 3071 D2(vswp, "%s: id(%ld) event(%llx) DOWN: status (%ld)", 3072 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 3073 3074 } 3075 3076 /* 3077 * Catch either LDC_EVT_WRITE which we don't support or any 3078 * unknown event. 3079 */ 3080 if (event & ~(LDC_EVT_UP | LDC_EVT_RESET 3081 | LDC_EVT_DOWN | LDC_EVT_READ)) { 3082 3083 DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)", 3084 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 3085 } 3086 3087 vsw_cb_exit: 3088 mutex_exit(&ldcp->ldc_cblock); 3089 3090 /* 3091 * Let the drain function know we are finishing if it 3092 * is waiting. 3093 */ 3094 mutex_enter(&ldcp->drain_cv_lock); 3095 if (ldcp->drain_state == VSW_LDC_DRAINING) 3096 cv_signal(&ldcp->drain_cv); 3097 mutex_exit(&ldcp->drain_cv_lock); 3098 3099 return (LDC_SUCCESS); 3100 } 3101 3102 /* 3103 * (Re)start a handshake with our peer by sending them 3104 * our version info. 3105 */ 3106 static void 3107 vsw_restart_handshake(vsw_ldc_t *ldcp) 3108 { 3109 vsw_t *vswp = ldcp->ldc_vswp; 3110 vsw_port_t *port; 3111 vsw_ldc_list_t *ldcl; 3112 3113 D1(vswp, "vsw_restart_handshake: enter"); 3114 3115 port = ldcp->ldc_port; 3116 ldcl = &port->p_ldclist; 3117 3118 WRITE_ENTER(&ldcl->lockrw); 3119 3120 D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__, 3121 ldcp->lane_in.lstate, ldcp->lane_out.lstate); 3122 3123 vsw_free_lane_resources(ldcp, INBOUND); 3124 vsw_free_lane_resources(ldcp, OUTBOUND); 3125 RW_EXIT(&ldcl->lockrw); 3126 3127 ldcp->lane_in.lstate = 0; 3128 ldcp->lane_out.lstate = 0; 3129 3130 /* 3131 * Remove parent port from any multicast groups 3132 * it may have registered with. Client must resend 3133 * multicast add command after handshake completes. 3134 */ 3135 (void) vsw_del_fdb(vswp, port); 3136 3137 vsw_del_mcst_port(port); 3138 3139 ldcp->hphase = VSW_MILESTONE0; 3140 3141 ldcp->peer_session = 0; 3142 ldcp->session_status = 0; 3143 3144 /* 3145 * We now increment the transaction group id. This allows 3146 * us to identify and discard any tasks which are still pending 3147 * on the taskq and refer to the handshake session we are about 3148 * to restart. These stale messages no longer have any real 3149 * meaning. 
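 * For example, a control task queued with hss_id 2 is discarded
 * by vsw_process_ctrl_pkt() once this counter has advanced to 3.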
3150 */ 3151 mutex_enter(&ldcp->hss_lock); 3152 ldcp->hss_id++; 3153 mutex_exit(&ldcp->hss_lock); 3154 3155 if (ldcp->hcnt++ > vsw_num_handshakes) { 3156 cmn_err(CE_WARN, "exceeded number of permitted " 3157 "handshake attempts (%d) on channel %ld", 3158 ldcp->hcnt, ldcp->ldc_id); 3159 return; 3160 } 3161 3162 vsw_send_ver(ldcp); 3163 3164 D1(vswp, "vsw_restart_handshake: exit"); 3165 } 3166 3167 /* 3168 * Returns 0 if it was legal for the event signified by flag to have 3169 * occurred at the time it did. Otherwise returns 1. 3170 */ 3171 int 3172 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag) 3173 { 3174 vsw_t *vswp = ldcp->ldc_vswp; 3175 uint64_t state; 3176 uint64_t phase; 3177 3178 if (dir == INBOUND) 3179 state = ldcp->lane_in.lstate; 3180 else 3181 state = ldcp->lane_out.lstate; 3182 3183 phase = ldcp->hphase; 3184 3185 switch (flag) { 3186 case VSW_VER_INFO_RECV: 3187 if (phase > VSW_MILESTONE0) { 3188 DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV" 3189 " when in state %d\n", ldcp->ldc_id, phase); 3190 vsw_restart_handshake(ldcp); 3191 return (1); 3192 } 3193 break; 3194 3195 case VSW_VER_ACK_RECV: 3196 case VSW_VER_NACK_RECV: 3197 if (!(state & VSW_VER_INFO_SENT)) { 3198 DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK" 3199 " or VER_NACK when in state %d\n", 3200 ldcp->ldc_id, phase); 3201 vsw_restart_handshake(ldcp); 3202 return (1); 3203 } else 3204 state &= ~VSW_VER_INFO_SENT; 3205 break; 3206 3207 case VSW_ATTR_INFO_RECV: 3208 if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) { 3209 DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV" 3210 " when in state %d\n", ldcp->ldc_id, phase); 3211 vsw_restart_handshake(ldcp); 3212 return (1); 3213 } 3214 break; 3215 3216 case VSW_ATTR_ACK_RECV: 3217 case VSW_ATTR_NACK_RECV: 3218 if (!(state & VSW_ATTR_INFO_SENT)) { 3219 DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK" 3220 " or ATTR_NACK when in state %d\n", 3221 ldcp->ldc_id, phase); 3222 vsw_restart_handshake(ldcp); 3223 return (1); 3224 } else 3225 state &= ~VSW_ATTR_INFO_SENT; 3226 break; 3227 3228 case VSW_DRING_INFO_RECV: 3229 if (phase < VSW_MILESTONE1) { 3230 DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV" 3231 " when in state %d\n", ldcp->ldc_id, phase); 3232 vsw_restart_handshake(ldcp); 3233 return (1); 3234 } 3235 break; 3236 3237 case VSW_DRING_ACK_RECV: 3238 case VSW_DRING_NACK_RECV: 3239 if (!(state & VSW_DRING_INFO_SENT)) { 3240 DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK" 3241 " or DRING_NACK when in state %d\n", 3242 ldcp->ldc_id, phase); 3243 vsw_restart_handshake(ldcp); 3244 return (1); 3245 } else 3246 state &= ~VSW_DRING_INFO_SENT; 3247 break; 3248 3249 case VSW_RDX_INFO_RECV: 3250 if (phase < VSW_MILESTONE3) { 3251 DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV" 3252 " when in state %d\n", ldcp->ldc_id, phase); 3253 vsw_restart_handshake(ldcp); 3254 return (1); 3255 } 3256 break; 3257 3258 case VSW_RDX_ACK_RECV: 3259 case VSW_RDX_NACK_RECV: 3260 if (!(state & VSW_RDX_INFO_SENT)) { 3261 DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK" 3262 " or RDX_NACK when in state %d\n", 3263 ldcp->ldc_id, phase); 3264 vsw_restart_handshake(ldcp); 3265 return (1); 3266 } else 3267 state &= ~VSW_RDX_INFO_SENT; 3268 break; 3269 3270 case VSW_MCST_INFO_RECV: 3271 if (phase < VSW_MILESTONE3) { 3272 DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV" 3273 " when in state %d\n", ldcp->ldc_id, phase); 3274 vsw_restart_handshake(ldcp); 3275 return (1); 3276 } 3277 break; 3278 3279 default: 3280 DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)", 3281 
ldcp->ldc_id, flag); 3282 return (1); 3283 } 3284 3285 if (dir == INBOUND) 3286 ldcp->lane_in.lstate = state; 3287 else 3288 ldcp->lane_out.lstate = state; 3289 3290 D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id); 3291 3292 return (0); 3293 } 3294 3295 void 3296 vsw_next_milestone(vsw_ldc_t *ldcp) 3297 { 3298 vsw_t *vswp = ldcp->ldc_vswp; 3299 3300 D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__, 3301 ldcp->ldc_id, ldcp->hphase); 3302 3303 DUMP_FLAGS(ldcp->lane_in.lstate); 3304 DUMP_FLAGS(ldcp->lane_out.lstate); 3305 3306 switch (ldcp->hphase) { 3307 3308 case VSW_MILESTONE0: 3309 /* 3310 * If we haven't started to handshake with our peer, 3311 * start to do so now. 3312 */ 3313 if (ldcp->lane_out.lstate == 0) { 3314 D2(vswp, "%s: (chan %lld) starting handshake " 3315 "with peer", __func__, ldcp->ldc_id); 3316 vsw_restart_handshake(ldcp); 3317 } 3318 3319 /* 3320 * Only way to pass this milestone is to have successfully 3321 * negotiated version info. 3322 */ 3323 if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) && 3324 (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) { 3325 3326 D2(vswp, "%s: (chan %lld) leaving milestone 0", 3327 __func__, ldcp->ldc_id); 3328 3329 /* 3330 * Next milestone is passed when attribute 3331 * information has been successfully exchanged. 3332 */ 3333 ldcp->hphase = VSW_MILESTONE1; 3334 vsw_send_attr(ldcp); 3335 3336 } 3337 break; 3338 3339 case VSW_MILESTONE1: 3340 /* 3341 * Only way to pass this milestone is to have successfully 3342 * negotiated attribute information. 3343 */ 3344 if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) { 3345 3346 ldcp->hphase = VSW_MILESTONE2; 3347 3348 /* 3349 * If the peer device has said it wishes to 3350 * use descriptor rings then we send it our ring 3351 * info, otherwise we just set up a private ring 3352 * which uses an internal buffer. 3353 */ 3354 if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) 3355 vsw_send_dring_info(ldcp); 3356 } 3357 break; 3358 3359 3360 case VSW_MILESTONE2: 3361 /* 3362 * If peer has indicated in its attribute message that 3363 * it wishes to use descriptor rings then the only way 3364 * to pass this milestone is for us to have received 3365 * valid dring info. 3366 * 3367 * If peer is not using descriptor rings then just fall 3368 * through. 3369 */ 3370 if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) && 3371 (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))) 3372 break; 3373 3374 D2(vswp, "%s: (chan %lld) leaving milestone 2", 3375 __func__, ldcp->ldc_id); 3376 3377 ldcp->hphase = VSW_MILESTONE3; 3378 vsw_send_rdx(ldcp); 3379 break; 3380 3381 case VSW_MILESTONE3: 3382 /* 3383 * Pass this milestone when all parameters have been 3384 * successfully exchanged and RDX sent in both directions. 3385 * 3386 * Mark outbound lane as available to transmit data. 
3387 */ 3388 if ((ldcp->lane_in.lstate & VSW_RDX_ACK_SENT) && 3389 (ldcp->lane_out.lstate & VSW_RDX_ACK_RECV)) { 3390 3391 D2(vswp, "%s: (chan %lld) leaving milestone 3", 3392 __func__, ldcp->ldc_id); 3393 D2(vswp, "%s: ** handshake complete **", __func__); 3394 ldcp->lane_out.lstate |= VSW_LANE_ACTIVE; 3395 ldcp->hphase = VSW_MILESTONE4; 3396 ldcp->hcnt = 0; 3397 DISPLAY_STATE(); 3398 } 3399 break; 3400 3401 case VSW_MILESTONE4: 3402 D2(vswp, "%s: (chan %lld) in milestone 4", __func__, 3403 ldcp->ldc_id); 3404 break; 3405 3406 default: 3407 DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__, 3408 ldcp->ldc_id, ldcp->hphase); 3409 } 3410 3411 D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id, 3412 ldcp->hphase); 3413 } 3414 3415 /* 3416 * Check if major version is supported. 3417 * 3418 * Returns 0 if it finds a supported major number, adjusting the 3419 * minor field if necessary. 3420 * 3421 * Returns 1 if it can't match the major number exactly. Sets major/minor 3422 * to the next lowest supported values, or to zero if no other values are possible. 3423 */ 3424 static int 3425 vsw_supported_version(vio_ver_msg_t *vp) 3426 { 3427 int i; 3428 3429 D1(NULL, "vsw_supported_version: enter"); 3430 3431 for (i = 0; i < VSW_NUM_VER; i++) { 3432 if (vsw_versions[i].ver_major == vp->ver_major) { 3433 /* 3434 * Matching major version found. Update 3435 * minor number if necessary. 3436 */ 3437 if (vp->ver_minor > vsw_versions[i].ver_minor) { 3438 D2(NULL, "%s: adjusting minor value" 3439 " from %d to %d", __func__, 3440 vp->ver_minor, 3441 vsw_versions[i].ver_minor); 3442 vp->ver_minor = vsw_versions[i].ver_minor; 3443 } 3444 3445 return (0); 3446 } 3447 3448 if (vsw_versions[i].ver_major < vp->ver_major) { 3449 /* 3450 * Next lowest major version we support; propose 3451 * both its major and minor values. 3452 */ 3453 D2(NULL, "%s: adjusting to %d:%d", __func__, 3454 vsw_versions[i].ver_major, 3455 vsw_versions[i].ver_minor); 3456 vp->ver_major = vsw_versions[i].ver_major; vp->ver_minor = vsw_versions[i].ver_minor; return (1); 3457 } 3458 } 3459 3460 /* No match was possible, zero out fields */ 3461 vp->ver_major = 0; 3462 vp->ver_minor = 0; 3463 3464 D1(NULL, "vsw_supported_version: exit"); 3465 3466 return (1); 3467 } 3468 3469 /* 3470 * Main routine for processing messages received over LDC. 3471 */ 3472 static void 3473 vsw_process_pkt(void *arg) 3474 { 3475 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; 3476 vsw_t *vswp = ldcp->ldc_vswp; 3477 size_t msglen; 3478 vio_msg_tag_t tag; 3479 def_msg_t dmsg; 3480 int rv = 0; 3481 3482 D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); 3483 3484 /* 3485 * If channel is up read messages until channel is empty. 3486 */ 3487 do { 3488 msglen = sizeof (dmsg); 3489 rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen); 3490 3491 if (rv != 0) { 3492 DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) " 3493 "len(%d)\n", __func__, ldcp->ldc_id, 3494 rv, msglen); 3495 break; 3496 } 3497 3498 if (msglen == 0) { 3499 D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__, 3500 ldcp->ldc_id); 3501 break; 3502 } 3503 3504 D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__, 3505 ldcp->ldc_id, msglen); 3506 3507 /* 3508 * Figure out what sort of packet we have gotten by 3509 * examining the msg tag, and then switch it appropriately. 
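 * (CTRL packets are dispatched to the taskq for processing, while
 * DATA and ERR packets are handled in line below.)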
3510 */ 3511 bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t)); 3512 3513 switch (tag.vio_msgtype) { 3514 case VIO_TYPE_CTRL: 3515 vsw_dispatch_ctrl_task(ldcp, &dmsg, tag); 3516 break; 3517 case VIO_TYPE_DATA: 3518 vsw_process_data_pkt(ldcp, &dmsg, tag); 3519 break; 3520 case VIO_TYPE_ERR: 3521 vsw_process_err_pkt(ldcp, &dmsg, tag); 3522 break; 3523 default: 3524 DERR(vswp, "%s: Unknown tag(%lx) id(%lx)\n", 3525 __func__, tag.vio_msgtype, ldcp->ldc_id); 3526 break; 3527 } 3528 } while (msglen); 3529 3530 D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id); 3531 } 3532 3533 /* 3534 * Dispatch a task to process a VIO control message. 3535 */ 3536 static void 3537 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag) 3538 { 3539 vsw_ctrl_task_t *ctaskp = NULL; 3540 vsw_port_t *port = ldcp->ldc_port; 3541 vsw_t *vswp = port->p_vswp; 3542 3543 D1(vswp, "%s: enter", __func__); 3544 3545 /* 3546 * We need to handle RDX ACK messages in-band as once they 3547 * are exchanged it is possible that we will get an 3548 * immediate (legitimate) data packet. 3549 */ 3550 if ((tag.vio_subtype_env == VIO_RDX) && 3551 (tag.vio_subtype == VIO_SUBTYPE_ACK)) { 3552 if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_ACK_RECV)) 3553 return; 3554 3555 ldcp->lane_out.lstate |= VSW_RDX_ACK_RECV; 3556 vsw_next_milestone(ldcp); 3557 D2(vswp, "%s (%ld) handling RDX_ACK in place", __func__, 3558 ldcp->ldc_id); 3559 return; 3560 } 3561 3562 ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP); 3563 3564 if (ctaskp == NULL) { 3565 DERR(vswp, "%s: unable to alloc space for ctrl" 3566 " msg", __func__); 3567 vsw_restart_handshake(ldcp); 3568 return; 3569 } 3570 3571 ctaskp->ldcp = ldcp; 3572 bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t)); 3573 mutex_enter(&ldcp->hss_lock); 3574 ctaskp->hss_id = ldcp->hss_id; 3575 mutex_exit(&ldcp->hss_lock); 3576 3577 /* 3578 * Dispatch task to processing taskq if port is not in 3579 * the process of being detached. 3580 */ 3581 mutex_enter(&port->state_lock); 3582 if (port->state == VSW_PORT_INIT) { 3583 if ((vswp->taskq_p == NULL) || 3584 (ddi_taskq_dispatch(vswp->taskq_p, 3585 vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP) 3586 != DDI_SUCCESS)) { 3587 DERR(vswp, "%s: unable to dispatch task to taskq", 3588 __func__); 3589 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 3590 mutex_exit(&port->state_lock); 3591 vsw_restart_handshake(ldcp); 3592 return; 3593 } 3594 } else { 3595 DWARN(vswp, "%s: port %d detaching, not dispatching " 3596 "task", __func__, port->p_instance); kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 3597 } 3598 3599 mutex_exit(&port->state_lock); 3600 3601 D2(vswp, "%s: dispatched task to taskq for chan %d", __func__, 3602 ldcp->ldc_id); 3603 D1(vswp, "%s: exit", __func__); 3604 } 3605 3606 /* 3607 * Process a VIO ctrl message. Invoked from taskq. 
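 *
 * Packets belonging to an earlier handshake session are discarded
 * and the peer's session id is validated before we switch on the
 * subtype envelope.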
3608 */ 3609 static void 3610 vsw_process_ctrl_pkt(void *arg) 3611 { 3612 vsw_ctrl_task_t *ctaskp = (vsw_ctrl_task_t *)arg; 3613 vsw_ldc_t *ldcp = ctaskp->ldcp; 3614 vsw_t *vswp = ldcp->ldc_vswp; 3615 vio_msg_tag_t tag; 3616 uint16_t env; 3617 3618 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 3619 3620 bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t)); 3621 env = tag.vio_subtype_env; 3622 3623 /* stale pkt check */ 3624 mutex_enter(&ldcp->hss_lock); 3625 if (ctaskp->hss_id < ldcp->hss_id) { 3626 DWARN(vswp, "%s: discarding stale packet belonging to" 3627 " earlier (%ld) handshake session", __func__, 3628 ctaskp->hss_id); 3629 mutex_exit(&ldcp->hss_lock); kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 3630 return; 3631 } 3632 mutex_exit(&ldcp->hss_lock); 3633 3634 /* session id check */ 3635 if (ldcp->session_status & VSW_PEER_SESSION) { 3636 if (ldcp->peer_session != tag.vio_sid) { 3637 DERR(vswp, "%s (chan %d): invalid session id (%llx)", 3638 __func__, ldcp->ldc_id, tag.vio_sid); 3639 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 3640 vsw_restart_handshake(ldcp); 3641 return; 3642 } 3643 } 3644 3645 /* 3646 * Switch on vio_subtype envelope, then let lower routines 3647 * decide if it's an INFO, ACK or NACK packet. 3648 */ 3649 switch (env) { 3650 case VIO_VER_INFO: 3651 vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp); 3652 break; 3653 case VIO_DRING_REG: 3654 vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp); 3655 break; 3656 case VIO_DRING_UNREG: 3657 vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp); 3658 break; 3659 case VIO_ATTR_INFO: 3660 vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp); 3661 break; 3662 case VNET_MCAST_INFO: 3663 vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp); 3664 break; 3665 case VIO_RDX: 3666 vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp); 3667 break; 3668 default: 3669 DERR(vswp, "%s : unknown vio_subtype_env (%x)\n", 3670 __func__, env); 3671 } 3672 3673 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 3674 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 3675 } 3676 3677 /* 3678 * Version negotiation. We can end up here either because our peer 3679 * has responded to a handshake message we have sent it, or our peer 3680 * has initiated a handshake with us. If it's the former it can only 3681 * be an ACK or NACK; if it's the latter it can only be an INFO. 3682 * 3683 * If it's an ACK we move to the next stage of the handshake, namely 3684 * attribute exchange. If it's a NACK we see if we can specify another 3685 * version; if we can't, we stop. 3686 * 3687 * If it is an INFO we reset all params associated with communication 3688 * in that direction over this channel (remember the connection is 3689 * essentially 2 independent simplex channels). 3690 */ 3691 void 3692 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt) 3693 { 3694 vio_ver_msg_t *ver_pkt; 3695 vsw_t *vswp = ldcp->ldc_vswp; 3696 3697 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 3698 3699 /* 3700 * We know this is a ctrl/version packet so 3701 * cast it into the correct structure. 3702 */ 3703 ver_pkt = (vio_ver_msg_t *)pkt; 3704 3705 switch (ver_pkt->tag.vio_subtype) { 3706 case VIO_SUBTYPE_INFO: 3707 D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n"); 3708 3709 /* 3710 * Record the session id, which we will use from now 3711 * until we see another VER_INFO msg. Even then the 3712 * session id in most cases will be unchanged, except 3713 * if the channel was reset. 
3714 */ 3715 if ((ldcp->session_status & VSW_PEER_SESSION) && 3716 (ldcp->peer_session != ver_pkt->tag.vio_sid)) { 3717 DERR(vswp, "%s: updating session id for chan %lld " 3718 "from %llx to %llx", __func__, ldcp->ldc_id, 3719 ldcp->peer_session, ver_pkt->tag.vio_sid); 3720 } 3721 3722 ldcp->peer_session = ver_pkt->tag.vio_sid; 3723 ldcp->session_status |= VSW_PEER_SESSION; 3724 3725 /* Legal message at this time ? */ 3726 if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV)) 3727 return; 3728 3729 /* 3730 * First check the device class. Currently only expect 3731 * to be talking to a network device. In the future may 3732 * also talk to another switch. 3733 */ 3734 if (ver_pkt->dev_class != VDEV_NETWORK) { 3735 DERR(vswp, "%s: illegal device class %d", __func__, 3736 ver_pkt->dev_class); 3737 3738 ver_pkt->tag.vio_sid = ldcp->local_session; 3739 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 3740 3741 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); 3742 3743 vsw_send_msg(ldcp, (void *)ver_pkt, 3744 sizeof (vio_ver_msg_t)); 3745 3746 ldcp->lane_in.lstate |= VSW_VER_NACK_SENT; 3747 vsw_next_milestone(ldcp); 3748 return; 3749 } else { 3750 ldcp->dev_class = ver_pkt->dev_class; 3751 } 3752 3753 /* 3754 * Now check the version. 3755 */ 3756 if (vsw_supported_version(ver_pkt) == 0) { 3757 /* 3758 * Support this major version and possibly 3759 * adjusted minor version. 3760 */ 3761 3762 D2(vswp, "%s: accepted ver %d:%d", __func__, 3763 ver_pkt->ver_major, ver_pkt->ver_minor); 3764 3765 /* Store accepted values */ 3766 ldcp->lane_in.ver_major = ver_pkt->ver_major; 3767 ldcp->lane_in.ver_minor = ver_pkt->ver_minor; 3768 3769 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 3770 3771 ldcp->lane_in.lstate |= VSW_VER_ACK_SENT; 3772 } else { 3773 /* 3774 * NACK back with the next lower major/minor 3775 * pairing we support (if we don't support any more 3776 * versions they will be set to zero). 3777 */ 3778 3779 D2(vswp, "%s: replying with ver %d:%d", __func__, 3780 ver_pkt->ver_major, ver_pkt->ver_minor); 3781 3782 /* Store updated values */ 3783 ldcp->lane_in.ver_major = ver_pkt->ver_major; 3784 ldcp->lane_in.ver_minor = ver_pkt->ver_minor; 3785 3786 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 3787 3788 ldcp->lane_in.lstate |= VSW_VER_NACK_SENT; 3789 } 3790 3791 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); 3792 ver_pkt->tag.vio_sid = ldcp->local_session; 3793 vsw_send_msg(ldcp, (void *)ver_pkt, sizeof (vio_ver_msg_t)); 3794 3795 vsw_next_milestone(ldcp); 3796 break; 3797 3798 case VIO_SUBTYPE_ACK: 3799 D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__); 3800 3801 if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV)) 3802 return; 3803 3804 /* Store the values agreed for our outbound lane */ 3805 ldcp->lane_out.ver_major = ver_pkt->ver_major; 3806 ldcp->lane_out.ver_minor = ver_pkt->ver_minor; 3807 3808 3809 ldcp->lane_out.lstate |= VSW_VER_ACK_RECV; 3810 vsw_next_milestone(ldcp); 3811 3812 break; 3813 3814 case VIO_SUBTYPE_NACK: 3815 D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__); 3816 3817 if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV)) 3818 return; 3819 3820 /* 3821 * If our peer sent us a NACK with the ver fields set to 3822 * zero then there is nothing more we can do. Otherwise see 3823 * if we support either the version suggested, or a lesser 3824 * one. 
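 *
 * For example, if the peer NACKs our proposal of 2.0 and we also
 * support 1.0, vsw_supported_version() rewrites the message to 1.0
 * and we re-send it as an INFO (version numbers illustrative only).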
3825 */ 3826 if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) { 3827 DERR(vswp, "%s: peer unable to negotiate any " 3828 "further.", __func__); 3829 ldcp->lane_out.lstate |= VSW_VER_NACK_RECV; 3830 vsw_next_milestone(ldcp); 3831 return; 3832 } 3833 3834 /* 3835 * Check to see if we support this major version or 3836 * a lower one. If we don't then maj/min will be set 3837 * to zero. 3838 */ 3839 (void) vsw_supported_version(ver_pkt); 3840 if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) { 3841 /* Nothing more we can do */ 3842 DERR(vswp, "%s: version negotiation failed.\n", 3843 __func__); 3844 ldcp->lane_out.lstate |= VSW_VER_NACK_RECV; 3845 vsw_next_milestone(ldcp); 3846 } else { 3847 /* found a supported major version */ 3848 ldcp->lane_out.ver_major = ver_pkt->ver_major; 3849 ldcp->lane_out.ver_minor = ver_pkt->ver_minor; 3850 3851 D2(vswp, "%s: resending with updated values (%x, %x)", 3852 __func__, ver_pkt->ver_major, 3853 ver_pkt->ver_minor); 3854 3855 ldcp->lane_out.lstate |= VSW_VER_INFO_SENT; 3856 ver_pkt->tag.vio_sid = ldcp->local_session; 3857 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; 3858 3859 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); 3860 3861 vsw_send_msg(ldcp, (void *)ver_pkt, 3862 sizeof (vio_ver_msg_t)); 3863 3864 vsw_next_milestone(ldcp); 3865 3866 } 3867 break; 3868 3869 default: 3870 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 3871 ver_pkt->tag.vio_subtype); 3872 } 3873 3874 D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); 3875 } 3876 3877 /* 3878 * Process an attribute packet. We can end up here either because our peer 3879 * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our 3880 * peer has sent us an attribute INFO message. 3881 * 3882 * If it's an ACK, we then move to the next stage of the handshake, which 3883 * is to send our descriptor ring info to our peer. If it's a NACK, then 3884 * there is nothing more we can (currently) do. 3885 * 3886 * If we get a valid/acceptable INFO packet (and we have already negotiated 3887 * a version) we ACK back and set channel state to ATTR_RECV, otherwise we 3888 * NACK back and reset channel state to INACTIVE. 3889 * 3890 * FUTURE: in time we will probably negotiate over attributes, but for 3891 * the moment unacceptable attributes are regarded as a fatal error. 3892 * 3893 */ 3894 void 3895 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt) 3896 { 3897 vnet_attr_msg_t *attr_pkt; 3898 vsw_t *vswp = ldcp->ldc_vswp; 3899 vsw_port_t *port = ldcp->ldc_port; 3900 uint64_t macaddr = 0; 3901 int i; 3902 3903 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 3904 3905 /* 3906 * We know this is a ctrl/attr packet so 3907 * cast it into the correct structure. 3908 */ 3909 attr_pkt = (vnet_attr_msg_t *)pkt; 3910 3911 switch (attr_pkt->tag.vio_subtype) { 3912 case VIO_SUBTYPE_INFO: 3913 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 3914 3915 if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) 3916 return; 3917 3918 /* 3919 * If the attributes are unacceptable then we NACK back. 
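 * (vsw_check_attr() makes that decision; a mismatched MTU or an
 * unsupported address type would be typical grounds for rejection.)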
3920 */ 3921 if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) { 3922 3923 DERR(vswp, "%s (chan %d): invalid attributes", 3924 __func__, ldcp->ldc_id); 3925 3926 vsw_free_lane_resources(ldcp, INBOUND); 3927 3928 attr_pkt->tag.vio_sid = ldcp->local_session; 3929 attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 3930 3931 DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt); 3932 ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT; 3933 vsw_send_msg(ldcp, (void *)attr_pkt, 3934 sizeof (vnet_attr_msg_t)); 3935 3936 vsw_next_milestone(ldcp); 3937 return; 3938 } 3939 3940 /* 3941 * Otherwise store attributes for this lane and update 3942 * lane state. 3943 */ 3944 ldcp->lane_in.mtu = attr_pkt->mtu; 3945 ldcp->lane_in.addr = attr_pkt->addr; 3946 ldcp->lane_in.addr_type = attr_pkt->addr_type; 3947 ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode; 3948 ldcp->lane_in.ack_freq = attr_pkt->ack_freq; 3949 3950 macaddr = ldcp->lane_in.addr; 3951 for (i = ETHERADDRL - 1; i >= 0; i--) { 3952 port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF; 3953 macaddr >>= 8; 3954 } 3955 3956 /* create the fdb entry for this port/mac address */ 3957 (void) vsw_add_fdb(vswp, port); 3958 3959 /* set up device-specific xmit routines */ 3960 mutex_enter(&port->tx_lock); 3961 if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) { 3962 D2(vswp, "%s: mode = VIO_DRING_MODE", __func__); 3963 port->transmit = vsw_dringsend; 3964 } else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) { 3965 D2(vswp, "%s: mode = VIO_DESC_MODE", __func__); 3966 vsw_create_privring(ldcp); 3967 port->transmit = vsw_descrsend; 3968 } 3969 mutex_exit(&port->tx_lock); 3970 3971 attr_pkt->tag.vio_sid = ldcp->local_session; 3972 attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 3973 3974 DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt); 3975 3976 ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT; 3977 3978 vsw_send_msg(ldcp, (void *)attr_pkt, 3979 sizeof (vnet_attr_msg_t)); 3980 3981 vsw_next_milestone(ldcp); 3982 break; 3983 3984 case VIO_SUBTYPE_ACK: 3985 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 3986 3987 if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV)) 3988 return; 3989 3990 ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV; 3991 vsw_next_milestone(ldcp); 3992 break; 3993 3994 case VIO_SUBTYPE_NACK: 3995 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 3996 3997 if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV)) 3998 return; 3999 4000 ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV; 4001 vsw_next_milestone(ldcp); 4002 break; 4003 4004 default: 4005 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 4006 attr_pkt->tag.vio_subtype); 4007 } 4008 4009 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 4010 } 4011 4012 /* 4013 * Process a dring info packet. We can end up here either because our peer 4014 * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our 4015 * peer has sent us a dring INFO message. 4016 * 4017 * If we get a valid/acceptable INFO packet (and we have already negotiated 4018 * a version) we ACK back and update the lane state, otherwise we NACK back. 4019 * 4020 * FUTURE: nothing to stop client from sending us info on multiple drings, 4021 * but for the moment we will just use the first one we are given. 4022 * 4023 */ 4024 void 4025 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt) 4026 { 4027 vio_dring_reg_msg_t *dring_pkt; 4028 vsw_t *vswp = ldcp->ldc_vswp; 4029 ldc_mem_info_t minfo; 4030 dring_info_t *dp, *dbp; 4031 int dring_found = 0; 4032 4033 /* 4034 * We know this is a ctrl/dring packet so 4035 * cast it into the correct structure. 
4036 */ 4037 dring_pkt = (vio_dring_reg_msg_t *)pkt; 4038 4039 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 4040 4041 switch (dring_pkt->tag.vio_subtype) { 4042 case VIO_SUBTYPE_INFO: 4043 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 4044 4045 if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV)) 4046 return; 4047 4048 /* 4049 * If the dring params are unacceptable then we NACK back. 4050 */ 4051 if (vsw_check_dring_info(dring_pkt)) { 4052 4053 DERR(vswp, "%s (%lld): invalid dring info", 4054 __func__, ldcp->ldc_id); 4055 4056 vsw_free_lane_resources(ldcp, INBOUND); 4057 4058 dring_pkt->tag.vio_sid = ldcp->local_session; 4059 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4060 4061 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 4062 4063 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 4064 4065 vsw_send_msg(ldcp, (void *)dring_pkt, 4066 sizeof (vio_dring_reg_msg_t)); 4067 4068 vsw_next_milestone(ldcp); 4069 return; 4070 } 4071 4072 /* 4073 * Otherwise, attempt to map in the dring using the 4074 * cookie. If that succeeds we send back a unique dring 4075 * identifier that the sending side will use in future 4076 * to refer to this descriptor ring. 4077 */ 4078 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 4079 4080 dp->num_descriptors = dring_pkt->num_descriptors; 4081 dp->descriptor_size = dring_pkt->descriptor_size; 4082 dp->options = dring_pkt->options; 4083 dp->ncookies = dring_pkt->ncookies; 4084 4085 /* 4086 * Note: should only get one cookie. Enforced in 4087 * the ldc layer. 4088 */ 4089 bcopy(&dring_pkt->cookie[0], &dp->cookie[0], 4090 sizeof (ldc_mem_cookie_t)); 4091 4092 D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__, 4093 dp->num_descriptors, dp->descriptor_size); 4094 D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__, 4095 dp->options, dp->ncookies); 4096 4097 if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0], 4098 dp->ncookies, dp->num_descriptors, 4099 dp->descriptor_size, LDC_SHADOW_MAP, 4100 &(dp->handle))) != 0) { 4101 4102 DERR(vswp, "%s: dring_map failed\n", __func__); 4103 4104 kmem_free(dp, sizeof (dring_info_t)); 4105 vsw_free_lane_resources(ldcp, INBOUND); 4106 4107 dring_pkt->tag.vio_sid = ldcp->local_session; 4108 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4109 4110 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 4111 4112 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 4113 vsw_send_msg(ldcp, (void *)dring_pkt, 4114 sizeof (vio_dring_reg_msg_t)); 4115 4116 vsw_next_milestone(ldcp); 4117 return; 4118 } 4119 4120 if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) { 4121 4122 DERR(vswp, "%s: dring_addr failed\n", __func__); 4123 4124 kmem_free(dp, sizeof (dring_info_t)); 4125 vsw_free_lane_resources(ldcp, INBOUND); 4126 4127 dring_pkt->tag.vio_sid = ldcp->local_session; 4128 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4129 4130 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 4131 4132 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 4133 vsw_send_msg(ldcp, (void *)dring_pkt, 4134 sizeof (vio_dring_reg_msg_t)); 4135 4136 vsw_next_milestone(ldcp); 4137 return; 4138 } else { 4139 /* store the address of the pub part of ring */ 4140 dp->pub_addr = minfo.vaddr; 4141 } 4142 4143 /* no private section as we are importing */ 4144 dp->priv_addr = NULL; 4145 4146 /* 4147 * Using a simple monotonically increasing int for the 4148 * ident at the moment. 4149 */ 4150 dp->ident = ldcp->next_ident; 4151 ldcp->next_ident++; 4152 4153 dp->end_idx = 0; 4154 dp->next = NULL; 4155 4156 /* 4157 * Link it onto the end of the list of drings 4158 * for this lane. 
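 * The ident assigned above is returned to the peer in the ACK
 * below, and the peer quotes it in subsequent dring messages to
 * identify this ring.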
4159 */ 4160 if (ldcp->lane_in.dringp == NULL) { 4161 D2(vswp, "%s: adding first INBOUND dring", __func__); 4162 ldcp->lane_in.dringp = dp; 4163 } else { 4164 dbp = ldcp->lane_in.dringp; 4165 4166 while (dbp->next != NULL) 4167 dbp = dbp->next; 4168 4169 dbp->next = dp; 4170 } 4171 4172 /* acknowledge it */ 4173 dring_pkt->tag.vio_sid = ldcp->local_session; 4174 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4175 dring_pkt->dring_ident = dp->ident; 4176 4177 vsw_send_msg(ldcp, (void *)dring_pkt, 4178 sizeof (vio_dring_reg_msg_t)); 4179 4180 ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT; 4181 vsw_next_milestone(ldcp); 4182 break; 4183 4184 case VIO_SUBTYPE_ACK: 4185 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 4186 4187 if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV)) 4188 return; 4189 4190 /* 4191 * Peer is acknowledging our dring info and will have 4192 * sent us a dring identifier which we will use to 4193 * refer to this ring w.r.t. our peer. 4194 */ 4195 dp = ldcp->lane_out.dringp; 4196 if (dp != NULL) { 4197 /* 4198 * Find the ring this ident should be associated 4199 * with. 4200 */ 4201 if (vsw_dring_match(dp, dring_pkt)) { 4202 dring_found = 1; 4203 4204 } else while (dp != NULL) { 4205 if (vsw_dring_match(dp, dring_pkt)) { 4206 dring_found = 1; 4207 break; 4208 } 4209 dp = dp->next; 4210 } 4211 4212 if (dring_found == 0) { 4213 DERR(NULL, "%s: unrecognised ring cookie", 4214 __func__); 4215 vsw_restart_handshake(ldcp); 4216 return; 4217 } 4218 4219 } else { 4220 DERR(vswp, "%s: DRING ACK received but no drings " 4221 "allocated", __func__); 4222 vsw_restart_handshake(ldcp); 4223 return; 4224 } 4225 4226 /* store ident */ 4227 dp->ident = dring_pkt->dring_ident; 4228 ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV; 4229 vsw_next_milestone(ldcp); 4230 break; 4231 4232 case VIO_SUBTYPE_NACK: 4233 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 4234 4235 if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV)) 4236 return; 4237 4238 ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV; 4239 vsw_next_milestone(ldcp); 4240 break; 4241 4242 default: 4243 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 4244 dring_pkt->tag.vio_subtype); 4245 } 4246 4247 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 4248 } 4249 4250 /* 4251 * Process a request from peer to unregister a dring. 4252 * 4253 * For the moment we just restart the handshake if our 4254 * peer endpoint attempts to unregister a dring. 4255 */ 4256 void 4257 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt) 4258 { 4259 vsw_t *vswp = ldcp->ldc_vswp; 4260 vio_dring_unreg_msg_t *dring_pkt; 4261 4262 /* 4263 * We know this is a ctrl/dring packet so 4264 * cast it into the correct structure. 
4265 */ 4266 dring_pkt = (vio_dring_unreg_msg_t *)pkt; 4267 4268 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4269 4270 switch (dring_pkt->tag.vio_subtype) { 4271 case VIO_SUBTYPE_INFO: 4272 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 4273 4274 DWARN(vswp, "%s: restarting handshake..", __func__); 4275 vsw_restart_handshake(ldcp); 4276 break; 4277 4278 case VIO_SUBTYPE_ACK: 4279 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 4280 4281 DWARN(vswp, "%s: restarting handshake..", __func__); 4282 vsw_restart_handshake(ldcp); 4283 break; 4284 4285 case VIO_SUBTYPE_NACK: 4286 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 4287 4288 DWARN(vswp, "%s: restarting handshake..", __func__); 4289 vsw_restart_handshake(ldcp); 4290 break; 4291 4292 default: 4293 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 4294 dring_pkt->tag.vio_subtype); 4295 vsw_restart_handshake(ldcp); 4296 } 4297 4298 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 4299 } 4300 4301 #define SND_MCST_NACK(ldcp, pkt) \ 4302 pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ 4303 pkt->tag.vio_sid = ldcp->local_session; \ 4304 vsw_send_msg(ldcp, (void *)pkt, sizeof (vnet_mcast_msg_t)); 4305 4306 /* 4307 * Process a multicast request from a vnet. 4308 * 4309 * Vnet's specify a multicast address that they are interested in. This 4310 * address is used as a key into the hash table which forms the multicast 4311 * forwarding database (mFDB). 4312 * 4313 * The table keys are the multicast addresses, while the table entries 4314 * are pointers to lists of ports which wish to receive packets for the 4315 * specified multicast address. 4316 * 4317 * When a multicast packet is being switched we use the address as a key 4318 * into the hash table, and then walk the appropriate port list forwarding 4319 * the pkt to each port in turn. 4320 * 4321 * If a vnet is no longer interested in a particular multicast grouping 4322 * we simply find the correct location in the hash table and then delete 4323 * the relevant port from the port list. 4324 * 4325 * To deal with the case whereby a port is being deleted without first 4326 * removing itself from the lists in the hash table, we maintain a list 4327 * of multicast addresses the port has registered an interest in, within 4328 * the port structure itself. We then simply walk that list of addresses 4329 * using them as keys into the hash table and remove the port from the 4330 * appropriate lists. 4331 */ 4332 static void 4333 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt) 4334 { 4335 vnet_mcast_msg_t *mcst_pkt; 4336 vsw_port_t *port = ldcp->ldc_port; 4337 vsw_t *vswp = ldcp->ldc_vswp; 4338 int i; 4339 4340 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4341 4342 /* 4343 * We know this is a ctrl/mcast packet so 4344 * cast it into the correct structure. 4345 */ 4346 mcst_pkt = (vnet_mcast_msg_t *)pkt; 4347 4348 switch (mcst_pkt->tag.vio_subtype) { 4349 case VIO_SUBTYPE_INFO: 4350 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 4351 4352 /* 4353 * Check if in correct state to receive a multicast 4354 * message (i.e. handshake complete). If not reset 4355 * the handshake. 4356 */ 4357 if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV)) 4358 return; 4359 4360 /* 4361 * Before attempting to add or remove address check 4362 * that they are valid multicast addresses. 4363 * If not, then NACK back. 
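 *
 * (For illustration only: the validity test below relies on IEEE 802
 * addressing, where the least significant bit of the first octet is
 * the group bit - set for multicast/broadcast, clear for unicast. A
 * standalone sketch of the same check:
 *
 *	#include <stdint.h>
 *
 *	static int
 *	is_ether_group_addr(const uint8_t *mac)
 *	{
 *		return ((mac[0] & 0x01) != 0);
 *	}
 *
 * 01:00:5e:00:00:01 passes this test; 00:14:4f:00:00:01 does not.)
 *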
4364 */ 4365 for (i = 0; i < mcst_pkt->count; i++) { 4366 if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) { 4367 DERR(vswp, "%s: invalid multicast address", 4368 __func__); 4369 SND_MCST_NACK(ldcp, mcst_pkt); 4370 return; 4371 } 4372 } 4373 4374 /* 4375 * Now add/remove the addresses. If this fails we 4376 * NACK back. 4377 */ 4378 if (vsw_add_rem_mcst(mcst_pkt, port) != 0) { 4379 SND_MCST_NACK(ldcp, mcst_pkt); 4380 return; 4381 } 4382 4383 mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4384 mcst_pkt->tag.vio_sid = ldcp->local_session; 4385 4386 DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt); 4387 4388 vsw_send_msg(ldcp, (void *)mcst_pkt, 4389 sizeof (vnet_mcast_msg_t)); 4390 break; 4391 4392 case VIO_SUBTYPE_ACK: 4393 DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 4394 4395 /* 4396 * We shouldn't ever get a multicast ACK message as 4397 * at the moment we never request multicast addresses 4398 * to be set on some other device. This may change in 4399 * the future if we have cascading switches. 4400 */ 4401 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV)) 4402 return; 4403 4404 /* Do nothing */ 4405 break; 4406 4407 case VIO_SUBTYPE_NACK: 4408 DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 4409 4410 /* 4411 * We shouldn't get a multicast NACK packet for the 4412 * same reasons as we shouldn't get a ACK packet. 4413 */ 4414 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV)) 4415 return; 4416 4417 /* Do nothing */ 4418 break; 4419 4420 default: 4421 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 4422 mcst_pkt->tag.vio_subtype); 4423 } 4424 4425 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 4426 } 4427 4428 static void 4429 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt) 4430 { 4431 vio_rdx_msg_t *rdx_pkt; 4432 vsw_t *vswp = ldcp->ldc_vswp; 4433 4434 /* 4435 * We know this is a ctrl/rdx packet so 4436 * cast it into the correct structure. 4437 */ 4438 rdx_pkt = (vio_rdx_msg_t *)pkt; 4439 4440 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 4441 4442 switch (rdx_pkt->tag.vio_subtype) { 4443 case VIO_SUBTYPE_INFO: 4444 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 4445 4446 if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_INFO_RECV)) 4447 return; 4448 4449 rdx_pkt->tag.vio_sid = ldcp->local_session; 4450 rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4451 4452 DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt); 4453 4454 ldcp->lane_in.lstate |= VSW_RDX_ACK_SENT; 4455 4456 vsw_send_msg(ldcp, (void *)rdx_pkt, 4457 sizeof (vio_rdx_msg_t)); 4458 4459 vsw_next_milestone(ldcp); 4460 break; 4461 4462 case VIO_SUBTYPE_ACK: 4463 /* 4464 * Should be handled in-band by callback handler. 
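 *
 * (A note on the SND_MCST_NACK macro defined above, and on its
 * SND_DRING_NACK / SND_IBND_DESC_NACK siblings later in this file:
 * each expands to three statements, so these macros are only safe
 * where the call site sits inside a braced block, as is the case at
 * every current use. The conventional hardening is a do/while(0)
 * wrapper; an equivalent sketch:
 *
 *	#define	SND_MCST_NACK(ldcp, pkt)				\
 *		do {							\
 *			(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK;	\
 *			(pkt)->tag.vio_sid = (ldcp)->local_session;	\
 *			vsw_send_msg((ldcp), (void *)(pkt),		\
 *			    sizeof (vnet_mcast_msg_t));			\
 *		} while (0)
 *
 * This form behaves as a single statement even after an unbraced if.)
 *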
4465 */ 4466 DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__); 4467 vsw_restart_handshake(ldcp); 4468 break; 4469 4470 case VIO_SUBTYPE_NACK: 4471 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 4472 4473 if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_NACK_RECV)) 4474 return; 4475 4476 ldcp->lane_out.lstate |= VSW_RDX_NACK_RECV; 4477 vsw_next_milestone(ldcp); 4478 break; 4479 4480 default: 4481 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 4482 rdx_pkt->tag.vio_subtype); 4483 } 4484 4485 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 4486 } 4487 4488 static void 4489 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag) 4490 { 4491 uint16_t env = tag.vio_subtype_env; 4492 vsw_t *vswp = ldcp->ldc_vswp; 4493 4494 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4495 4496 /* session id check */ 4497 if (ldcp->session_status & VSW_PEER_SESSION) { 4498 if (ldcp->peer_session != tag.vio_sid) { 4499 DERR(vswp, "%s (chan %d): invalid session id (%llx)", 4500 __func__, ldcp->ldc_id, tag.vio_sid); 4501 vsw_restart_handshake(ldcp); 4502 return; 4503 } 4504 } 4505 4506 /* 4507 * It is an error for us to be getting data packets 4508 * before the handshake has completed. 4509 */ 4510 if (ldcp->hphase != VSW_MILESTONE4) { 4511 DERR(vswp, "%s: got data packet before handshake complete " 4512 "hphase %d (%x: %x)", __func__, ldcp->hphase, 4513 ldcp->lane_in.lstate, ldcp->lane_out.lstate); 4514 DUMP_FLAGS(ldcp->lane_in.lstate); 4515 DUMP_FLAGS(ldcp->lane_out.lstate); 4516 vsw_restart_handshake(ldcp); 4517 return; 4518 } 4519 4520 /* 4521 * Switch on vio_subtype envelope, then let lower routines 4522 * decide if its an INFO, ACK or NACK packet. 4523 */ 4524 if (env == VIO_DRING_DATA) { 4525 vsw_process_data_dring_pkt(ldcp, dpkt); 4526 } else if (env == VIO_PKT_DATA) { 4527 vsw_process_data_raw_pkt(ldcp, dpkt); 4528 } else if (env == VIO_DESC_DATA) { 4529 vsw_process_data_ibnd_pkt(ldcp, dpkt); 4530 } else { 4531 DERR(vswp, "%s : unknown vio_subtype_env (%x)\n", 4532 __func__, env); 4533 } 4534 4535 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 4536 } 4537 4538 #define SND_DRING_NACK(ldcp, pkt) \ 4539 pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ 4540 pkt->tag.vio_sid = ldcp->local_session; \ 4541 vsw_send_msg(ldcp, (void *)pkt, sizeof (vio_dring_msg_t)); 4542 4543 static void 4544 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt) 4545 { 4546 vio_dring_msg_t *dring_pkt; 4547 vnet_public_desc_t *pub_addr = NULL; 4548 vsw_private_desc_t *priv_addr = NULL; 4549 dring_info_t *dp = NULL; 4550 vsw_t *vswp = ldcp->ldc_vswp; 4551 mblk_t *mp = NULL; 4552 mblk_t *bp = NULL; 4553 mblk_t *bpt = NULL; 4554 size_t nbytes = 0; 4555 size_t off = 0; 4556 uint64_t ncookies = 0; 4557 uint64_t chain = 0; 4558 uint64_t j, len; 4559 uint32_t pos, start, datalen; 4560 uint32_t range_start, range_end; 4561 int32_t end, num, cnt = 0; 4562 int i, rv; 4563 boolean_t ack_needed = B_FALSE; 4564 boolean_t prev_desc_ack = B_FALSE; 4565 int read_attempts = 0; 4566 4567 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4568 4569 /* 4570 * We know this is a data/dring packet so 4571 * cast it into the correct structure. 4572 */ 4573 dring_pkt = (vio_dring_msg_t *)dpkt; 4574 4575 /* 4576 * Switch on the vio_subtype. If its INFO then we need to 4577 * process the data. If its an ACK we need to make sure 4578 * it makes sense (i.e did we send an earlier data/info), 4579 * and if its a NACK then we maybe attempt a retry. 
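 *
 * (For illustration only: the descriptor accounting below operates on
 * a circular ring, so the size of the range [start, end] must allow
 * for wrap-around. A helper equivalent to the inline expression used
 * in this function:
 *
 *	#include <stdint.h>
 *
 *	static int32_t
 *	ring_range_count(int32_t start, int32_t end, int32_t len)
 *	{
 *		if (end >= start)
 *			return (end - start + 1);
 *		return ((len - start) + end + 1);
 *	}
 *
 * For a ring of len 8, start 6 and end 1 cover descriptors 6, 7, 0
 * and 1, giving a count of 4.)
 *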
4580 */ 4581 switch (dring_pkt->tag.vio_subtype) { 4582 case VIO_SUBTYPE_INFO: 4583 D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id); 4584 4585 if ((dp = vsw_ident2dring(&ldcp->lane_in, 4586 dring_pkt->dring_ident)) == NULL) { 4587 4588 DERR(vswp, "%s(%lld): unable to find dring from " 4589 "ident 0x%llx", __func__, ldcp->ldc_id, 4590 dring_pkt->dring_ident); 4591 4592 SND_DRING_NACK(ldcp, dring_pkt); 4593 return; 4594 } 4595 4596 start = pos = dring_pkt->start_idx; 4597 end = dring_pkt->end_idx; 4598 len = dp->num_descriptors; 4599 4600 range_start = range_end = pos; 4601 4602 D2(vswp, "%s(%lld): start index %ld : end %ld\n", 4603 __func__, ldcp->ldc_id, start, end); 4604 4605 if (end == -1) { 4606 num = -1; 4607 } else if (end >= 0) { 4608 num = end >= pos ? 4609 end - pos + 1: (len - pos + 1) + end; 4610 4611 /* basic sanity check */ 4612 if (end > len) { 4613 DERR(vswp, "%s(%lld): endpoint %lld outside " 4614 "ring length %lld", __func__, 4615 ldcp->ldc_id, end, len); 4616 4617 SND_DRING_NACK(ldcp, dring_pkt); 4618 return; 4619 } 4620 } else { 4621 DERR(vswp, "%s(%lld): invalid endpoint %lld", 4622 __func__, ldcp->ldc_id, end); 4623 SND_DRING_NACK(ldcp, dring_pkt); 4624 return; 4625 } 4626 4627 while (cnt != num) { 4628 vsw_recheck_desc: 4629 if ((rv = ldc_mem_dring_acquire(dp->handle, 4630 pos, pos)) != 0) { 4631 DERR(vswp, "%s(%lld): unable to acquire " 4632 "descriptor at pos %d: err %d", 4633 __func__, ldcp->ldc_id, pos, rv); 4634 SND_DRING_NACK(ldcp, dring_pkt); 4635 return; 4636 } 4637 4638 pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos; 4639 4640 /* 4641 * When given a bounded range of descriptors 4642 * to process, it's an error to hit a descriptor 4643 * which is not ready. In the non-bounded case 4644 * (end_idx == -1) this simply indicates we have 4645 * reached the end of the current active range. 4646 */ 4647 if (pub_addr->hdr.dstate != VIO_DESC_READY) { 4648 /* unbound - no error */ 4649 if (end == -1) { 4650 if (read_attempts == vsw_read_attempts) 4651 break; 4652 4653 delay(drv_usectohz(vsw_desc_delay)); 4654 read_attempts++; 4655 goto vsw_recheck_desc; 4656 } 4657 4658 /* bounded - error - so NACK back */ 4659 DERR(vswp, "%s(%lld): descriptor not READY " 4660 "(%d)", __func__, ldcp->ldc_id, 4661 pub_addr->hdr.dstate); 4662 SND_DRING_NACK(ldcp, dring_pkt); 4663 return; 4664 } 4665 4666 DTRACE_PROBE1(read_attempts, int, read_attempts); 4667 4668 range_end = pos; 4669 4670 /* 4671 * If we ACK'd the previous descriptor then now 4672 * record the new range start position for later 4673 * ACK's. 4674 */ 4675 if (prev_desc_ack) { 4676 range_start = pos; 4677 4678 D2(vswp, "%s(%lld): updating range start " 4679 "to be %d", __func__, ldcp->ldc_id, 4680 range_start); 4681 4682 prev_desc_ack = B_FALSE; 4683 } 4684 4685 /* 4686 * Data is padded to align on 8 byte boundary, 4687 * datalen is actual data length, i.e. minus that 4688 * padding. 4689 */ 4690 datalen = pub_addr->nbytes; 4691 4692 /* 4693 * Does peer wish us to ACK when we have finished 4694 * with this descriptor ? 4695 */ 4696 if (pub_addr->hdr.ack) 4697 ack_needed = B_TRUE; 4698 4699 D2(vswp, "%s(%lld): processing desc %lld at pos" 4700 " 0x%llx : dstate 0x%lx : datalen 0x%lx", 4701 __func__, ldcp->ldc_id, pos, pub_addr, 4702 pub_addr->hdr.dstate, datalen); 4703 4704 /* 4705 * Mark that we are starting to process the descriptor.
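 *
 * (For illustration only: as the comment below notes, ldc_mem_copy()
 * requires a length that is a multiple of 8, so the receive path
 * rounds datalen + VNET_IPALIGN up. The rounding in isolation:
 *
 *	#include <stddef.h>
 *
 *	static size_t
 *	roundup8(size_t nbytes)
 *	{
 *		if (nbytes & 0x7)
 *			nbytes += 8 - (nbytes & 0x7);
 *		return (nbytes);
 *	}
 *
 * The branch-free form ((nbytes + 7) & ~(size_t)7) is equivalent.)
 *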
4706 */ 4707 pub_addr->hdr.dstate = VIO_DESC_ACCEPTED; 4708 4709 mp = vio_allocb(ldcp->rxh); 4710 if (mp == NULL) { 4711 /* 4712 * No free receive buffers available, so 4713 * fallback onto allocb(9F). Make sure that 4714 * we get a data buffer which is a multiple 4715 * of 8 as this is required by ldc_mem_copy. 4716 */ 4717 DTRACE_PROBE(allocb); 4718 mp = allocb(datalen + VNET_IPALIGN + 8, 4719 BPRI_MED); 4720 } 4721 4722 /* 4723 * Ensure that we ask ldc for an aligned 4724 * number of bytes. 4725 */ 4726 nbytes = datalen + VNET_IPALIGN; 4727 if (nbytes & 0x7) { 4728 off = 8 - (nbytes & 0x7); 4729 nbytes += off; 4730 } 4731 4732 ncookies = pub_addr->ncookies; 4733 rv = ldc_mem_copy(ldcp->ldc_handle, 4734 (caddr_t)mp->b_rptr, 0, &nbytes, 4735 pub_addr->memcookie, ncookies, 4736 LDC_COPY_IN); 4737 4738 if (rv != 0) { 4739 DERR(vswp, "%s(%d): unable to copy in " 4740 "data from %d cookies in desc %d" 4741 " (rv %d)", __func__, ldcp->ldc_id, 4742 ncookies, pos, rv); 4743 freemsg(mp); 4744 4745 pub_addr->hdr.dstate = VIO_DESC_DONE; 4746 (void) ldc_mem_dring_release(dp->handle, 4747 pos, pos); 4748 break; 4749 } else { 4750 D2(vswp, "%s(%d): copied in %ld bytes" 4751 " using %d cookies", __func__, 4752 ldcp->ldc_id, nbytes, ncookies); 4753 } 4754 4755 /* adjust the read pointer to skip over the padding */ 4756 mp->b_rptr += VNET_IPALIGN; 4757 4758 /* point to the actual end of data */ 4759 mp->b_wptr = mp->b_rptr + datalen; 4760 4761 /* build a chain of received packets */ 4762 if (bp == NULL) { 4763 /* first pkt */ 4764 bp = mp; 4765 bp->b_next = bp->b_prev = NULL; 4766 bpt = bp; 4767 chain = 1; 4768 } else { 4769 mp->b_next = NULL; 4770 mp->b_prev = bpt; 4771 bpt->b_next = mp; 4772 bpt = mp; 4773 chain++; 4774 } 4775 4776 /* mark we are finished with this descriptor */ 4777 pub_addr->hdr.dstate = VIO_DESC_DONE; 4778 4779 (void) ldc_mem_dring_release(dp->handle, pos, pos); 4780 4781 /* 4782 * Send an ACK back to peer if requested. 4783 */ 4784 if (ack_needed) { 4785 ack_needed = B_FALSE; 4786 4787 dring_pkt->start_idx = range_start; 4788 dring_pkt->end_idx = range_end; 4789 4790 DERR(vswp, "%s(%lld): processed %d %d, ACK" 4791 " requested", __func__, ldcp->ldc_id, 4792 dring_pkt->start_idx, 4793 dring_pkt->end_idx); 4794 4795 dring_pkt->dring_process_state = VIO_DP_ACTIVE; 4796 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4797 dring_pkt->tag.vio_sid = ldcp->local_session; 4798 vsw_send_msg(ldcp, (void *)dring_pkt, 4799 sizeof (vio_dring_msg_t)); 4800 4801 prev_desc_ack = B_TRUE; 4802 range_start = pos; 4803 } 4804 4805 /* next descriptor */ 4806 pos = (pos + 1) % len; 4807 cnt++; 4808 4809 /* 4810 * Break out of loop here and stop processing to 4811 * allow some other network device (or disk) to 4812 * get access to the cpu. 
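 *
 * (For illustration only: the receive loop batches packets by
 * linking them through b_next and handing the whole chain to the
 * switching code once it exceeds vsw_chain_len. The batching
 * pattern in isolation, where next_packet(), deliver() and
 * BATCH_LEN are hypothetical stand-ins:
 *
 *	mblk_t *head = NULL, *tail = NULL, *mp;
 *	int chain = 0;
 *
 *	while ((mp = next_packet()) != NULL) {
 *		mp->b_next = NULL;
 *		if (head == NULL) {
 *			head = tail = mp;
 *		} else {
 *			tail->b_next = mp;
 *			tail = mp;
 *		}
 *		if (++chain > BATCH_LEN) {
 *			deliver(head);
 *			head = tail = NULL;
 *			chain = 0;
 *		}
 *	}
 *	if (head != NULL)
 *		deliver(head);
 *
 * The final flush matters; the code below performs it immediately
 * after the loop for exactly this reason.)
 *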
4813 */ 4814 /* send the chain of packets to be switched */ 4815 if (chain > vsw_chain_len) { 4816 D3(vswp, "%s(%lld): switching chain of %d " 4817 "msgs", __func__, ldcp->ldc_id, chain); 4818 vsw_switch_frame(vswp, bp, VSW_VNETPORT, 4819 ldcp->ldc_port, NULL); 4820 bp = NULL; 4821 break; 4822 } 4823 } 4824 4825 /* send the chain of packets to be switched */ 4826 if (bp != NULL) { 4827 D3(vswp, "%s(%lld): switching chain of %d msgs", 4828 __func__, ldcp->ldc_id, chain); 4829 vsw_switch_frame(vswp, bp, VSW_VNETPORT, 4830 ldcp->ldc_port, NULL); 4831 } 4832 4833 DTRACE_PROBE1(msg_cnt, int, cnt); 4834 4835 /* 4836 * We are now finished so ACK back with the state 4837 * set to STOPPING so our peer knows we are finished 4838 */ 4839 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4840 dring_pkt->tag.vio_sid = ldcp->local_session; 4841 4842 dring_pkt->dring_process_state = VIO_DP_STOPPED; 4843 4844 DTRACE_PROBE(stop_process_sent); 4845 4846 /* 4847 * We have not processed any more descriptors beyond 4848 * the last one we ACK'd. 4849 */ 4850 if (prev_desc_ack) 4851 range_start = range_end; 4852 4853 dring_pkt->start_idx = range_start; 4854 dring_pkt->end_idx = range_end; 4855 4856 D2(vswp, "%s(%lld) processed : %d : %d, now stopping", 4857 __func__, ldcp->ldc_id, dring_pkt->start_idx, 4858 dring_pkt->end_idx); 4859 4860 vsw_send_msg(ldcp, (void *)dring_pkt, 4861 sizeof (vio_dring_msg_t)); 4862 break; 4863 4864 case VIO_SUBTYPE_ACK: 4865 D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id); 4866 /* 4867 * Verify that the relevant descriptors are all 4868 * marked as DONE 4869 */ 4870 if ((dp = vsw_ident2dring(&ldcp->lane_out, 4871 dring_pkt->dring_ident)) == NULL) { 4872 DERR(vswp, "%s: unknown ident in ACK", __func__); 4873 return; 4874 } 4875 4876 pub_addr = (vnet_public_desc_t *)dp->pub_addr; 4877 priv_addr = (vsw_private_desc_t *)dp->priv_addr; 4878 4879 start = end = 0; 4880 start = dring_pkt->start_idx; 4881 end = dring_pkt->end_idx; 4882 len = dp->num_descriptors; 4883 4884 j = num = 0; 4885 /* calculate # descriptors taking into a/c wrap around */ 4886 num = end >= start ? end - start + 1: (len - start + 1) + end; 4887 4888 D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n", 4889 __func__, ldcp->ldc_id, start, end, num); 4890 4891 mutex_enter(&dp->dlock); 4892 dp->last_ack_recv = end; 4893 mutex_exit(&dp->dlock); 4894 4895 for (i = start; j < num; i = (i + 1) % len, j++) { 4896 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 4897 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 4898 4899 /* 4900 * If the last descriptor in a range has the ACK 4901 * bit set then we will get two messages from our 4902 * peer relating to it. The normal ACK msg and then 4903 * a subsequent STOP msg. The first message will have 4904 * resulted in the descriptor being reclaimed and 4905 * its state set to FREE so when we encounter a non 4906 * DONE descriptor we need to check to see if its 4907 * because we have just reclaimed it. 
4908 */ 4909 mutex_enter(&priv_addr->dstate_lock); 4910 if (pub_addr->hdr.dstate == VIO_DESC_DONE) { 4911 /* clear all the fields */ 4912 bzero(priv_addr->datap, priv_addr->datalen); 4913 priv_addr->datalen = 0; 4914 4915 pub_addr->hdr.dstate = VIO_DESC_FREE; 4916 pub_addr->hdr.ack = 0; 4917 4918 priv_addr->dstate = VIO_DESC_FREE; 4919 mutex_exit(&priv_addr->dstate_lock); 4920 4921 D3(vswp, "clearing descp %d : pub state " 4922 "0x%llx : priv state 0x%llx", i, 4923 pub_addr->hdr.dstate, 4924 priv_addr->dstate); 4925 4926 } else { 4927 mutex_exit(&priv_addr->dstate_lock); 4928 4929 if (dring_pkt->dring_process_state != 4930 VIO_DP_STOPPED) { 4931 DERR(vswp, "%s: descriptor %lld at pos " 4932 " 0x%llx not DONE (0x%lx)\n", 4933 __func__, i, pub_addr, 4934 pub_addr->hdr.dstate); 4935 return; 4936 } 4937 } 4938 } 4939 4940 /* 4941 * If our peer is stopping processing descriptors then 4942 * we check to make sure it has processed all the descriptors 4943 * we have updated. If not then we send it a new message 4944 * to prompt it to restart. 4945 */ 4946 if (dring_pkt->dring_process_state == VIO_DP_STOPPED) { 4947 DTRACE_PROBE(stop_process_recv); 4948 D2(vswp, "%s(%lld): got stopping msg : %d : %d", 4949 __func__, ldcp->ldc_id, dring_pkt->start_idx, 4950 dring_pkt->end_idx); 4951 4952 /* 4953 * Check next descriptor in public section of ring. 4954 * If its marked as READY then we need to prompt our 4955 * peer to start processing the ring again. 4956 */ 4957 i = (end + 1) % len; 4958 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 4959 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 4960 4961 /* 4962 * Hold the restart lock across all of this to 4963 * make sure that its not possible for us to 4964 * decide that a msg needs to be sent in the future 4965 * but the sending code having already checked is 4966 * about to exit. 4967 */ 4968 mutex_enter(&dp->restart_lock); 4969 mutex_enter(&priv_addr->dstate_lock); 4970 if (pub_addr->hdr.dstate == VIO_DESC_READY) { 4971 4972 mutex_exit(&priv_addr->dstate_lock); 4973 4974 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; 4975 dring_pkt->tag.vio_sid = ldcp->local_session; 4976 4977 mutex_enter(&ldcp->lane_out.seq_lock); 4978 dring_pkt->seq_num = ldcp->lane_out.seq_num++; 4979 mutex_exit(&ldcp->lane_out.seq_lock); 4980 4981 dring_pkt->start_idx = (end + 1) % len; 4982 dring_pkt->end_idx = -1; 4983 4984 D2(vswp, "%s(%lld) : sending restart msg:" 4985 " %d : %d", __func__, ldcp->ldc_id, 4986 dring_pkt->start_idx, 4987 dring_pkt->end_idx); 4988 4989 vsw_send_msg(ldcp, (void *)dring_pkt, 4990 sizeof (vio_dring_msg_t)); 4991 } else { 4992 mutex_exit(&priv_addr->dstate_lock); 4993 dp->restart_reqd = B_TRUE; 4994 } 4995 mutex_exit(&dp->restart_lock); 4996 } 4997 break; 4998 4999 case VIO_SUBTYPE_NACK: 5000 DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK", 5001 __func__, ldcp->ldc_id); 5002 /* 5003 * Something is badly wrong if we are getting NACK's 5004 * for our data pkts. So reset the channel. 5005 */ 5006 vsw_restart_handshake(ldcp); 5007 5008 break; 5009 5010 default: 5011 DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__, 5012 ldcp->ldc_id, dring_pkt->tag.vio_subtype); 5013 } 5014 5015 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 5016 } 5017 5018 /* 5019 * VIO_PKT_DATA (a.k.a raw data mode ) 5020 * 5021 * Note - currently not supported. Do nothing. 
5022 */ 5023 static void 5024 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt) 5025 { 5026 _NOTE(ARGUNUSED(dpkt)) 5027 5028 D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id); 5029 5030 DERR(NULL, "%s (%lld): currently not supported", 5031 __func__, ldcp->ldc_id); 5032 5033 D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id); 5034 } 5035 5036 #define SND_IBND_DESC_NACK(ldcp, pkt) \ 5037 pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ 5038 pkt->tag.vio_sid = ldcp->local_session; \ 5039 vsw_send_msg(ldcp, (void *)pkt, sizeof (vio_ibnd_desc_t)); 5040 5041 /* 5042 * Process an in-band descriptor message (most likely from 5043 * OBP). 5044 */ 5045 static void 5046 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt) 5047 { 5048 vio_ibnd_desc_t *ibnd_desc; 5049 dring_info_t *dp = NULL; 5050 vsw_private_desc_t *priv_addr = NULL; 5051 vsw_t *vswp = ldcp->ldc_vswp; 5052 mblk_t *mp = NULL; 5053 size_t nbytes = 0; 5054 size_t off = 0; 5055 uint64_t idx = 0; 5056 uint32_t datalen = 0; 5057 uint64_t ncookies = 0; 5058 int rv; 5059 5060 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5061 5062 ibnd_desc = (vio_ibnd_desc_t *)pkt; 5063 5064 switch (ibnd_desc->hdr.tag.vio_subtype) { 5065 case VIO_SUBTYPE_INFO: 5066 D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5067 5068 if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV)) 5069 return; 5070 5071 /* 5072 * Data is padded to align on a 8 byte boundary, 5073 * nbytes is actual data length, i.e. minus that 5074 * padding. 5075 */ 5076 datalen = ibnd_desc->nbytes; 5077 5078 D2(vswp, "%s(%lld): processing inband desc : " 5079 ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen); 5080 5081 ncookies = ibnd_desc->ncookies; 5082 5083 /* 5084 * allocb(9F) returns an aligned data block. We 5085 * need to ensure that we ask ldc for an aligned 5086 * number of bytes also. 
5087 */ 5088 nbytes = datalen; 5089 if (nbytes & 0x7) { 5090 off = 8 - (nbytes & 0x7); 5091 nbytes += off; 5092 } 5093 5094 mp = allocb(datalen, BPRI_MED); 5095 if (mp == NULL) { 5096 DERR(vswp, "%s(%lld): allocb failed", 5097 __func__, ldcp->ldc_id); 5098 return; 5099 } 5100 5101 rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr, 5102 0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies, 5103 LDC_COPY_IN); 5104 5105 if (rv != 0) { 5106 DERR(vswp, "%s(%d): unable to copy in data from " 5107 "%d cookie(s)", __func__, 5108 ldcp->ldc_id, ncookies); 5109 freemsg(mp); 5110 return; 5111 } else { 5112 D2(vswp, "%s(%d): copied in %ld bytes using %d " 5113 "cookies", __func__, ldcp->ldc_id, nbytes, 5114 ncookies); 5115 } 5116 5117 /* point to the actual end of data */ 5118 mp->b_wptr = mp->b_rptr + datalen; 5119 5120 /* 5121 * We ACK back every in-band descriptor message we process 5122 */ 5123 ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK; 5124 ibnd_desc->hdr.tag.vio_sid = ldcp->local_session; 5125 vsw_send_msg(ldcp, (void *)ibnd_desc, 5126 sizeof (vio_ibnd_desc_t)); 5127 5128 /* send the packet to be switched */ 5129 vsw_switch_frame(vswp, mp, VSW_VNETPORT, 5130 ldcp->ldc_port, NULL); 5131 5132 break; 5133 5134 case VIO_SUBTYPE_ACK: 5135 D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 5136 5137 /* Verify the ACK is valid */ 5138 idx = ibnd_desc->hdr.desc_handle; 5139 5140 if (idx >= VSW_RING_NUM_EL) { 5141 cmn_err(CE_WARN, "%s: corrupted ACK received " 5142 "(idx %ld)", __func__, idx); 5143 return; 5144 } 5145 5146 if ((dp = ldcp->lane_out.dringp) == NULL) { 5147 DERR(vswp, "%s: no dring found", __func__); 5148 return; 5149 } 5150 5151 priv_addr = (vsw_private_desc_t *)dp->priv_addr; 5152 5153 /* move to correct location in ring */ 5154 priv_addr += idx; 5155 5156 /* 5157 * When we sent the in-band message to our peer we 5158 * marked the copy in our private ring as READY. We now 5159 * check that the descriptor we are being ACK'ed for is in 5160 * fact READY, i.e. it is one we have shared with our peer. 5161 */ 5162 mutex_enter(&priv_addr->dstate_lock); 5163 if (priv_addr->dstate != VIO_DESC_READY) { 5164 mutex_exit(&priv_addr->dstate_lock); 5165 cmn_err(CE_WARN, "%s: (%ld) desc at index %ld not " 5166 "READY (0x%lx)", __func__, ldcp->ldc_id, idx, 5167 priv_addr->dstate); 5168 cmn_err(CE_CONT, "%s: bound %d: ncookies %ld\n", 5169 __func__, priv_addr->bound, 5170 priv_addr->ncookies); 5171 cmn_err(CE_CONT, "datalen %ld\n", priv_addr->datalen); 5172 return; 5173 } else { 5174 D2(vswp, "%s: (%lld) freeing descp at %lld", __func__, 5175 ldcp->ldc_id, idx); 5176 5177 /* release resources associated with sent msg */ 5178 bzero(priv_addr->datap, priv_addr->datalen); 5179 priv_addr->datalen = 0; 5180 priv_addr->dstate = VIO_DESC_FREE; 5181 mutex_exit(&priv_addr->dstate_lock); 5182 } 5183 break; 5184 5185 case VIO_SUBTYPE_NACK: 5186 DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5187 5188 /* 5189 * We should only get a NACK if our peer doesn't like 5190 * something about a message we have sent it. If this 5191 * happens we just release the resources associated with 5192 * the message. (We are relying on higher layers to decide 5193 * whether or not to resend. 
5194 */ 5195 5196 /* limit check */ 5197 idx = ibnd_desc->hdr.desc_handle; 5198 5199 if (idx >= VSW_RING_NUM_EL) { 5200 DERR(vswp, "%s: corrupted NACK received (idx %lld)", 5201 __func__, idx); 5202 return; 5203 } 5204 5205 if ((dp = ldcp->lane_out.dringp) == NULL) { 5206 DERR(vswp, "%s: no dring found", __func__); 5207 return; 5208 } 5209 5210 priv_addr = (vsw_private_desc_t *)dp->priv_addr; 5211 5212 /* move to correct location in ring */ 5213 priv_addr += idx; 5214 5215 /* release resources associated with sent msg */ 5216 mutex_enter(&priv_addr->dstate_lock); 5217 bzero(priv_addr->datap, priv_addr->datalen); 5218 priv_addr->datalen = 0; 5219 priv_addr->dstate = VIO_DESC_FREE; 5220 mutex_exit(&priv_addr->dstate_lock); 5221 5222 break; 5223 5224 default: 5225 DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__, 5226 ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype); 5227 } 5228 5229 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 5230 } 5231 5232 static void 5233 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag) 5234 { 5235 _NOTE(ARGUNUSED(epkt)) 5236 5237 vsw_t *vswp = ldcp->ldc_vswp; 5238 uint16_t env = tag.vio_subtype_env; 5239 5240 D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id); 5241 5242 /* 5243 * Error vio_subtypes have yet to be defined. So for 5244 * the moment we can't do anything. 5245 */ 5246 D2(vswp, "%s: (%x) vio_subtype env", __func__, env); 5247 5248 D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id); 5249 } 5250 5251 /* 5252 * Switch the given ethernet frame when operating in layer 2 mode. 5253 * 5254 * vswp: pointer to the vsw instance 5255 * mp: pointer to chain of ethernet frame(s) to be switched 5256 * caller: identifies the source of this frame as: 5257 * 1. VSW_VNETPORT - a vsw port (connected to a vnet). 5258 * 2. VSW_PHYSDEV - the physical ethernet device 5259 * 3. VSW_LOCALDEV - vsw configured as a virtual interface 5260 * arg: argument provided by the caller. 5261 * 1. for VNETPORT - pointer to the corresponding vsw_port_t. 5262 * 2. for PHYSDEV - NULL 5263 * 3. for LOCALDEV - pointer to to this vsw_t(self) 5264 */ 5265 void 5266 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, 5267 vsw_port_t *arg, mac_resource_handle_t mrh) 5268 { 5269 struct ether_header *ehp; 5270 vsw_port_t *port = NULL; 5271 mblk_t *bp, *ret_m; 5272 mblk_t *nmp = NULL; 5273 vsw_port_list_t *plist = &vswp->plist; 5274 5275 D1(vswp, "%s: enter (caller %d)", __func__, caller); 5276 5277 /* 5278 * PERF: rather than breaking up the chain here, scan it 5279 * to find all mblks heading to same destination and then 5280 * pass that sub-chain to the lower transmit functions. 5281 */ 5282 5283 /* process the chain of packets */ 5284 bp = mp; 5285 while (bp) { 5286 mp = bp; 5287 bp = bp->b_next; 5288 mp->b_next = mp->b_prev = NULL; 5289 ehp = (struct ether_header *)mp->b_rptr; 5290 5291 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 5292 __func__, MBLKSIZE(mp), MBLKL(mp)); 5293 5294 READ_ENTER(&vswp->if_lockrw); 5295 if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) { 5296 /* 5297 * If destination is VSW_LOCALDEV (vsw as an eth 5298 * interface) and if the device is up & running, 5299 * send the packet up the stack on this host. 5300 * If the virtual interface is down, drop the packet. 
5301 */ 5302 if (caller != VSW_LOCALDEV) { 5303 if (vswp->if_state & VSW_IF_UP) { 5304 RW_EXIT(&vswp->if_lockrw); 5305 mac_rx(vswp->if_mh, mrh, mp); 5306 } else { 5307 RW_EXIT(&vswp->if_lockrw); 5308 /* Interface down, drop pkt */ 5309 freemsg(mp); 5310 } 5311 } else { 5312 RW_EXIT(&vswp->if_lockrw); 5313 freemsg(mp); 5314 } 5315 continue; 5316 } 5317 RW_EXIT(&vswp->if_lockrw); 5318 5319 READ_ENTER(&plist->lockrw); 5320 port = vsw_lookup_fdb(vswp, ehp); 5321 if (port) { 5322 /* 5323 * Mark the port as in-use. 5324 */ 5325 mutex_enter(&port->ref_lock); 5326 port->ref_cnt++; 5327 mutex_exit(&port->ref_lock); 5328 RW_EXIT(&plist->lockrw); 5329 5330 /* 5331 * If plumbed and in promisc mode then copy msg 5332 * and send up the stack. 5333 */ 5334 READ_ENTER(&vswp->if_lockrw); 5335 if (VSW_U_P(vswp->if_state)) { 5336 RW_EXIT(&vswp->if_lockrw); 5337 nmp = copymsg(mp); 5338 if (nmp) 5339 mac_rx(vswp->if_mh, mrh, nmp); 5340 } else { 5341 RW_EXIT(&vswp->if_lockrw); 5342 } 5343 5344 /* 5345 * If the destination is in FDB, the packet 5346 * should be forwarded to the corresponding 5347 * vsw_port (connected to a vnet device - 5348 * VSW_VNETPORT) 5349 */ 5350 (void) vsw_portsend(port, mp); 5351 5352 /* 5353 * Decrement use count in port and check if 5354 * should wake delete thread. 5355 */ 5356 mutex_enter(&port->ref_lock); 5357 port->ref_cnt--; 5358 if (port->ref_cnt == 0) 5359 cv_signal(&port->ref_cv); 5360 mutex_exit(&port->ref_lock); 5361 } else { 5362 RW_EXIT(&plist->lockrw); 5363 /* 5364 * Destination not in FDB. 5365 * 5366 * If the destination is broadcast or 5367 * multicast forward the packet to all 5368 * (VNETPORTs, PHYSDEV, LOCALDEV), 5369 * except the caller. 5370 */ 5371 if (IS_BROADCAST(ehp)) { 5372 D3(vswp, "%s: BROADCAST pkt", __func__); 5373 (void) vsw_forward_all(vswp, mp, 5374 caller, arg); 5375 } else if (IS_MULTICAST(ehp)) { 5376 D3(vswp, "%s: MULTICAST pkt", __func__); 5377 (void) vsw_forward_grp(vswp, mp, 5378 caller, arg); 5379 } else { 5380 /* 5381 * If the destination is unicast, and came 5382 * from either a logical network device or 5383 * the switch itself when it is plumbed, then 5384 * send it out on the physical device and also 5385 * up the stack if the logical interface is 5386 * in promiscuous mode. 5387 * 5388 * NOTE: The assumption here is that if we 5389 * cannot find the destination in our fdb, it's 5390 * a unicast address, and came from either a 5391 * vnet or down the stack (when plumbed) it 5392 * must be destined for an ethernet device 5393 * outside our ldoms. 5394 */ 5395 if (caller == VSW_VNETPORT) { 5396 READ_ENTER(&vswp->if_lockrw); 5397 if (VSW_U_P(vswp->if_state)) { 5398 RW_EXIT(&vswp->if_lockrw); 5399 nmp = copymsg(mp); 5400 if (nmp) 5401 mac_rx(vswp->if_mh, 5402 mrh, nmp); 5403 } else { 5404 RW_EXIT(&vswp->if_lockrw); 5405 } 5406 if ((ret_m = vsw_tx_msg(vswp, mp)) 5407 != NULL) { 5408 DERR(vswp, "%s: drop mblks to " 5409 "phys dev", __func__); 5410 freemsg(ret_m); 5411 } 5412 5413 } else if (caller == VSW_PHYSDEV) { 5414 /* 5415 * Pkt seen because card in promisc 5416 * mode. Send up stack if plumbed in 5417 * promisc mode, else drop it. 5418 */ 5419 READ_ENTER(&vswp->if_lockrw); 5420 if (VSW_U_P(vswp->if_state)) { 5421 RW_EXIT(&vswp->if_lockrw); 5422 mac_rx(vswp->if_mh, mrh, mp); 5423 } else { 5424 RW_EXIT(&vswp->if_lockrw); 5425 freemsg(mp); 5426 } 5427 5428 } else if (caller == VSW_LOCALDEV) { 5429 /* 5430 * Pkt came down the stack, send out 5431 * over physical device.
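 *
 * (A restatement, not new behavior: the unicast FDB-miss policy
 * implemented by the branches above and below is
 *
 *	VSW_VNETPORT:	copy up the stack if plumbed and in promisc
 *			mode, then transmit on the physical device;
 *	VSW_PHYSDEV:	send up the stack if plumbed and in promisc
 *			mode, otherwise drop;
 *	VSW_LOCALDEV:	transmit on the physical device.
 *
 * The underlying assumption, stated in the NOTE above, is that a
 * unicast miss must be destined for a host outside the ldoms.)
 *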
5432 */ 5433 if ((ret_m = vsw_tx_msg(vswp, mp)) 5434 != NULL) { 5435 DERR(vswp, "%s: drop mblks to " 5436 "phys dev", __func__); 5437 freemsg(ret_m); 5438 } 5439 } 5440 } 5441 } 5442 } 5443 D1(vswp, "%s: exit\n", __func__); 5444 } 5445 5446 /* 5447 * Switch ethernet frame when in layer 3 mode (i.e. using IP 5448 * layer to do the routing). 5449 * 5450 * There is a large amount of overlap between this function and 5451 * vsw_switch_l2_frame. At some stage we need to revisit and refactor 5452 * both these functions. 5453 */ 5454 void 5455 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, 5456 vsw_port_t *arg, mac_resource_handle_t mrh) 5457 { 5458 struct ether_header *ehp; 5459 vsw_port_t *port = NULL; 5460 mblk_t *bp = NULL; 5461 vsw_port_list_t *plist = &vswp->plist; 5462 5463 D1(vswp, "%s: enter (caller %d)", __func__, caller); 5464 5465 /* 5466 * In layer 3 mode we should only ever be switching packets 5467 * between IP layer and vnet devices. So make sure that's 5468 * who is invoking us. 5469 */ 5470 if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) { 5471 DERR(vswp, "%s: unexpected caller (%d)", __func__, caller); 5472 freemsgchain(mp); 5473 return; 5474 } 5475 5476 /* process the chain of packets */ 5477 bp = mp; 5478 while (bp) { 5479 mp = bp; 5480 bp = bp->b_next; 5481 mp->b_next = mp->b_prev = NULL; 5482 ehp = (struct ether_header *)mp->b_rptr; 5483 5484 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 5485 __func__, MBLKSIZE(mp), MBLKL(mp)); 5486 5487 READ_ENTER(&plist->lockrw); 5488 port = vsw_lookup_fdb(vswp, ehp); 5489 if (port) { 5490 /* 5491 * Mark port as in-use. 5492 */ 5493 mutex_enter(&port->ref_lock); 5494 port->ref_cnt++; 5495 mutex_exit(&port->ref_lock); 5496 RW_EXIT(&plist->lockrw); 5497 5498 D2(vswp, "%s: sending to target port", __func__); 5499 (void) vsw_portsend(port, mp); 5500 5501 /* 5502 * Finished with port so decrement ref count and 5503 * check if should wake delete thread. 5504 */ 5505 mutex_enter(&port->ref_lock); 5506 port->ref_cnt--; 5507 if (port->ref_cnt == 0) 5508 cv_signal(&port->ref_cv); 5509 mutex_exit(&port->ref_lock); 5510 } else { 5511 RW_EXIT(&plist->lockrw); 5512 /* 5513 * Destination not in FDB 5514 * 5515 * If the destination is broadcast or 5516 * multicast forward the packet to all 5517 * (VNETPORTs, PHYSDEV, LOCALDEV), 5518 * except the caller. 5519 */ 5520 if (IS_BROADCAST(ehp)) { 5521 D2(vswp, "%s: BROADCAST pkt", __func__); 5522 (void) vsw_forward_all(vswp, mp, 5523 caller, arg); 5524 } else if (IS_MULTICAST(ehp)) { 5525 D2(vswp, "%s: MULTICAST pkt", __func__); 5526 (void) vsw_forward_grp(vswp, mp, 5527 caller, arg); 5528 } else { 5529 /* 5530 * Unicast pkt from vnet that we don't have 5531 * an FDB entry for, so must be destined for 5532 * the outside world. Attempt to send up to the 5533 * IP layer to allow it to deal with it. 5534 */ 5535 if (caller == VSW_VNETPORT) { 5536 READ_ENTER(&vswp->if_lockrw); 5537 if (vswp->if_state & VSW_IF_UP) { 5538 RW_EXIT(&vswp->if_lockrw); 5539 D2(vswp, "%s: sending up", 5540 __func__); 5541 mac_rx(vswp->if_mh, mrh, mp); 5542 } else { 5543 RW_EXIT(&vswp->if_lockrw); 5544 /* Interface down, drop pkt */ 5545 D2(vswp, "%s I/F down", 5546 __func__); 5547 freemsg(mp); 5548 } 5549 } 5550 } 5551 } 5552 } 5553 5554 D1(vswp, "%s: exit", __func__); 5555 } 5556 5557 /* 5558 * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV), 5559 * except the caller (port on which frame arrived).
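 *
 * (A note on the two duplication primitives used below: dupmsg(9F)
 * allocates new message headers but shares the underlying data
 * blocks, so it is cheap and suits transmit paths that treat the
 * data as read-only; copymsg(9F) copies the data as well, giving
 * the receiver a private buffer. A sketch of the fan-out pattern,
 * where portsend() stands in for the real consumer:
 *
 *	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
 *		mblk_t *nmp = dupmsg(mp);
 *
 *		if (nmp != NULL)
 *			(void) portsend(portp, nmp);
 *	}
 *	freemsg(mp);
 *
 * The original is freed once at the end, since every receiver got
 * its own duplicate.)
 *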
5560 */ 5561 static int 5562 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 5563 { 5564 vsw_port_list_t *plist = &vswp->plist; 5565 vsw_port_t *portp; 5566 mblk_t *nmp = NULL; 5567 mblk_t *ret_m = NULL; 5568 int skip_port = 0; 5569 5570 D1(vswp, "vsw_forward_all: enter\n"); 5571 5572 /* 5573 * Broadcast message from inside ldoms so send to outside 5574 * world if in either of layer 2 modes. 5575 */ 5576 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 5577 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 5578 ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) { 5579 5580 nmp = dupmsg(mp); 5581 if (nmp) { 5582 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 5583 DERR(vswp, "%s: dropping pkt(s) " 5584 "consisting of %ld bytes of data for" 5585 " physical device", __func__, MBLKL(ret_m)); 5586 freemsg(ret_m); 5587 } 5588 } 5589 } 5590 5591 if (caller == VSW_VNETPORT) 5592 skip_port = 1; 5593 5594 /* 5595 * Broadcast message from other vnet (layer 2 or 3) or outside 5596 * world (layer 2 only), send up stack if plumbed. 5597 */ 5598 if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) { 5599 READ_ENTER(&vswp->if_lockrw); 5600 if (vswp->if_state & VSW_IF_UP) { 5601 RW_EXIT(&vswp->if_lockrw); 5602 nmp = copymsg(mp); 5603 if (nmp) 5604 mac_rx(vswp->if_mh, NULL, nmp); 5605 } else { 5606 RW_EXIT(&vswp->if_lockrw); 5607 } 5608 } 5609 5610 /* send it to all VNETPORTs */ 5611 READ_ENTER(&plist->lockrw); 5612 for (portp = plist->head; portp != NULL; portp = portp->p_next) { 5613 D2(vswp, "vsw_forward_all: port %d", portp->p_instance); 5614 /* 5615 * Caution ! - don't reorder these two checks as arg 5616 * will be NULL if the caller is PHYSDEV. skip_port is 5617 * only set if caller is VNETPORT. 5618 */ 5619 if ((skip_port) && (portp == arg)) 5620 continue; 5621 else { 5622 nmp = dupmsg(mp); 5623 if (nmp) { 5624 (void) vsw_portsend(portp, nmp); 5625 } else { 5626 DERR(vswp, "vsw_forward_all: nmp NULL"); 5627 } 5628 } 5629 } 5630 RW_EXIT(&plist->lockrw); 5631 5632 freemsg(mp); 5633 5634 D1(vswp, "vsw_forward_all: exit\n"); 5635 return (0); 5636 } 5637 5638 /* 5639 * Forward pkts to any devices or interfaces which have registered 5640 * an interest in them (i.e. multicast groups). 5641 */ 5642 static int 5643 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 5644 { 5645 struct ether_header *ehp = (struct ether_header *)mp->b_rptr; 5646 mfdb_ent_t *entp = NULL; 5647 mfdb_ent_t *tpp = NULL; 5648 vsw_port_t *port; 5649 uint64_t key = 0; 5650 mblk_t *nmp = NULL; 5651 mblk_t *ret_m = NULL; 5652 boolean_t check_if = B_TRUE; 5653 5654 /* 5655 * Convert address to hash table key 5656 */ 5657 KEY_HASH(key, ehp->ether_dhost); 5658 5659 D1(vswp, "%s: key 0x%llx", __func__, key); 5660 5661 /* 5662 * If pkt came from either a vnet or down the stack (if we are 5663 * plumbed) and we are in layer 2 mode, then we send the pkt out 5664 * over the physical adapter, and then check to see if any other 5665 * vnets are interested in it. 
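 *
 * (For illustration only: KEY_HASH, defined in a header not shown
 * here, packs the 48-bit Ethernet address into the uint64_t key used
 * with mod_hash. A plausible standalone equivalent:
 *
 *	#include <stdint.h>
 *
 *	static uint64_t
 *	ether_to_key(const uint8_t mac[6])
 *	{
 *		uint64_t key = 0;
 *		int i;
 *
 *		for (i = 0; i < 6; i++)
 *			key = (key << 8) | mac[i];
 *		return (key);
 *	}
 *
 * Any packing works as long as it is injective and is used
 * consistently for both insert and lookup.)
 *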
5666 */ 5667 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 5668 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 5669 ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) { 5670 nmp = dupmsg(mp); 5671 if (nmp) { 5672 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 5673 DERR(vswp, "%s: dropping pkt(s) " 5674 "consisting of %ld bytes of " 5675 "data for physical device", 5676 __func__, MBLKL(ret_m)); 5677 freemsg(ret_m); 5678 } 5679 } 5680 } 5681 5682 READ_ENTER(&vswp->mfdbrw); 5683 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key, 5684 (mod_hash_val_t *)&entp) != 0) { 5685 D3(vswp, "%s: no table entry found for addr 0x%llx", 5686 __func__, key); 5687 } else { 5688 /* 5689 * Send to list of devices associated with this address... 5690 */ 5691 for (tpp = entp; tpp != NULL; tpp = tpp->nextp) { 5692 5693 /* dont send to ourselves */ 5694 if ((caller == VSW_VNETPORT) && 5695 (tpp->d_addr == (void *)arg)) { 5696 port = (vsw_port_t *)tpp->d_addr; 5697 D3(vswp, "%s: not sending to ourselves" 5698 " : port %d", __func__, 5699 port->p_instance); 5700 continue; 5701 5702 } else if ((caller == VSW_LOCALDEV) && 5703 (tpp->d_type == VSW_LOCALDEV)) { 5704 D3(vswp, "%s: not sending back up stack", 5705 __func__); 5706 continue; 5707 } 5708 5709 if (tpp->d_type == VSW_VNETPORT) { 5710 port = (vsw_port_t *)tpp->d_addr; 5711 D3(vswp, "%s: sending to port %ld for " 5712 " addr 0x%llx", __func__, 5713 port->p_instance, key); 5714 5715 nmp = dupmsg(mp); 5716 if (nmp) 5717 (void) vsw_portsend(port, nmp); 5718 } else { 5719 if (vswp->if_state & VSW_IF_UP) { 5720 nmp = copymsg(mp); 5721 if (nmp) 5722 mac_rx(vswp->if_mh, NULL, nmp); 5723 check_if = B_FALSE; 5724 D3(vswp, "%s: sending up stack" 5725 " for addr 0x%llx", __func__, 5726 key); 5727 } 5728 } 5729 } 5730 } 5731 5732 RW_EXIT(&vswp->mfdbrw); 5733 5734 /* 5735 * If the pkt came from either a vnet or from physical device, 5736 * and if we havent already sent the pkt up the stack then we 5737 * check now if we can/should (i.e. the interface is plumbed 5738 * and in promisc mode). 5739 */ 5740 if ((check_if) && 5741 ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) { 5742 READ_ENTER(&vswp->if_lockrw); 5743 if (VSW_U_P(vswp->if_state)) { 5744 RW_EXIT(&vswp->if_lockrw); 5745 D3(vswp, "%s: (caller %d) finally sending up stack" 5746 " for addr 0x%llx", __func__, caller, key); 5747 nmp = copymsg(mp); 5748 if (nmp) 5749 mac_rx(vswp->if_mh, NULL, nmp); 5750 } else { 5751 RW_EXIT(&vswp->if_lockrw); 5752 } 5753 } 5754 5755 freemsg(mp); 5756 5757 D1(vswp, "%s: exit", __func__); 5758 5759 return (0); 5760 } 5761 5762 /* transmit the packet over the given port */ 5763 static int 5764 vsw_portsend(vsw_port_t *port, mblk_t *mp) 5765 { 5766 vsw_ldc_list_t *ldcl = &port->p_ldclist; 5767 vsw_ldc_t *ldcp; 5768 int status = 0; 5769 5770 5771 READ_ENTER(&ldcl->lockrw); 5772 /* 5773 * Note for now, we have a single channel. 5774 */ 5775 ldcp = ldcl->head; 5776 if (ldcp == NULL) { 5777 DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n"); 5778 freemsg(mp); 5779 RW_EXIT(&ldcl->lockrw); 5780 return (1); 5781 } 5782 5783 /* 5784 * Send the message out using the appropriate 5785 * transmit function which will free mblock when it 5786 * is finished with it. 
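 *
 * (For illustration only: port->transmit is the function pointer
 * installed during attribute negotiation in
 * vsw_process_ctrl_attr_pkt(), so this path is independent of the
 * agreed transfer mode. The dispatch pattern with simplified types:
 *
 *	typedef int (*vsw_tx_fn_t)(vsw_ldc_t *, mblk_t *);
 *	vsw_tx_fn_t transmit;
 *
 *	status = (*transmit)(ldcp, mp);
 *
 * where transmit was previously set to vsw_dringsend or
 * vsw_descrsend. Either target consumes the mblk, which is why the
 * caller must not touch mp afterwards.)
 *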
5787 */ 5788 mutex_enter(&port->tx_lock); 5789 if (port->transmit != NULL) 5790 status = (*port->transmit)(ldcp, mp); 5791 else { 5792 freemsg(mp); 5793 } 5794 mutex_exit(&port->tx_lock); 5795 5796 RW_EXIT(&ldcl->lockrw); 5797 5798 return (status); 5799 } 5800 5801 /* 5802 * Send packet out via descriptor ring to a logical device. 5803 */ 5804 static int 5805 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp) 5806 { 5807 vio_dring_msg_t dring_pkt; 5808 dring_info_t *dp = NULL; 5809 vsw_private_desc_t *priv_desc = NULL; 5810 vnet_public_desc_t *pub = NULL; 5811 vsw_t *vswp = ldcp->ldc_vswp; 5812 mblk_t *bp; 5813 size_t n, size; 5814 caddr_t bufp; 5815 int idx; 5816 int status = LDC_TX_SUCCESS; 5817 5818 D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id); 5819 5820 /* TODO: make test a macro */ 5821 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 5822 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 5823 DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping " 5824 "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status, 5825 ldcp->lane_out.lstate); 5826 freemsg(mp); 5827 return (LDC_TX_FAILURE); 5828 } 5829 5830 /* 5831 * Note - using first ring only, this may change 5832 * in the future. 5833 */ 5834 if ((dp = ldcp->lane_out.dringp) == NULL) { 5835 DERR(vswp, "%s(%lld): no dring for outbound lane on" 5836 " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id); 5837 freemsg(mp); 5838 return (LDC_TX_FAILURE); 5839 } 5840 5841 size = msgsize(mp); 5842 if (size > (size_t)ETHERMAX) { 5843 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 5844 ldcp->ldc_id, size); 5845 freemsg(mp); 5846 return (LDC_TX_FAILURE); 5847 } 5848 5849 /* 5850 * Find a free descriptor 5851 * 5852 * Note: for the moment we are assuming that we will only 5853 * have one dring going from the switch to each of its 5854 * peers. This may change in the future. 5855 */ 5856 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 5857 D2(vswp, "%s(%lld): no descriptor available for ring " 5858 "at 0x%llx", __func__, ldcp->ldc_id, dp); 5859 5860 /* nothing more we can do */ 5861 status = LDC_TX_NORESOURCES; 5862 goto vsw_dringsend_free_exit; 5863 } else { 5864 D2(vswp, "%s(%lld): free private descriptor found at pos " 5865 "%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx, 5866 priv_desc); 5867 } 5868 5869 /* copy data into the descriptor */ 5870 bufp = priv_desc->datap; 5871 bufp += VNET_IPALIGN; 5872 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 5873 n = MBLKL(bp); 5874 bcopy(bp->b_rptr, bufp, n); 5875 bufp += n; 5876 } 5877 5878 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 5879 5880 pub = priv_desc->descp; 5881 pub->nbytes = priv_desc->datalen; 5882 5883 mutex_enter(&priv_desc->dstate_lock); 5884 pub->hdr.dstate = VIO_DESC_READY; 5885 mutex_exit(&priv_desc->dstate_lock); 5886 5887 /* 5888 * Determine whether or not we need to send a message to our 5889 * peer prompting them to read our newly updated descriptor(s). 5890 */ 5891 mutex_enter(&dp->restart_lock); 5892 if (dp->restart_reqd) { 5893 dp->restart_reqd = B_FALSE; 5894 mutex_exit(&dp->restart_lock); 5895 5896 /* 5897 * Send a vio_dring_msg to peer to prompt them to read 5898 * the updated descriptor ring. 
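 *
 * (For illustration only, with types chosen for the sketch: the
 * restart message tells the peer where to resume reading. A helper
 * with the same arithmetic as the code below; last_ack_recv == -1
 * means nothing has been ACK'd yet:
 *
 *	static uint32_t
 *	restart_start_idx(int64_t last_ack_recv, uint32_t ndesc)
 *	{
 *		if (last_ack_recv == -1)
 *			return (0);
 *		return ((uint32_t)(last_ack_recv + 1) % ndesc);
 *	}
 *
 * end_idx is then set to -1, i.e. an unbounded range: process from
 * start_idx until no more READY descriptors are found.)
 *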
5899 */ 5900 dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA; 5901 dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 5902 dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA; 5903 dring_pkt.tag.vio_sid = ldcp->local_session; 5904 5905 /* Note - for now using first ring */ 5906 dring_pkt.dring_ident = dp->ident; 5907 5908 mutex_enter(&ldcp->lane_out.seq_lock); 5909 dring_pkt.seq_num = ldcp->lane_out.seq_num++; 5910 mutex_exit(&ldcp->lane_out.seq_lock); 5911 5912 /* 5913 * If last_ack_recv is -1 then we know we've not 5914 * received any acks yet, so this must be the first 5915 * msg sent, so set the start to the beginning of the ring. 5916 */ 5917 mutex_enter(&dp->dlock); 5918 if (dp->last_ack_recv == -1) { 5919 dring_pkt.start_idx = 0; 5920 } else { 5921 dring_pkt.start_idx = (dp->last_ack_recv + 1) % 5922 dp->num_descriptors; 5923 } 5924 dring_pkt.end_idx = -1; 5925 mutex_exit(&dp->dlock); 5926 5927 D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__, 5928 ldcp->ldc_id, dp, dring_pkt.dring_ident); 5929 D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n", 5930 __func__, ldcp->ldc_id, dring_pkt.start_idx, 5931 dring_pkt.end_idx, dring_pkt.seq_num); 5932 5933 vsw_send_msg(ldcp, (void *)&dring_pkt, 5934 sizeof (vio_dring_msg_t)); 5935 } else { 5936 mutex_exit(&dp->restart_lock); 5937 D2(vswp, "%s(%lld): updating descp %d", __func__, 5938 ldcp->ldc_id, idx); 5939 } 5940 5941 vsw_dringsend_free_exit: 5942 5943 /* free the message block */ 5944 freemsg(mp); 5945 5946 D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); 5947 return (status); 5948 } 5949 5950 /* 5951 * Send an in-band descriptor message over ldc. 5952 */ 5953 static int 5954 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp) 5955 { 5956 vsw_t *vswp = ldcp->ldc_vswp; 5957 vio_ibnd_desc_t ibnd_msg; 5958 vsw_private_desc_t *priv_desc = NULL; 5959 dring_info_t *dp = NULL; 5960 size_t n, size = 0; 5961 caddr_t bufp; 5962 mblk_t *bp; 5963 int idx, i; 5964 int status = LDC_TX_SUCCESS; 5965 static int warn_msg = 1; 5966 5967 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5968 5969 ASSERT(mp != NULL); 5970 5971 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 5972 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 5973 DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt", 5974 __func__, ldcp->ldc_id, ldcp->ldc_status, 5975 ldcp->lane_out.lstate); 5976 freemsg(mp); 5977 return (LDC_TX_FAILURE); 5978 } 5979 5980 /* 5981 * only expect single dring to exist, which we use 5982 as an internal buffer, rather than a transfer channel.
5983 */ 5984 if ((dp = ldcp->lane_out.dringp) == NULL) { 5985 DERR(vswp, "%s(%lld): no dring for outbound lane", 5986 __func__, ldcp->ldc_id); 5987 DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", 5988 __func__, ldcp->ldc_id, ldcp->ldc_status, 5989 ldcp->lane_out.lstate); 5990 freemsg(mp); 5991 return (LDC_TX_FAILURE); 5992 } 5993 5994 size = msgsize(mp); 5995 if (size > (size_t)ETHERMAX) { 5996 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 5997 ldcp->ldc_id, size); 5998 freemsg(mp); 5999 return (LDC_TX_FAILURE); 6000 } 6001 6002 /* 6003 * Find a free descriptor in our buffer ring 6004 */ 6005 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 6006 if (warn_msg) { 6007 DERR(vswp, "%s(%lld): no descriptor available for ring " 6008 "at 0x%llx", __func__, ldcp->ldc_id, dp); 6009 warn_msg = 0; 6010 } 6011 6012 /* nothing more we can do */ 6013 status = LDC_TX_NORESOURCES; 6014 goto vsw_descrsend_free_exit; 6015 } else { 6016 D2(vswp, "%s(%lld): free private descriptor found at pos " 6017 "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, 6018 priv_desc); 6019 warn_msg = 1; 6020 } 6021 6022 /* copy data into the descriptor */ 6023 bufp = priv_desc->datap; 6024 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 6025 n = MBLKL(bp); 6026 bcopy(bp->b_rptr, bufp, n); 6027 bufp += n; 6028 } 6029 6030 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 6031 6032 /* create and send the in-band descp msg */ 6033 ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA; 6034 ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO; 6035 ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA; 6036 ibnd_msg.hdr.tag.vio_sid = ldcp->local_session; 6037 6038 mutex_enter(&ldcp->lane_out.seq_lock); 6039 ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++; 6040 mutex_exit(&ldcp->lane_out.seq_lock); 6041 6042 /* 6043 * Copy the mem cookies describing the data from the 6044 * private region of the descriptor ring into the inband 6045 * descriptor. 
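 *
 * (For illustration only: the bcopy loop earlier in this function
 * flattens a possibly fragmented mblk chain, linked through b_cont,
 * into the descriptor's contiguous buffer. The walk in standalone
 * form, using the same MBLKL/bcopy used throughout this file:
 *
 *	static size_t
 *	mblk_flatten(const mblk_t *bp, caddr_t bufp)
 *	{
 *		size_t total = 0;
 *		size_t n;
 *
 *		for (; bp != NULL; bp = bp->b_cont) {
 *			n = MBLKL(bp);
 *			bcopy(bp->b_rptr, bufp, n);
 *			bufp += n;
 *			total += n;
 *		}
 *		return (total);
 *	}
 *
 * b_cont links the fragments of one packet; b_next, used elsewhere
 * in this file, links separate packets.)
 *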
6046 */ 6047 for (i = 0; i < priv_desc->ncookies; i++) { 6048 bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i], 6049 sizeof (ldc_mem_cookie_t)); 6050 } 6051 6052 ibnd_msg.hdr.desc_handle = idx; 6053 ibnd_msg.ncookies = priv_desc->ncookies; 6054 ibnd_msg.nbytes = size; 6055 6056 vsw_send_msg(ldcp, (void *)&ibnd_msg, sizeof (vio_ibnd_desc_t)); 6057 6058 vsw_descrsend_free_exit: 6059 6060 /* free the allocated message blocks */ 6061 freemsg(mp); 6062 6063 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 6064 return (status); 6065 } 6066 6067 static void 6068 vsw_send_ver(vsw_ldc_t *ldcp) 6069 { 6070 vsw_t *vswp = ldcp->ldc_vswp; 6071 lane_t *lp = &ldcp->lane_out; 6072 vio_ver_msg_t ver_msg; 6073 6074 D1(vswp, "%s enter", __func__); 6075 6076 ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 6077 ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 6078 ver_msg.tag.vio_subtype_env = VIO_VER_INFO; 6079 ver_msg.tag.vio_sid = ldcp->local_session; 6080 6081 ver_msg.ver_major = vsw_versions[0].ver_major; 6082 ver_msg.ver_minor = vsw_versions[0].ver_minor; 6083 ver_msg.dev_class = VDEV_NETWORK_SWITCH; 6084 6085 lp->lstate |= VSW_VER_INFO_SENT; 6086 lp->ver_major = ver_msg.ver_major; 6087 lp->ver_minor = ver_msg.ver_minor; 6088 6089 DUMP_TAG(ver_msg.tag); 6090 6091 vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t)); 6092 6093 D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id); 6094 } 6095 6096 static void 6097 vsw_send_attr(vsw_ldc_t *ldcp) 6098 { 6099 vsw_t *vswp = ldcp->ldc_vswp; 6100 lane_t *lp = &ldcp->lane_out; 6101 vnet_attr_msg_t attr_msg; 6102 6103 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); 6104 6105 /* 6106 * Subtype is set to INFO by default 6107 */ 6108 attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 6109 attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 6110 attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO; 6111 attr_msg.tag.vio_sid = ldcp->local_session; 6112 6113 /* payload copied from default settings for lane */ 6114 attr_msg.mtu = lp->mtu; 6115 attr_msg.addr_type = lp->addr_type; 6116 attr_msg.xfer_mode = lp->xfer_mode; 6117 attr_msg.ack_freq = lp->ack_freq; 6118 6119 READ_ENTER(&vswp->if_lockrw); 6120 bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL); 6121 RW_EXIT(&vswp->if_lockrw); 6122 6123 ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT; 6124 6125 DUMP_TAG(attr_msg.tag); 6126 6127 vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t)); 6128 6129 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); 6130 } 6131 6132 /* 6133 * Create dring info msg (which also results in the creation of 6134 * a dring). 6135 */ 6136 static vio_dring_reg_msg_t * 6137 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp) 6138 { 6139 vio_dring_reg_msg_t *mp; 6140 dring_info_t *dp; 6141 vsw_t *vswp = ldcp->ldc_vswp; 6142 6143 D1(vswp, "vsw_create_dring_info_pkt enter\n"); 6144 6145 /* 6146 * If we can't create a dring, obviously no point sending 6147 a message.
6148 */ 6149 if ((dp = vsw_create_dring(ldcp)) == NULL) 6150 return (NULL); 6151 6152 mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP); 6153 6154 mp->tag.vio_msgtype = VIO_TYPE_CTRL; 6155 mp->tag.vio_subtype = VIO_SUBTYPE_INFO; 6156 mp->tag.vio_subtype_env = VIO_DRING_REG; 6157 mp->tag.vio_sid = ldcp->local_session; 6158 6159 /* payload */ 6160 mp->num_descriptors = dp->num_descriptors; 6161 mp->descriptor_size = dp->descriptor_size; 6162 mp->options = dp->options; 6163 mp->ncookies = dp->ncookies; 6164 bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t)); 6165 6166 mp->dring_ident = 0; 6167 6168 D1(vswp, "vsw_create_dring_info_pkt exit\n"); 6169 6170 return (mp); 6171 } 6172 6173 static void 6174 vsw_send_dring_info(vsw_ldc_t *ldcp) 6175 { 6176 vio_dring_reg_msg_t *dring_msg; 6177 vsw_t *vswp = ldcp->ldc_vswp; 6178 6179 D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id); 6180 6181 dring_msg = vsw_create_dring_info_pkt(ldcp); 6182 if (dring_msg == NULL) { 6183 cmn_err(CE_WARN, "vsw_send_dring_info: error creating msg"); 6184 return; 6185 } 6186 6187 ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT; 6188 6189 DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg); 6190 6191 vsw_send_msg(ldcp, dring_msg, 6192 sizeof (vio_dring_reg_msg_t)); 6193 6194 kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t)); 6195 6196 D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id); 6197 } 6198 6199 static void 6200 vsw_send_rdx(vsw_ldc_t *ldcp) 6201 { 6202 vsw_t *vswp = ldcp->ldc_vswp; 6203 vio_rdx_msg_t rdx_msg; 6204 6205 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); 6206 6207 rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 6208 rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 6209 rdx_msg.tag.vio_subtype_env = VIO_RDX; 6210 rdx_msg.tag.vio_sid = ldcp->local_session; 6211 6212 ldcp->lane_out.lstate |= VSW_RDX_INFO_SENT; 6213 6214 DUMP_TAG(rdx_msg.tag); 6215 6216 vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t)); 6217 6218 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); 6219 } 6220 6221 /* 6222 * Generic routine to send message out over ldc channel. 6223 */ 6224 static void 6225 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size) 6226 { 6227 int rv; 6228 size_t msglen = size; 6229 vio_msg_tag_t *tag = (vio_msg_tag_t *)msgp; 6230 vsw_t *vswp = ldcp->ldc_vswp; 6231 6232 D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes", 6233 ldcp->ldc_id, size); 6234 6235 D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype); 6236 D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype); 6237 D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env); 6238 6239 mutex_enter(&ldcp->ldc_txlock); 6240 do { 6241 msglen = size; 6242 rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen); 6243 } while (rv == EWOULDBLOCK && --vsw_wretries > 0); 6244 6245 mutex_exit(&ldcp->ldc_txlock); 6246 6247 if ((rv != 0) || (msglen != size)) { 6248 DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) " 6249 "rv(%d) size (%d) msglen(%d)\n", ldcp->ldc_id, 6250 rv, size, msglen); 6251 } 6252 6253 D1(vswp, "vsw_send_msg (%lld) exit : sent %d bytes", 6254 ldcp->ldc_id, msglen); 6255 } 6256 6257 /* 6258 * Add an entry into FDB, for the given mac address and port_id. 6259 * Returns 0 on success, 1 on failure. 6260 * 6261 * Lock protecting FDB must be held by calling process. 
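 *
 * (A caution on the retry loop in vsw_send_msg() above: it
 * decrements the module-global tunable vsw_wretries directly, so
 * retries spent on one congested channel are gone for every later
 * caller. A sketch that preserves the tunable by counting on the
 * stack instead:
 *
 *	int retries = vsw_wretries;
 *
 *	do {
 *		msglen = size;
 *		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
 *	} while (rv == EWOULDBLOCK && --retries > 0);
 *
 * Resetting msglen on each pass matters because ldc_write() may
 * update it with the number of bytes actually written, which the
 * caller then compares against size.)
 *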
6262 */ 6263 static int 6264 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port) 6265 { 6266 uint64_t addr = 0; 6267 6268 D1(vswp, "%s: enter", __func__); 6269 6270 KEY_HASH(addr, port->p_macaddr); 6271 6272 D2(vswp, "%s: key = 0x%llx", __func__, addr); 6273 6274 /* 6275 * Note: duplicate keys will be rejected by mod_hash. 6276 */ 6277 if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr, 6278 (mod_hash_val_t)port) != 0) { 6279 DERR(vswp, "%s: unable to add entry into fdb.", __func__); 6280 return (1); 6281 } 6282 6283 D1(vswp, "%s: exit", __func__); 6284 return (0); 6285 } 6286 6287 /* 6288 * Remove an entry from FDB. 6289 * Returns 0 on success, 1 on failure. 6290 */ 6291 static int 6292 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port) 6293 { 6294 uint64_t addr = 0; 6295 6296 D1(vswp, "%s: enter", __func__); 6297 6298 KEY_HASH(addr, port->p_macaddr); 6299 6300 D2(vswp, "%s: key = 0x%llx", __func__, addr); 6301 6302 (void) mod_hash_destroy(vswp->fdb, (mod_hash_key_t)addr); 6303 6304 D1(vswp, "%s: exit", __func__); 6305 6306 return (0); 6307 } 6308 6309 /* 6310 * Search fdb for a given mac address. 6311 * Returns pointer to the entry if found, else returns NULL. 6312 */ 6313 static vsw_port_t * 6314 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp) 6315 { 6316 uint64_t key = 0; 6317 vsw_port_t *port = NULL; 6318 6319 D1(vswp, "%s: enter", __func__); 6320 6321 KEY_HASH(key, ehp->ether_dhost); 6322 6323 D2(vswp, "%s: key = 0x%llx", __func__, key); 6324 6325 if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key, 6326 (mod_hash_val_t *)&port) != 0) { 6327 return (NULL); 6328 } 6329 6330 D1(vswp, "%s: exit", __func__); 6331 6332 return (port); 6333 } 6334 6335 /* 6336 * Add or remove multicast address(es). 6337 * 6338 * Returns 0 on success, 1 on failure. 6339 */ 6340 static int 6341 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port) 6342 { 6343 mcst_addr_t *mcst_p = NULL; 6344 vsw_t *vswp = port->p_vswp; 6345 uint64_t addr = 0x0; 6346 int i, ret; 6347 6348 D1(vswp, "%s: enter", __func__); 6349 6350 D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count); 6351 6352 if (vswp->mh == NULL) 6353 return (1); 6354 6355 for (i = 0; i < mcst_pkt->count; i++) { 6356 /* 6357 * Convert address into form that can be used 6358 * as hash table key. 6359 */ 6360 KEY_HASH(addr, mcst_pkt->mca[i]); 6361 6362 /* 6363 * Add or delete the specified address/port combination. 6364 */ 6365 if (mcst_pkt->set == 0x1) { 6366 D3(vswp, "%s: adding multicast address 0x%llx for " 6367 "port %ld", __func__, addr, port->p_instance); 6368 if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 6369 /* 6370 * Update the list of multicast 6371 * addresses contained within the 6372 * port structure to include this new 6373 * one. 6374 */ 6375 mcst_p = kmem_alloc(sizeof (mcst_addr_t), 6376 KM_NOSLEEP); 6377 if (mcst_p == NULL) { 6378 DERR(vswp, "%s: unable to alloc mem", 6379 __func__); /* back out the entry just added to the mFDB */ (void) vsw_del_mcst(vswp, VSW_VNETPORT, addr, port); 6380 return (1); 6381 } 6382 6383 mcst_p->nextp = NULL; 6384 mcst_p->addr = addr; 6385 6386 mutex_enter(&port->mca_lock); 6387 mcst_p->nextp = port->mcap; 6388 port->mcap = mcst_p; 6389 mutex_exit(&port->mca_lock); 6390 6391 /* 6392 * Program the address into HW.
If the addr 6393 * has already been programmed then the MAC 6394 * just increments a ref counter (which is 6395 * used when the address is being deleted) 6396 */ 6397 ret = mac_multicst_add(vswp->mh, 6398 (uchar_t *)&mcst_pkt->mca[i]); 6399 if (ret) { 6400 cmn_err(CE_WARN, "!unable to add " 6401 "multicast address"); 6402 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 6403 addr, port); 6404 vsw_del_addr(VSW_VNETPORT, port, addr); 6405 return (ret); 6406 } 6407 6408 } else { 6409 DERR(vswp, "%s: error adding multicast " 6410 "address 0x%llx for port %ld", 6411 __func__, addr, port->p_instance); 6412 return (1); 6413 } 6414 } else { 6415 /* 6416 * Delete an entry from the multicast hash 6417 * table and update the address list 6418 * appropriately. 6419 */ 6420 if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 6421 D3(vswp, "%s: deleting multicast address " 6422 "0x%llx for port %ld", __func__, addr, 6423 port->p_instance); 6424 6425 vsw_del_addr(VSW_VNETPORT, port, addr); 6426 6427 /* 6428 * Remove the address from HW. The address 6429 * will actually only be removed once the ref 6430 * count within the MAC layer has dropped to 6431 * zero. I.e. we can safely call this fn even 6432 * if other ports are interested in this 6433 * address. 6434 */ 6435 (void) mac_multicst_remove(vswp->mh, 6436 (uchar_t *)&mcst_pkt->mca[i]); 6437 6438 } else { 6439 DERR(vswp, "%s: error deleting multicast " 6440 "addr 0x%llx for port %ld", 6441 __func__, addr, port->p_instance); 6442 return (1); 6443 } 6444 } 6445 } 6446 D1(vswp, "%s: exit", __func__); 6447 return (0); 6448 } 6449 6450 /* 6451 * Add a new multicast entry. 6452 * 6453 * Search hash table based on address. If match found then 6454 * update associated val (which is chain of ports), otherwise 6455 * create new key/val (addr/port) pair and insert into table. 6456 */ 6457 static int 6458 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 6459 { 6460 int dup = 0; 6461 int rv = 0; 6462 mfdb_ent_t *ment = NULL; 6463 mfdb_ent_t *tmp_ent = NULL; 6464 mfdb_ent_t *new_ent = NULL; 6465 void *tgt = NULL; 6466 6467 if (devtype == VSW_VNETPORT) { 6468 /* 6469 * Being invoked from a vnet. 6470 */ 6471 ASSERT(arg != NULL); 6472 tgt = arg; 6473 D2(NULL, "%s: port %d : address 0x%llx", __func__, 6474 ((vsw_port_t *)arg)->p_instance, addr); 6475 } else { 6476 /* 6477 * We are being invoked via the m_multicst mac entry 6478 * point. 6479 */ 6480 D2(NULL, "%s: address 0x%llx", __func__, addr); 6481 tgt = (void *)vswp; 6482 } 6483 6484 WRITE_ENTER(&vswp->mfdbrw); 6485 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 6486 (mod_hash_val_t *)&ment) != 0) { 6487 6488 /* address not currently in table */ 6489 ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 6490 ment->d_addr = (void *)tgt; 6491 ment->d_type = devtype; 6492 ment->nextp = NULL; 6493 6494 if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr, 6495 (mod_hash_val_t)ment) != 0) { 6496 DERR(vswp, "%s: hash table insertion failed", __func__); 6497 kmem_free(ment, sizeof (mfdb_ent_t)); 6498 rv = 1; 6499 } else { 6500 D2(vswp, "%s: added initial entry for 0x%llx to " 6501 "table", __func__, addr); 6502 } 6503 } else { 6504 /* 6505 * Address in table. Check to see if specified port 6506 * is already associated with the address. If not add 6507 * it now. 
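 *
 * (Editorial note: each mfdb hash value is the head of a singly
 * linked chain of mfdb_ent_t nodes, one per interested party:
 *
 *	addr (key) -> ment -> ment->nextp -> ... -> NULL
 *	              d_addr = port or vsw instance
 *	              d_type = VSW_VNETPORT or VSW_LOCALDEV
 *
 * so the scan below is a simple duplicate check along that chain.)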
6508 */ 6509 tmp_ent = ment; 6510 while (tmp_ent != NULL) { 6511 if (tmp_ent->d_addr == (void *)tgt) { 6512 if (devtype == VSW_VNETPORT) { 6513 DERR(vswp, "%s: duplicate port entry " 6514 "found for portid %ld and key " 6515 "0x%llx", __func__, 6516 ((vsw_port_t *)arg)->p_instance, 6517 addr); 6518 } else { 6519 DERR(vswp, "%s: duplicate entry found " 6520 "for key 0x%llx", 6521 __func__, addr); 6522 } 6523 rv = 1; 6524 dup = 1; 6525 break; 6526 } 6527 tmp_ent = tmp_ent->nextp; 6528 } 6529 6530 /* 6531 * Port not on list so add it to end now. 6532 */ 6533 if (0 == dup) { 6534 D2(vswp, "%s: added entry for 0x%llx to table", 6535 __func__, addr); 6536 new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 6537 new_ent->d_addr = (void *)tgt; 6538 new_ent->d_type = devtype; 6539 new_ent->nextp = NULL; 6540 6541 tmp_ent = ment; 6542 while (tmp_ent->nextp != NULL) 6543 tmp_ent = tmp_ent->nextp; 6544 6545 tmp_ent->nextp = new_ent; 6546 } 6547 } 6548 6549 RW_EXIT(&vswp->mfdbrw); 6550 return (rv); 6551 } 6552 6553 /* 6554 * Remove a multicast entry from the hashtable. 6555 * 6556 * Search hash table based on address. If match found, scan 6557 * list of ports associated with address. If specified port 6558 * found remove it from list. 6559 */ 6560 static int 6561 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 6562 { 6563 mfdb_ent_t *ment = NULL; 6564 mfdb_ent_t *curr_p, *prev_p; 6565 void *tgt = NULL; 6566 6567 D1(vswp, "%s: enter", __func__); 6568 6569 if (devtype == VSW_VNETPORT) { 6570 tgt = (vsw_port_t *)arg; 6571 D2(vswp, "%s: removing port %d from mFDB for address" 6572 " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, 6573 addr); 6574 } else { 6575 D2(vswp, "%s: removing entry", __func__); 6576 tgt = (void *)vswp; 6577 } 6578 6579 WRITE_ENTER(&vswp->mfdbrw); 6580 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 6581 (mod_hash_val_t *)&ment) != 0) { 6582 D2(vswp, "%s: address 0x%llx not in table", __func__, addr); 6583 RW_EXIT(&vswp->mfdbrw); 6584 return (1); 6585 } 6586 6587 prev_p = curr_p = ment; 6588 6589 while (curr_p != NULL) { 6590 if (curr_p->d_addr == (void *)tgt) { 6591 if (devtype == VSW_VNETPORT) { 6592 D2(vswp, "%s: port %d found", __func__, 6593 ((vsw_port_t *)tgt)->p_instance); 6594 } else { 6595 D2(vswp, "%s: instance found", __func__); 6596 } 6597 6598 if (prev_p == curr_p) { 6599 /* 6600 * head of list, if no other element is in 6601 * list then destroy this entry, otherwise 6602 * just replace it with updated value. 6603 */ 6604 ment = curr_p->nextp; 6605 kmem_free(curr_p, sizeof (mfdb_ent_t)); 6606 if (ment == NULL) { 6607 (void) mod_hash_destroy(vswp->mfdb, 6608 (mod_hash_key_t)addr); 6609 } else { 6610 (void) mod_hash_replace(vswp->mfdb, 6611 (mod_hash_key_t)addr, 6612 (mod_hash_val_t)ment); 6613 } 6614 } else { 6615 /* 6616 * Not head of list, no need to do 6617 * replacement, just adjust list pointers. 6618 */ 6619 prev_p->nextp = curr_p->nextp; 6620 kmem_free(curr_p, sizeof (mfdb_ent_t)); 6621 } 6622 break; 6623 } 6624 6625 prev_p = curr_p; 6626 curr_p = curr_p->nextp; 6627 } 6628 6629 RW_EXIT(&vswp->mfdbrw); 6630 6631 D1(vswp, "%s: exit", __func__); 6632 6633 return (0); 6634 } 6635 6636 /* 6637 * Port is being deleted, but has registered an interest in one 6638 * or more multicast groups. Using the list of addresses maintained 6639 * within the port structure find the appropriate entry in the hash 6640 * table and remove this port from the list of interested ports.
6641 */ 6642 static void 6643 vsw_del_mcst_port(vsw_port_t *port) 6644 { 6645 mcst_addr_t *mcst_p = NULL; 6646 vsw_t *vswp = port->p_vswp; 6647 6648 D1(vswp, "%s: enter", __func__); 6649 6650 mutex_enter(&port->mca_lock); 6651 while (port->mcap != NULL) { 6652 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 6653 port->mcap->addr, port); 6654 6655 mcst_p = port->mcap->nextp; 6656 kmem_free(port->mcap, sizeof (mcst_addr_t)); 6657 port->mcap = mcst_p; 6658 } 6659 mutex_exit(&port->mca_lock); 6660 6661 D1(vswp, "%s: exit", __func__); 6662 } 6663 6664 /* 6665 * This vsw instance is detaching, but has registered an interest in one 6666 * or more multicast groups. Using the list of addresses maintained 6667 * within the vsw structure find the appropriate entry in the hash 6668 * table and remove this instance from the list of interested ports. 6669 */ 6670 static void 6671 vsw_del_mcst_vsw(vsw_t *vswp) 6672 { 6673 mcst_addr_t *next_p = NULL; 6674 6675 D1(vswp, "%s: enter", __func__); 6676 6677 mutex_enter(&vswp->mca_lock); 6678 6679 while (vswp->mcap != NULL) { 6680 DERR(vswp, "%s: deleting addr 0x%llx", 6681 __func__, vswp->mcap->addr); 6682 (void) vsw_del_mcst(vswp, VSW_LOCALDEV, 6683 vswp->mcap->addr, NULL); 6684 6685 next_p = vswp->mcap->nextp; 6686 kmem_free(vswp->mcap, sizeof (mcst_addr_t)); 6687 vswp->mcap = next_p; 6688 } 6689 6690 vswp->mcap = NULL; 6691 mutex_exit(&vswp->mca_lock); 6692 6693 D1(vswp, "%s: exit", __func__); 6694 } 6695 6696 6697 /* 6698 * Remove the specified address from the list of addresses maintained 6699 * in this port node. 6700 */ 6701 static void 6702 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr) 6703 { 6704 vsw_t *vswp = NULL; 6705 vsw_port_t *port = NULL; 6706 mcst_addr_t *prev_p = NULL; 6707 mcst_addr_t *curr_p = NULL; 6708 6709 D1(NULL, "%s: enter : devtype %d : addr 0x%llx", 6710 __func__, devtype, addr); 6711 6712 if (devtype == VSW_VNETPORT) { 6713 port = (vsw_port_t *)arg; 6714 mutex_enter(&port->mca_lock); 6715 prev_p = curr_p = port->mcap; 6716 } else { 6717 vswp = (vsw_t *)arg; 6718 mutex_enter(&vswp->mca_lock); 6719 prev_p = curr_p = vswp->mcap; 6720 } 6721 6722 while (curr_p != NULL) { 6723 if (curr_p->addr == addr) { 6724 D2(NULL, "%s: address found", __func__); 6725 /* match found */ 6726 if (prev_p == curr_p) { 6727 /* list head */ 6728 if (devtype == VSW_VNETPORT) 6729 port->mcap = curr_p->nextp; 6730 else 6731 vswp->mcap = curr_p->nextp; 6732 } else { 6733 prev_p->nextp = curr_p->nextp; 6734 } 6735 kmem_free(curr_p, sizeof (mcst_addr_t)); 6736 break; 6737 } else { 6738 prev_p = curr_p; 6739 curr_p = curr_p->nextp; 6740 } 6741 } 6742 6743 if (devtype == VSW_VNETPORT) 6744 mutex_exit(&port->mca_lock); 6745 else 6746 mutex_exit(&vswp->mca_lock); 6747 6748 D1(NULL, "%s: exit", __func__); 6749 } 6750 6751 /* 6752 * Creates a descriptor ring (dring) and links it into the 6753 * list of outbound drings for this channel. 6754 * 6755 * Returns NULL if creation failed.
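 *
 * (Editorial summary of the sequence below, for orientation:
 *
 *	ldc_mem_dring_create()	- allocate the exported (public) ring
 *	ldc_mem_dring_info()	- obtain its base address
 *	kmem_zalloc()		- allocate the private shadow ring
 *	vsw_setup_ring()	- allocate/bind per-descriptor buffers
 *	ldc_mem_dring_bind()	- bind ring to the channel, get cookie
 *
 * Any failure unwinds the earlier steps before returning NULL.)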
6756 */ 6757 static dring_info_t * 6758 vsw_create_dring(vsw_ldc_t *ldcp) 6759 { 6760 vsw_private_desc_t *priv_addr = NULL; 6761 vsw_t *vswp = ldcp->ldc_vswp; 6762 ldc_mem_info_t minfo; 6763 dring_info_t *dp, *tp; 6764 int i; 6765 6766 dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 6767 6768 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 6769 6770 /* create public section of ring */ 6771 if ((ldc_mem_dring_create(VSW_RING_NUM_EL, 6772 VSW_PUB_SIZE, &dp->handle)) != 0) { 6773 6774 DERR(vswp, "vsw_create_dring(%lld): ldc dring create " 6775 "failed", ldcp->ldc_id); 6776 goto create_fail_exit; 6777 } 6778 6779 ASSERT(dp->handle != NULL); 6780 6781 /* 6782 * Get the base address of the public section of the ring. 6783 */ 6784 if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) { 6785 DERR(vswp, "vsw_create_dring(%lld): dring info failed\n", 6786 ldcp->ldc_id); 6787 goto dring_fail_exit; 6788 } else { 6789 ASSERT(minfo.vaddr != 0); 6790 dp->pub_addr = minfo.vaddr; 6791 } 6792 6793 dp->num_descriptors = VSW_RING_NUM_EL; 6794 dp->descriptor_size = VSW_PUB_SIZE; 6795 dp->options = VIO_TX_DRING; 6796 dp->ncookies = 1; /* guaranteed by ldc */ 6797 6798 /* 6799 * create private portion of ring 6800 */ 6801 dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc( 6802 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP); 6803 6804 if (vsw_setup_ring(ldcp, dp)) { 6805 DERR(vswp, "%s: unable to setup ring", __func__); 6806 goto dring_fail_exit; 6807 } 6808 6809 /* haven't used any descriptors yet */ 6810 dp->end_idx = 0; 6811 dp->last_ack_recv = -1; 6812 6813 /* bind dring to the channel */ 6814 if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle, 6815 LDC_SHADOW_MAP, LDC_MEM_RW, 6816 &dp->cookie[0], &dp->ncookies)) != 0) { 6817 DERR(vswp, "vsw_create_dring: unable to bind to channel " 6818 "%lld", ldcp->ldc_id); 6819 goto dring_fail_exit; 6820 } 6821 6822 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 6823 dp->restart_reqd = B_TRUE; 6824 6825 /* 6826 * Only ever create rings for outgoing lane. Link it onto 6827 * end of list. 6828 */ 6829 if (ldcp->lane_out.dringp == NULL) { 6830 D2(vswp, "vsw_create_dring: adding first outbound ring"); 6831 ldcp->lane_out.dringp = dp; 6832 } else { 6833 tp = ldcp->lane_out.dringp; 6834 while (tp->next != NULL) 6835 tp = tp->next; 6836 6837 tp->next = dp; 6838 } 6839 6840 return (dp); 6841 6842 dring_fail_exit: 6843 (void) ldc_mem_dring_destroy(dp->handle); 6844 6845 create_fail_exit: 6846 if (dp->priv_addr != NULL) { 6847 priv_addr = dp->priv_addr; 6848 for (i = 0; i < VSW_RING_NUM_EL; i++) { 6849 if (priv_addr->memhandle != NULL) { /* unbind first if this handle is still bound */ if (priv_addr->bound) (void) ldc_mem_unbind_handle(priv_addr->memhandle); 6850 (void) ldc_mem_free_handle( 6851 priv_addr->memhandle); } 6852 priv_addr++; 6853 } 6854 kmem_free(dp->priv_addr, 6855 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 6856 } 6857 mutex_destroy(&dp->dlock); 6858 6859 kmem_free(dp, sizeof (dring_info_t)); 6860 return (NULL); 6861 } 6862 6863 /* 6864 * Create a ring consisting of just a private portion and link 6865 * it into the list of rings for the outbound lane. 6866 * 6867 * This type of ring is used primarily for temporary data 6868 * storage (i.e. as data buffers).
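 *
 * (Editorial note: a privring has pub_addr == NULL and is never
 * registered with the peer; vsw_dring_find_free_desc() simply hands
 * out its private descriptors as scratch buffers, e.g. when building
 * in-band descriptor messages of the kind sent near the top of this
 * section.)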
6869 */ 6870 void 6871 vsw_create_privring(vsw_ldc_t *ldcp) 6872 { 6873 dring_info_t *dp, *tp; 6874 vsw_t *vswp = ldcp->ldc_vswp; 6875 6876 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 6877 6878 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 6879 6880 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 6881 6882 /* no public section */ 6883 dp->pub_addr = NULL; 6884 6885 dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) * 6886 VSW_RING_NUM_EL), KM_SLEEP); 6887 6888 if (vsw_setup_ring(ldcp, dp)) { 6889 DERR(vswp, "%s: setup of ring failed", __func__); 6890 kmem_free(dp->priv_addr, 6891 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 6892 mutex_destroy(&dp->dlock); 6893 kmem_free(dp, sizeof (dring_info_t)); 6894 return; 6895 } 6896 6897 /* haven't used any descriptors yet */ 6898 dp->end_idx = 0; 6899 6900 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 6901 dp->restart_reqd = B_TRUE; 6902 6903 /* 6904 * Only ever create rings for outgoing lane. Link it onto 6905 * end of list. 6906 */ 6907 if (ldcp->lane_out.dringp == NULL) { 6908 D2(vswp, "%s: adding first outbound privring", __func__); 6909 ldcp->lane_out.dringp = dp; 6910 } else { 6911 tp = ldcp->lane_out.dringp; 6912 while (tp->next != NULL) 6913 tp = tp->next; 6914 6915 tp->next = dp; 6916 } 6917 6918 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 6919 } 6920 6921 /* 6922 * Setup the descriptors in the dring. Returns 0 on success, 1 on 6923 * failure. 6924 */ 6925 int 6926 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp) 6927 { 6928 vnet_public_desc_t *pub_addr = NULL; 6929 vsw_private_desc_t *priv_addr = NULL; 6930 vsw_t *vswp = ldcp->ldc_vswp; 6931 uint64_t *tmpp; 6932 uint64_t offset = 0; 6933 uint32_t ncookies = 0; 6934 static char *name = "vsw_setup_ring"; 6935 int i, j, nc, rv; 6936 6937 priv_addr = dp->priv_addr; 6938 pub_addr = dp->pub_addr; 6939 6940 /* public section may be null but private should never be */ 6941 ASSERT(priv_addr != NULL); 6942 6943 /* 6944 * Allocate the region of memory which will be used to hold 6945 * the data the descriptors will refer to. 6946 */ 6947 dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ); 6948 dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP); 6949 6950 D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name, 6951 dp->data_sz, dp->data_addr); 6952 6953 tmpp = (uint64_t *)dp->data_addr; 6954 offset = VSW_RING_EL_DATA_SZ / sizeof (*tmpp); /* in uint64_t units */ 6955 6956 /* 6957 * Initialise some of the private and public (if they exist) 6958 * descriptor fields.
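 *
 * (Editorial sketch: data_addr is carved into VSW_RING_NUM_EL
 * fixed-size slices, one per descriptor; tmpp walks the region in
 * units of uint64_t, so after each iteration of the loop below
 *
 *	priv_addr[i].datap == data_addr + i * VSW_RING_EL_DATA_SZ
 *
 * with each slice then bound to its own LDC memory handle.)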
6959 */ 6960 for (i = 0; i < VSW_RING_NUM_EL; i++) { 6961 mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL); 6962 6963 if ((ldc_mem_alloc_handle(ldcp->ldc_handle, 6964 &priv_addr->memhandle)) != 0) { 6965 DERR(vswp, "%s: alloc mem handle failed", name); 6966 goto setup_ring_cleanup; 6967 } 6968 6969 priv_addr->datap = (void *)tmpp; 6970 6971 rv = ldc_mem_bind_handle(priv_addr->memhandle, 6972 (caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ, 6973 LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W, 6974 &(priv_addr->memcookie[0]), &ncookies); 6975 if (rv != 0) { 6976 DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed " 6977 "(rv %d)", name, ldcp->ldc_id, rv); 6978 goto setup_ring_cleanup; 6979 } 6980 priv_addr->bound = 1; 6981 6982 D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx", 6983 name, i, priv_addr->memcookie[0].addr, 6984 priv_addr->memcookie[0].size); 6985 6986 if (ncookies > VSW_MAX_COOKIES) { 6987 DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned " 6988 "invalid num of cookies (%d) for size 0x%llx", 6989 name, ldcp->ldc_id, ncookies, 6990 VSW_RING_EL_DATA_SZ); 6991 6992 goto setup_ring_cleanup; 6993 } else { 6994 for (j = 1; j < ncookies; j++) { 6995 rv = ldc_mem_nextcookie(priv_addr->memhandle, 6996 &(priv_addr->memcookie[j])); 6997 if (rv != 0) { 6998 DERR(vswp, "%s: ldc_mem_nextcookie " 6999 "failed rv (%d)", name, rv); 7000 goto setup_ring_cleanup; 7001 } 7002 D3(vswp, "%s: memcookie %d : addr 0x%llx : " 7003 "size 0x%llx", name, j, 7004 priv_addr->memcookie[j].addr, 7005 priv_addr->memcookie[j].size); 7006 } 7007 7008 } 7009 priv_addr->ncookies = ncookies; 7010 priv_addr->dstate = VIO_DESC_FREE; 7011 7012 if (pub_addr != NULL) { 7013 7014 /* link pub and private sides */ 7015 priv_addr->descp = pub_addr; 7016 7017 pub_addr->ncookies = priv_addr->ncookies; 7018 7019 for (nc = 0; nc < pub_addr->ncookies; nc++) { 7020 bcopy(&priv_addr->memcookie[nc], 7021 &pub_addr->memcookie[nc], 7022 sizeof (ldc_mem_cookie_t)); 7023 } 7024 7025 pub_addr->hdr.dstate = VIO_DESC_FREE; 7026 pub_addr++; 7027 } 7028 7029 /* 7030 * move to next element in the dring and the next 7031 * position in the data buffer. 7032 */ 7033 priv_addr++; 7034 tmpp += offset; 7035 } 7036 7037 return (0); 7038 7039 setup_ring_cleanup: 7040 priv_addr = dp->priv_addr; 7041 7042 /* include the element that failed part-way through (j <= i) */ for (j = 0; j <= i; j++) { 7043 if (priv_addr->bound) (void) ldc_mem_unbind_handle(priv_addr->memhandle); 7044 if (priv_addr->memhandle != NULL) (void) ldc_mem_free_handle(priv_addr->memhandle); 7045 /* clear state so a later cleanup pass cannot free these again */ priv_addr->memhandle = NULL; priv_addr->bound = 0; 7046 mutex_destroy(&priv_addr->dstate_lock); 7047 7048 priv_addr++; 7049 } 7050 kmem_free(dp->data_addr, dp->data_sz); dp->data_addr = NULL; 7051 7052 return (1); 7053 } 7054 7055 /* 7056 * Searches the private section of a ring for a free descriptor, 7057 * starting at the location of the last free descriptor found 7058 * previously. 7059 * 7060 * Returns 0 if free descriptor is available, and updates state 7061 * of private descriptor to VIO_DESC_READY, otherwise returns 1. 7062 * 7063 * FUTURE: might need to return contiguous range of descriptors 7064 * as dring info msg assumes all will be contiguous.
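 *
 * (Editorial sketch of the descriptor lifecycle this search relies
 * on, using the states seen in this file:
 *
 *	VIO_DESC_FREE --(this routine)--> VIO_DESC_READY
 *	    ^                                  |
 *	    +--(returned to FREE elsewhere, once the peer consumes it)
 *
 * end_idx advances modulo VSW_RING_NUM_EL, so the search resumes
 * where the last free descriptor was handed out.)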
7065 */ 7066 static int 7067 vsw_dring_find_free_desc(dring_info_t *dringp, 7068 vsw_private_desc_t **priv_p, int *idx) 7069 { 7070 vsw_private_desc_t *addr = NULL; 7071 int num = VSW_RING_NUM_EL; 7072 int ret = 1; 7073 7074 D1(NULL, "%s enter\n", __func__); 7075 7076 ASSERT(dringp->priv_addr != NULL); 7077 7078 D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld", 7079 __func__, dringp, dringp->end_idx); 7080 7081 addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx; 7082 7083 mutex_enter(&addr->dstate_lock); 7084 if (addr->dstate == VIO_DESC_FREE) { 7085 addr->dstate = VIO_DESC_READY; 7086 *priv_p = addr; 7087 *idx = dringp->end_idx; 7088 dringp->end_idx = (dringp->end_idx + 1) % num; 7089 ret = 0; 7090 7091 } 7092 mutex_exit(&addr->dstate_lock); 7093 7094 /* ring full */ 7095 if (ret == 1) { 7096 D2(NULL, "%s: no desc free: started at %d", __func__, 7097 dringp->end_idx); 7098 } 7099 7100 D1(NULL, "%s: exit\n", __func__); 7101 7102 return (ret); 7103 } 7104 7105 /* 7106 * Map from a dring identifier to the ring itself. Returns 7107 * pointer to ring or NULL if no match found. 7108 */ 7109 static dring_info_t * 7110 vsw_ident2dring(lane_t *lane, uint64_t ident) 7111 { 7112 dring_info_t *dp = NULL; 7113 7114 if ((dp = lane->dringp) == NULL) { 7115 return (NULL); 7116 } else { 7120 while (dp != NULL) { 7121 if (dp->ident == ident) 7122 break; 7123 dp = dp->next; 7124 } 7125 } 7126 7127 return (dp); 7128 } 7129 7130 /* 7131 * Set the default lane attributes. These are copied into 7132 * the attr msg we send to our peer. If they are not acceptable 7133 * then (currently) the handshake ends. 7134 */ 7135 static void 7136 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp) 7137 { 7138 bzero(lp, sizeof (lane_t)); 7139 7140 READ_ENTER(&vswp->if_lockrw); 7141 ether_copy(&(vswp->if_addr), &(lp->addr)); 7142 RW_EXIT(&vswp->if_lockrw); 7143 7144 lp->mtu = VSW_MTU; 7145 lp->addr_type = ADDR_TYPE_MAC; 7146 lp->xfer_mode = VIO_DRING_MODE; 7147 lp->ack_freq = 0; /* for shared mode */ 7148 7149 mutex_enter(&lp->seq_lock); 7150 lp->seq_num = VNET_ISS; 7151 mutex_exit(&lp->seq_lock); 7152 } 7153 7154 /* 7155 * Verify that the attributes are acceptable. 7156 * 7157 * FUTURE: If some attributes are not acceptable, change them 7158 * to our desired values. 7159 */ 7160 static int 7161 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port) 7162 { 7163 int ret = 0; 7164 7165 D1(NULL, "vsw_check_attr enter\n"); 7166 7167 /* 7168 * Note we currently only support in-band descriptors 7169 * and descriptor rings, not packet based transfer (VIO_PKT_MODE) 7170 */ 7171 if ((pkt->xfer_mode != VIO_DESC_MODE) && 7172 (pkt->xfer_mode != VIO_DRING_MODE)) { 7173 D2(NULL, "vsw_check_attr: unknown mode %x\n", 7174 pkt->xfer_mode); 7175 ret = 1; 7176 } 7177 7178 /* Only support MAC addresses at moment. */ 7179 if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) { 7180 D2(NULL, "vsw_check_attr: invalid addr_type %x, " 7181 "or address 0x%llx\n", pkt->addr_type, 7182 pkt->addr); 7183 ret = 1; 7184 } 7185 7186 /* 7187 * MAC address supplied by device should match that stored 7188 * in the vsw-port OBP node. Need to decide what to do if they 7189 * don't match, for the moment just warn but don't fail.
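 *
 * (Editorial example: given the checks in this routine, an attribute
 * message that passes looks like
 *
 *	pkt->xfer_mode == VIO_DRING_MODE (or VIO_DESC_MODE)
 *	pkt->addr_type == ADDR_TYPE_MAC, pkt->addr != 0
 *	pkt->ack_freq  == 0 whenever xfer_mode is VIO_DRING_MODE
 *	0 < pkt->mtu  <= VSW_MTU
 *
 * which matches what vsw_set_lane_attr() above advertises.)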
7190 */ 7191 if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) { 7192 DERR(NULL, "vsw_check_attr: device supplied address " 7193 "0x%llx doesn't match node address 0x%llx\n", 7194 pkt->addr, port->p_macaddr); 7195 } 7196 7197 /* 7198 * Ack freq only makes sense in pkt mode, in shared 7199 * mode the ring descriptors say whether or not to 7200 * send back an ACK. 7201 */ 7202 if ((pkt->xfer_mode == VIO_DRING_MODE) && 7203 (pkt->ack_freq > 0)) { 7204 D2(NULL, "vsw_check_attr: non zero ack freq " 7205 " in SHM mode\n"); 7206 ret = 1; 7207 } 7208 7209 /* 7210 * Note: for the moment we only support ETHER 7211 * frames. This may change in the future. 7212 */ 7213 if ((pkt->mtu > VSW_MTU) || (pkt->mtu == 0)) { 7214 D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n", 7215 pkt->mtu); 7216 ret = 1; 7217 } 7218 7219 D1(NULL, "vsw_check_attr exit\n"); 7220 7221 return (ret); 7222 } 7223 7224 /* 7225 * Returns 1 if there is a problem, 0 otherwise. 7226 */ 7227 static int 7228 vsw_check_dring_info(vio_dring_reg_msg_t *pkt) 7229 { 7232 int ret = 0; 7233 7234 D1(NULL, "vsw_check_dring_info enter\n"); 7235 7236 if ((pkt->num_descriptors == 0) || 7237 (pkt->descriptor_size == 0) || 7238 (pkt->ncookies != 1)) { 7239 DERR(NULL, "vsw_check_dring_info: invalid dring msg"); 7240 ret = 1; 7241 } 7242 7243 D1(NULL, "vsw_check_dring_info exit\n"); 7244 7245 return (ret); 7246 } 7247 7248 /* 7249 * Returns 1 if two memory cookies match. Otherwise returns 0. 7250 */ 7251 static int 7252 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2) 7253 { 7254 if ((m1->addr != m2->addr) || 7255 (m1->size != m2->size)) { 7256 return (0); 7257 } else { 7258 return (1); 7259 } 7260 } 7261 7262 /* 7263 * Returns 1 if ring described in reg message matches that 7264 * described by dring_info structure. Otherwise returns 0. 7265 */ 7266 static int 7267 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg) 7268 { 7269 if ((msg->descriptor_size != dp->descriptor_size) || 7270 (msg->num_descriptors != dp->num_descriptors) || 7271 (msg->ncookies != dp->ncookies) || 7272 !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) { 7273 return (0); 7274 } else { 7275 return (1); 7276 } 7277 7278 } 7279 7280 static caddr_t 7281 vsw_print_ethaddr(uint8_t *a, char *ebuf) 7282 { 7283 (void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x", 7284 a[0], a[1], a[2], a[3], a[4], a[5]); 7285 return (ebuf); 7286 } 7287 7288 /* 7289 * Reset and free all the resources associated with 7290 * the channel.
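 *
 * (Editorial note on the asymmetry below: INBOUND drings were mapped
 * in from the peer, so they are only unmapped (ldc_mem_dring_unmap)
 * and freed; OUTBOUND drings were created and exported locally, so
 * vsw_free_ring() must unbind and destroy them as well.)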
7291 */ 7292 static void 7293 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir) 7294 { 7295 dring_info_t *dp, *dpp; 7296 lane_t *lp = NULL; 7297 int rv = 0; 7298 7299 ASSERT(ldcp != NULL); 7300 7301 D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id); 7302 7303 if (dir == INBOUND) { 7304 D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane" 7305 " of channel %lld", __func__, ldcp->ldc_id); 7306 lp = &ldcp->lane_in; 7307 } else { 7308 D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane" 7309 " of channel %lld", __func__, ldcp->ldc_id); 7310 lp = &ldcp->lane_out; 7311 } 7312 7313 lp->lstate = VSW_LANE_INACTIV; 7314 mutex_enter(&lp->seq_lock); 7315 lp->seq_num = VNET_ISS; 7316 mutex_exit(&lp->seq_lock); 7317 if (lp->dringp) { 7318 if (dir == INBOUND) { 7319 dp = lp->dringp; 7320 while (dp != NULL) { 7321 dpp = dp->next; 7322 if (dp->handle != NULL) 7323 (void) ldc_mem_dring_unmap(dp->handle); 7324 kmem_free(dp, sizeof (dring_info_t)); 7325 dp = dpp; 7326 } 7327 } else { 7328 /* 7329 * unbind, destroy exported dring, free dring struct 7330 */ 7331 dp = lp->dringp; 7332 rv = vsw_free_ring(dp); 7333 } 7334 if (rv == 0) { 7335 lp->dringp = NULL; 7336 } 7337 } 7338 7339 D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id); 7340 } 7341 7342 /* 7343 * Free ring and all associated resources. 7344 */ 7345 static int 7346 vsw_free_ring(dring_info_t *dp) 7347 { 7348 vsw_private_desc_t *paddr = NULL; 7349 dring_info_t *dpp; 7350 int i, rv = 1; 7351 7352 while (dp != NULL) { 7353 mutex_enter(&dp->dlock); 7354 dpp = dp->next; 7355 if (dp->priv_addr != NULL) { 7356 /* 7357 * First unbind and free the memory handles 7358 * stored in each descriptor within the ring. 7359 */ 7360 for (i = 0; i < VSW_RING_NUM_EL; i++) { 7361 paddr = (vsw_private_desc_t *) 7362 dp->priv_addr + i; 7363 if (paddr->memhandle != NULL) { 7364 if (paddr->bound == 1) { 7365 rv = ldc_mem_unbind_handle( 7366 paddr->memhandle); 7367 7368 if (rv != 0) { 7369 DERR(NULL, "error " 7370 "unbinding handle for " 7371 "ring 0x%llx at pos %d", 7372 dp, i); 7373 mutex_exit(&dp->dlock); 7374 return (rv); 7375 } 7376 paddr->bound = 0; 7377 } 7378 7379 rv = ldc_mem_free_handle( 7380 paddr->memhandle); 7381 if (rv != 0) { 7382 DERR(NULL, "error freeing " 7383 "handle for ring " 7384 "0x%llx at pos %d", 7385 dp, i); 7386 mutex_exit(&dp->dlock); 7387 return (rv); 7388 } 7389 paddr->memhandle = NULL; 7390 } 7391 mutex_destroy(&paddr->dstate_lock); 7392 } 7393 kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t) 7394 * VSW_RING_NUM_EL)); 7395 } 7396 7397 /* 7398 * Now unbind and destroy the ring itself. 
7399 */ 7400 if (dp->handle != NULL) { 7401 (void) ldc_mem_dring_unbind(dp->handle); 7402 (void) ldc_mem_dring_destroy(dp->handle); 7403 } 7404 7405 if (dp->data_addr != NULL) { 7406 kmem_free(dp->data_addr, dp->data_sz); 7407 } 7408 7409 mutex_exit(&dp->dlock); 7410 mutex_destroy(&dp->dlock); 7411 mutex_destroy(&dp->restart_lock); 7412 kmem_free(dp, sizeof (dring_info_t)); 7413 7414 dp = dpp; 7415 } 7416 return (0); 7417 } 7418 7419 /* 7420 * Debugging routines 7421 */ 7422 static void 7423 display_state(void) 7424 { 7425 vsw_t *vswp; 7426 vsw_port_list_t *plist; 7427 vsw_port_t *port; 7428 vsw_ldc_list_t *ldcl; 7429 vsw_ldc_t *ldcp; 7430 7431 cmn_err(CE_NOTE, "***** system state *****"); 7432 7433 for (vswp = vsw_head; vswp; vswp = vswp->next) { 7434 plist = &vswp->plist; 7435 READ_ENTER(&plist->lockrw); 7436 cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n", 7437 vswp->instance, plist->num_ports); 7438 7439 for (port = plist->head; port != NULL; port = port->p_next) { 7440 ldcl = &port->p_ldclist; 7441 cmn_err(CE_CONT, "port %d : %d ldcs attached\n", 7442 port->p_instance, ldcl->num_ldcs); 7443 READ_ENTER(&ldcl->lockrw); 7444 ldcp = ldcl->head; 7445 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 7446 cmn_err(CE_CONT, "chan %lu : dev %d : " 7447 "status %d : phase %u\n", 7448 ldcp->ldc_id, ldcp->dev_class, 7449 ldcp->ldc_status, ldcp->hphase); 7450 cmn_err(CE_CONT, "chan %lu : lsession %lu : " 7451 "psession %lu\n", 7452 ldcp->ldc_id, 7453 ldcp->local_session, 7454 ldcp->peer_session); 7455 7456 cmn_err(CE_CONT, "Inbound lane:\n"); 7457 display_lane(&ldcp->lane_in); 7458 cmn_err(CE_CONT, "Outbound lane:\n"); 7459 display_lane(&ldcp->lane_out); 7460 } 7461 RW_EXIT(&ldcl->lockrw); 7462 } 7463 RW_EXIT(&plist->lockrw); 7464 } 7465 cmn_err(CE_NOTE, "***** system state *****"); 7466 } 7467 7468 static void 7469 display_lane(lane_t *lp) 7470 { 7471 dring_info_t *drp; 7472 7473 cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n", 7474 lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu); 7475 cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n", 7476 lp->addr_type, lp->addr, lp->xfer_mode); 7477 cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp); 7478 7479 cmn_err(CE_CONT, "Dring info:\n"); 7480 for (drp = lp->dringp; drp != NULL; drp = drp->next) { 7481 cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n", 7482 drp->num_descriptors, drp->descriptor_size); 7483 cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle); 7484 cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n", 7485 (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr); 7486 cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n", 7487 drp->ident, drp->end_idx); 7488 display_ring(drp); 7489 } 7490 } 7491 7492 static void 7493 display_ring(dring_info_t *dringp) 7494 { 7495 uint64_t i; 7496 uint64_t priv_count = 0; 7497 uint64_t pub_count = 0; 7498 vnet_public_desc_t *pub_addr = NULL; 7499 vsw_private_desc_t *priv_addr = NULL; 7500 7501 for (i = 0; i < VSW_RING_NUM_EL; i++) { 7502 if (dringp->pub_addr != NULL) { 7503 pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i; 7504 7505 if (pub_addr->hdr.dstate == VIO_DESC_FREE) 7506 pub_count++; 7507 } 7508 7509 if (dringp->priv_addr != NULL) { 7510 priv_addr = 7511 (vsw_private_desc_t *)dringp->priv_addr + i; 7512 7513 if (priv_addr->dstate == VIO_DESC_FREE) 7514 priv_count++; 7515 } 7516 } 7517 cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n", 7518 i, priv_count, pub_count); 7519 } 7520 7521 static void 7522 dump_flags(uint64_t state) 7523 { 7524 int i; 7525 7526 
typedef struct flag_name { 7527 int flag_val; 7528 char *flag_name; 7529 } flag_name_t; 7530 7531 flag_name_t flags[] = { 7532 VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT", 7533 VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV", 7534 VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV", 7535 VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT", 7536 VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV", 7537 VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT", 7538 VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT", 7539 VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV", 7540 VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT", 7541 VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV", 7542 VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT", 7543 VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV", 7544 VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT", 7545 VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV", 7546 VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT", 7547 VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV", 7548 VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT", 7549 VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV", 7550 VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT", 7551 VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV", 7552 VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT", 7553 VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV", 7554 VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT", 7555 VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV", 7556 VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT", 7557 VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV", 7558 VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT", 7559 VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV", 7560 VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT", 7561 VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV", 7562 VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"}; 7563 7564 DERR(NULL, "DUMP_FLAGS: %llx\n", state); 7565 for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) { 7566 if (state & flags[i].flag_val) 7567 DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name); 7568 } 7569 } 7570
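
/*
 * Editorial example (not part of the original driver): dump_flags()
 * decodes a lane's lstate bitmask into the handshake milestones listed
 * above. A hypothetical state captured mid-handshake, e.g.
 *
 *	dump_flags(VSW_VER_INFO_SENT | VSW_VER_ACK_RECV |
 *	    VSW_ATTR_INFO_SENT);
 *
 * would log VSW_VER_INFO_SENT, VSW_VER_ACK_RECV and VSW_ATTR_INFO_SENT,
 * i.e. version negotiation complete and our attribute info still
 * outstanding.
 */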