/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strlog.h>
#include <sys/strsubr.h>
#include <sys/cmn_err.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/kstat.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <net/if.h>
#include <sys/varargs.h>
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/mac.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
#include <sys/vsw.h>
#include <sys/vio_mailbox.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/vio_util.h>
#include <sys/sdt.h>

/*
 * Function prototypes.
 */
static	int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
static	int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
static	int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static	int vsw_get_md_physname(vsw_t *, md_t *, mde_cookie_t, char *);
static	int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *, int *);
static	int vsw_get_physaddr(vsw_t *);
static	int vsw_setup_switching(vsw_t *);
static	int vsw_setup_layer2(vsw_t *);
static	int vsw_setup_layer3(vsw_t *);

/* MAC Ring table functions. */
static void vsw_mac_ring_tbl_init(vsw_t *vswp);
static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
static void vsw_queue_worker(vsw_mac_ring_t *rrp);
static void vsw_queue_stop(vsw_queue_t *vqp);
static vsw_queue_t *vsw_queue_create(void);
static void vsw_queue_destroy(vsw_queue_t *vqp);

/* MAC layer routines */
static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
	mac_resource_t *mrp);
static	int vsw_get_hw_maddr(vsw_t *);
static	int vsw_set_hw(vsw_t *, vsw_port_t *);
static	int vsw_set_hw_promisc(vsw_t *, vsw_port_t *);
static	int vsw_unset_hw(vsw_t *, vsw_port_t *);
static	int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *);
static	int vsw_reconfig_hw(vsw_t *);
static int vsw_mac_attach(vsw_t *vswp);
static void vsw_mac_detach(vsw_t *vswp);

static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
static int vsw_mac_register(vsw_t *);
static int vsw_mac_unregister(vsw_t *);
static int vsw_m_stat(void *, uint_t, uint64_t *);
static void vsw_m_stop(void *arg);
static int vsw_m_start(void *arg);
static int vsw_m_unicst(void *arg, const uint8_t *);
static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
static int vsw_m_promisc(void *arg, boolean_t);
static mblk_t *vsw_m_tx(void *arg, mblk_t *);

/* MDEG routines */
static	int vsw_mdeg_register(vsw_t *vswp);
static	void vsw_mdeg_unregister(vsw_t *vswp);
static	int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
static	int vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *);
static	void vsw_get_initial_md_properties(vsw_t *vswp, md_t *, mde_cookie_t);
static	void vsw_update_md_prop(vsw_t *, md_t *, mde_cookie_t);

/* Port add/deletion routines */
static	int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
static	int vsw_port_attach(vsw_t *vswp, int p_instance,
	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
static	int vsw_detach_ports(vsw_t *vswp);
static	int vsw_port_detach(vsw_t *vswp, int p_instance);
static	int vsw_port_delete(vsw_port_t *port);
static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
static	int vsw_init_ldcs(vsw_port_t *port);
static	int vsw_uninit_ldcs(vsw_port_t *port);
static	int vsw_ldc_init(vsw_ldc_t *ldcp);
static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
static	int vsw_drain_ldcs(vsw_port_t *port);
static	int vsw_drain_port_taskq(vsw_port_t *port);
static	void vsw_marker_task(void *);
static	vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);

/* Interrupt routines */
static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);

/* Handshake routines */
static	void vsw_ldc_reinit(vsw_ldc_t *);
static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
static	void vsw_conn_task(void *);
static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
static	void vsw_next_milestone(vsw_ldc_t *);
static	int vsw_supported_version(vio_ver_msg_t *);

/* Data processing routines */
static void vsw_process_pkt(void *);
static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_ctrl_pkt(void *);
static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);

/* Switching/data transmit routines */
static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
	vsw_port_t *port, mac_resource_handle_t);
static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
	vsw_port_t *port, mac_resource_handle_t);
static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
	vsw_port_t *port);
static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
	vsw_port_t *port);
static	int vsw_portsend(vsw_port_t *, mblk_t *);
static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);

/* Packet creation routines */
static void vsw_send_ver(void *);
static void vsw_send_attr(vsw_ldc_t *);
static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
static void vsw_send_dring_info(vsw_ldc_t *);
static void vsw_send_rdx(vsw_ldc_t *);

static int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);

/* Forwarding database (FDB) routines */
static	int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
static	int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
static	vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
static	int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
static	int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
static	int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
static	void vsw_del_addr(uint8_t, void *, uint64_t);
static	void vsw_del_mcst_port(vsw_port_t *);
static	void vsw_del_mcst_vsw(vsw_t *);

/* Dring routines */
static dring_info_t *vsw_create_dring(vsw_ldc_t *);
static void vsw_create_privring(vsw_ldc_t *);
static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
	int *);
static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);

static void vsw_set_lane_attr(vsw_t *, lane_t *);
static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
static int vsw_check_dring_info(vio_dring_reg_msg_t *);

/* Misc support routines */
static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
static int vsw_free_ring(dring_info_t *);

/* Debugging routines */
static void dump_flags(uint64_t);
static void display_state(void);
static void display_lane(lane_t *);
static void display_ring(dring_info_t *);
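
/*
 * Tunables. Like vsw_multi_ring_enable below, these can be overridden
 * at boot via /etc/system, e.g. "set vsw:vsw_wretries = 200" (an
 * illustrative value, not a recommendation).
 */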
int	vsw_num_handshakes = VNET_NUM_HANDSHAKES; /* # of handshake attempts */
int	vsw_wretries = 100;		/* # of write attempts */
int	vsw_chain_len = 150;		/* max # of mblks in msg chain */
int	vsw_desc_delay = 0;		/* delay in us */
int	vsw_read_attempts = 5;		/* # of reads of descriptor */

uint32_t	vsw_mblk_size = VSW_MBLK_SIZE;
uint32_t	vsw_num_mblks = VSW_NUM_MBLKS;

static	mac_callbacks_t	vsw_m_callbacks = {
	0,
	vsw_m_stat,
	vsw_m_start,
	vsw_m_stop,
	vsw_m_promisc,
	vsw_m_multicst,
	vsw_m_unicst,
	vsw_m_tx,
	NULL,
	NULL,
	NULL
};

static	struct	cb_ops	vsw_cb_ops = {
	nulldev,			/* cb_open */
	nulldev,			/* cb_close */
	nodev,				/* cb_strategy */
	nodev,				/* cb_print */
	nodev,				/* cb_dump */
	nodev,				/* cb_read */
	nodev,				/* cb_write */
	nodev,				/* cb_ioctl */
	nodev,				/* cb_devmap */
	nodev,				/* cb_mmap */
	nodev,				/* cb_segmap */
	nochpoll,			/* cb_chpoll */
	ddi_prop_op,			/* cb_prop_op */
	NULL,				/* cb_stream */
	D_MP,				/* cb_flag */
	CB_REV,				/* rev */
	nodev,				/* int (*cb_aread)() */
	nodev				/* int (*cb_awrite)() */
};

static	struct	dev_ops	vsw_ops = {
	DEVO_REV,			/* devo_rev */
	0,				/* devo_refcnt */
	vsw_getinfo,			/* devo_getinfo */
	nulldev,			/* devo_identify */
	nulldev,			/* devo_probe */
	vsw_attach,			/* devo_attach */
	vsw_detach,			/* devo_detach */
	nodev,				/* devo_reset */
	&vsw_cb_ops,			/* devo_cb_ops */
	(struct bus_ops *)NULL,		/* devo_bus_ops */
	ddi_power			/* devo_power */
};

extern	struct	mod_ops	mod_driverops;
static struct modldrv vswmodldrv = {
	&mod_driverops,
	"sun4v Virtual Switch %I%",
	&vsw_ops,
};

#define	LDC_ENTER_LOCK(ldcp)	\
			mutex_enter(&((ldcp)->ldc_cblock));\
			mutex_enter(&((ldcp)->ldc_txlock));
#define	LDC_EXIT_LOCK(ldcp)	\
			mutex_exit(&((ldcp)->ldc_txlock));\
			mutex_exit(&((ldcp)->ldc_cblock));

/* Driver soft state ptr */
static void	*vsw_state;

/*
 * Linked list of "vsw_t" structures - one per instance.
 */
vsw_t		*vsw_head = NULL;
krwlock_t	vsw_rw;

/*
 * Property names
 */
static char vdev_propname[] = "virtual-device";
static char vsw_propname[] = "virtual-network-switch";
static char physdev_propname[] = "vsw-phys-dev";
static char smode_propname[] = "vsw-switch-mode";
static char macaddr_propname[] = "local-mac-address";
static char remaddr_propname[] = "remote-mac-address";
static char ldcids_propname[] = "ldc-ids";
static char chan_propname[] = "channel-endpoint";
static char id_propname[] = "id";
static char reg_propname[] = "reg";

/* supported versions */
static	ver_sup_t	vsw_versions[] = { {1, 0} };

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t vport_prop_match[] = {
	{ MDET_PROP_VAL,	"id" },
	{ MDET_LIST_END,	NULL }
};

static mdeg_node_match_t vport_match = { "virtual-device-port",
					vport_prop_match };

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device' nodes (i.e. vsw nodes) identified
 * by their 'name' and 'cfg-handle' properties.
 */
static md_prop_match_t vdev_prop_match[] = {
	{ MDET_PROP_STR,	"name" },
	{ MDET_PROP_VAL,	"cfg-handle" },
	{ MDET_LIST_END,	NULL }
};

static mdeg_node_match_t vdev_match = { "virtual-device",
					vdev_prop_match };

/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vsw instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t vsw_prop_template[] = {
	{ MDET_PROP_STR,	"name",		vsw_propname },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL,		NULL }
};

#define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);

/*
 * From /etc/system enable/disable thread per ring. This is a mode
 * selection that is done at vsw driver attach time.
 */
boolean_t vsw_multi_ring_enable = B_FALSE;
int vsw_mac_rx_rings = VSW_MAC_RX_RINGS;

/*
 * Print debug messages - set to 0x1f to enable all msgs
 * or 0x0 to turn all off.
 */
int vswdbg = 0x0;

/*
 * debug levels:
 * 0x01:	Function entry/exit tracing
 * 0x02:	Internal function messages
 * 0x04:	Verbose internal messages
 * 0x08:	Warning messages
 * 0x10:	Error messages
 */

static void
vswdebug(vsw_t *vswp, const char *fmt, ...)
{
	char buf[512];
	va_list ap;

	va_start(ap, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
	va_end(ap);

	if (vswp == NULL)
		cmn_err(CE_CONT, "%s\n", buf);
	else
		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
}

/*
 * For the moment the state dump routines have their own
 * private flag.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

#define	DUMP_TAG(tag) \
{	\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype); \
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env); \
}

#define	DUMP_TAG_PTR(tag) \
{	\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype); \
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env); \
}

#define	DUMP_FLAGS(flags) dump_flags(flags);
#define	DISPLAY_STATE()	display_state()

#else

#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */

#ifdef DEBUG

#define	D1		\
if (vswdbg & 0x01)	\
	vswdebug

#define	D2		\
if (vswdbg & 0x02)	\
	vswdebug

#define	D3		\
if (vswdbg & 0x04)	\
	vswdebug

#define	DWARN		\
if (vswdbg & 0x08)	\
	vswdebug

#define	DERR		\
if (vswdbg & 0x10)	\
	vswdebug

#else

#define	DERR	if (0)	vswdebug
#define	DWARN	if (0)	vswdebug
#define	D1	if (0)	vswdebug
#define	D2	if (0)	vswdebug
#define	D3	if (0)	vswdebug

#endif	/* DEBUG */

static struct modlinkage modlinkage = {
	MODREV_1,
	&vswmodldrv,
	NULL
};

int
_init(void)
{
	int status;

	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);

	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
	if (status != 0) {
		return (status);
	}

	mac_init_ops(&vsw_ops, "vsw");
	status = mod_install(&modlinkage);
	if (status != 0) {
		mac_fini_ops(&vsw_ops);
		ddi_soft_state_fini(&vsw_state);
	}
	return (status);
}

int
_fini(void)
{
	int status;

	status = mod_remove(&modlinkage);
	if (status != 0)
		return (status);
	mac_fini_ops(&vsw_ops);
	ddi_soft_state_fini(&vsw_state);

	rw_destroy(&vsw_rw);

	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
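
/*
 * vsw_attach: allocate the per-instance soft state, create the unicast
 * and multicast forwarding databases, the port list and the control
 * message taskq, and finally register with the MDEG for MD updates.
 * The 'progress' bitmask records each completed step so the failure
 * path can unwind exactly what was set up.
 */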
static int
vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	vsw_t		*vswp;
	int		instance;
	char		hashname[MAXNAMELEN];
	char		qname[TASKQ_NAMELEN];
	enum		{ PROG_init = 0x00,
			    PROG_if_lock = 0x01,
			    PROG_fdb = 0x02,
			    PROG_mfdb = 0x04,
			    PROG_report_dev = 0x08,
			    PROG_plist = 0x10,
			    PROG_taskq = 0x20}
			progress;

	progress = PROG_init;

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	case DDI_PM_RESUME:
	default:
		return (DDI_FAILURE);
	}

	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
		return (DDI_FAILURE);
	}
	vswp = ddi_get_soft_state(vsw_state, instance);

	if (vswp == NULL) {
		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
		goto vsw_attach_fail;
	}

	vswp->dip = dip;
	vswp->instance = instance;
	ddi_set_driver_private(dip, (caddr_t)vswp);

	mutex_init(&vswp->mac_lock, NULL, MUTEX_DRIVER, NULL);
	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
	progress |= PROG_if_lock;

	/* setup the unicast forwarding database */
	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
	    vswp->instance);
	D2(vswp, "creating unicast hash table (%s)...", hashname);
	vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
	    mod_hash_null_valdtor, sizeof (void *));

	progress |= PROG_fdb;

	/* setup the multicast forwarding database */
	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
	    vswp->instance);
	D2(vswp, "creating multicast hash table (%s)...", hashname);
	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
	vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
	    mod_hash_null_valdtor, sizeof (void *));

	progress |= PROG_mfdb;

	/*
	 * create lock protecting list of multicast addresses
	 * which could come via m_multicst() entry point when plumbed.
	 */
	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
	vswp->mcap = NULL;

	ddi_report_dev(vswp->dip);

	progress |= PROG_report_dev;

	WRITE_ENTER(&vsw_rw);
	vswp->next = vsw_head;
	vsw_head = vswp;
	RW_EXIT(&vsw_rw);

	/* setup the port list */
	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
	vswp->plist.head = NULL;

	progress |= PROG_plist;

	/*
	 * Create the taskq which will process all the VIO
	 * control messages.
	 */
	(void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
	    TASKQ_DEFAULTPRI, 0)) == NULL) {
		cmn_err(CE_WARN, "!vsw%d: Unable to create task queue",
		    vswp->instance);
		goto vsw_attach_fail;
	}

	progress |= PROG_taskq;

	/* prevent auto-detaching */
	if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
	    DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "!Unable to set \"%s\" property for "
		    "instance %u", DDI_NO_AUTODETACH, instance);
	}

	/*
	 * Now we have everything setup, register an interest in
	 * specific MD nodes.
	 *
	 * The callback is invoked in 2 cases, firstly if upon mdeg
	 * registration there are existing nodes which match our specified
	 * criteria, and secondly if the MD is changed (and again, there
	 * are nodes which we are interested in present within it. Note
	 * that our callback will be invoked even if our specified nodes
	 * have not actually changed).
	 *
	 * Until the callback is invoked we cannot switch any pkts as
	 * we don't know basic information such as what mode we are
	 * operating in. However we expect the callback to be invoked
	 * immediately upon registration as this driver should only
	 * be attaching if there are vsw nodes in the MD.
	 */
	if (vsw_mdeg_register(vswp))
		goto vsw_attach_fail;

	return (DDI_SUCCESS);

vsw_attach_fail:
	DERR(NULL, "vsw_attach: failed");

	if (progress & PROG_taskq)
		ddi_taskq_destroy(vswp->taskq_p);

	if (progress & PROG_plist)
		rw_destroy(&vswp->plist.lockrw);

	if (progress & PROG_report_dev) {
		ddi_remove_minor_node(dip, NULL);
		mutex_destroy(&vswp->mca_lock);
	}

	if (progress & PROG_mfdb) {
		mod_hash_destroy_hash(vswp->mfdb);
		vswp->mfdb = NULL;
		rw_destroy(&vswp->mfdbrw);
	}

	if (progress & PROG_fdb) {
		mod_hash_destroy_hash(vswp->fdb);
		vswp->fdb = NULL;
	}

	if (progress & PROG_if_lock) {
		rw_destroy(&vswp->if_lockrw);
		mutex_destroy(&vswp->mac_lock);
	}

	ddi_soft_state_free(vsw_state, instance);
	return (DDI_FAILURE);
}

static int
vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	vio_mblk_pool_t		*poolp, *npoolp;
	vsw_t			**vswpp, *vswp;
	int			instance;

	instance = ddi_get_instance(dip);
	vswp = ddi_get_soft_state(vsw_state, instance);

	if (vswp == NULL) {
		return (DDI_FAILURE);
	}

	switch (cmd) {
	case DDI_DETACH:
		break;
	case DDI_SUSPEND:
	case DDI_PM_SUSPEND:
	default:
		return (DDI_FAILURE);
	}

	D2(vswp, "detaching instance %d", instance);

	if (vswp->if_state & VSW_IF_REG) {
		if (vsw_mac_unregister(vswp) != 0) {
			cmn_err(CE_WARN, "!vsw%d: Unable to detach from "
			    "MAC layer", vswp->instance);
			return (DDI_FAILURE);
		}
	}

	vsw_mdeg_unregister(vswp);

	/* remove mac layer callback */
	mutex_enter(&vswp->mac_lock);
	if ((vswp->mh != NULL) && (vswp->mrh != NULL)) {
		mac_rx_remove(vswp->mh, vswp->mrh);
		vswp->mrh = NULL;
	}
	mutex_exit(&vswp->mac_lock);

	if (vsw_detach_ports(vswp) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to detach ports",
		    vswp->instance);
		return (DDI_FAILURE);
	}

	rw_destroy(&vswp->if_lockrw);

	/*
	 * Now that the ports have been deleted, stop and close
	 * the physical device.
	 */
	mutex_enter(&vswp->mac_lock);
	if (vswp->mh != NULL) {
		if (vswp->mstarted)
			mac_stop(vswp->mh);
		if (vswp->mresources)
			mac_resource_set(vswp->mh, NULL, NULL);
		mac_close(vswp->mh);

		vswp->mh = NULL;
		vswp->txinfo = NULL;
	}
	mutex_exit(&vswp->mac_lock);
	mutex_destroy(&vswp->mac_lock);

	/*
	 * Destroy any free pools that may still exist.
	 */
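	/*
	 * Note: if vio_destroy_mblks() fails (presumably because some of
	 * the pool's mblks are still in use), the pool is put back at the
	 * head of the list and the detach itself fails.
	 */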
	poolp = vswp->rxh;
	while (poolp != NULL) {
		npoolp = vswp->rxh = poolp->nextp;
		if (vio_destroy_mblks(poolp) != 0) {
			vswp->rxh = poolp;
			return (DDI_FAILURE);
		}
		poolp = npoolp;
	}

	/*
	 * Remove this instance from any entries it may be on in
	 * the hash table by using the list of addresses maintained
	 * in the vsw_t structure.
	 */
	vsw_del_mcst_vsw(vswp);

	vswp->mcap = NULL;
	mutex_destroy(&vswp->mca_lock);

	/*
	 * By now any pending tasks have finished and the underlying
	 * ldc's have been destroyed, so it's safe to delete the control
	 * message taskq.
	 */
	if (vswp->taskq_p != NULL)
		ddi_taskq_destroy(vswp->taskq_p);

	/*
	 * At this stage all the data pointers in the hash table
	 * should be NULL, as all the ports have been removed and will
	 * have deleted themselves from the port lists which the data
	 * pointers point to. Hence we can destroy the table using the
	 * default destructors.
	 */
	D2(vswp, "vsw_detach: destroying hash tables..");
	mod_hash_destroy_hash(vswp->fdb);
	vswp->fdb = NULL;

	WRITE_ENTER(&vswp->mfdbrw);
	mod_hash_destroy_hash(vswp->mfdb);
	vswp->mfdb = NULL;
	RW_EXIT(&vswp->mfdbrw);
	rw_destroy(&vswp->mfdbrw);

	ddi_remove_minor_node(dip, NULL);

	rw_destroy(&vswp->plist.lockrw);
	WRITE_ENTER(&vsw_rw);
	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
		if (*vswpp == vswp) {
			*vswpp = vswp->next;
			break;
		}
	}
	RW_EXIT(&vsw_rw);
	ddi_soft_state_free(vsw_state, instance);

	return (DDI_SUCCESS);
}

static int
vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	_NOTE(ARGUNUSED(dip))

	vsw_t	*vswp = NULL;
	dev_t	dev = (dev_t)arg;
	int	instance;

	instance = getminor(dev);

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
			*result = NULL;
			return (DDI_FAILURE);
		}
		*result = vswp->dip;
		return (DDI_SUCCESS);

	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)(uintptr_t)instance;
		return (DDI_SUCCESS);

	default:
		*result = NULL;
		return (DDI_FAILURE);
	}
}

/*
 * Get the value of the "vsw-phys-dev" property in the specified
 * node. This property is the name of the physical device that
 * the virtual switch will use to talk to the outside world.
 *
 * Note it is valid for this property to be NULL (but the property
 * itself must exist). Callers of this routine should verify that
 * the value returned is what they expected (i.e. either NULL or non-NULL).
 *
 * On success returns value of the property in region pointed to by
 * the 'name' argument, and with return value of 0. Otherwise returns 1.
 */
static int
vsw_get_md_physname(vsw_t *vswp, md_t *mdp, mde_cookie_t node, char *name)
{
	int	len = 0;
	char	*physname = NULL;
	char	*dev;

	if (md_get_prop_data(mdp, node, physdev_propname,
	    (uint8_t **)(&physname), &len) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to get name(s) of physical "
		    "device(s) from MD", vswp->instance);
		return (1);
	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
		cmn_err(CE_WARN, "!vsw%d: %s is too long a device name",
		    vswp->instance, physname);
		return (1);
	} else {
		(void) strncpy(name, physname, strlen(physname) + 1);
		D2(vswp, "%s: using first device specified (%s)",
		    __func__, physname);
	}

#ifdef DEBUG
	/*
	 * As a temporary measure to aid testing we check to see if there
	 * is a vsw.conf file present. If there is we use the value of the
	 * vsw_physname property in the file as the name of the physical
	 * device, overriding the value from the MD.
	 *
	 * There may be multiple devices listed, but for the moment
	 * we just use the first one.
	 */
	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
	    "vsw_physname", &dev) == DDI_PROP_SUCCESS) {
		if ((strlen(dev) + 1) > LIFNAMSIZ) {
			cmn_err(CE_WARN, "vsw%d: %s is too long a device name",
			    vswp->instance, dev);
			ddi_prop_free(dev);
			return (1);
		} else {
			cmn_err(CE_NOTE, "vsw%d: Using device name (%s) from "
			    "config file", vswp->instance, dev);

			(void) strncpy(name, dev, strlen(dev) + 1);
		}

		ddi_prop_free(dev);
	}
#endif

	return (0);
}

/*
 * Read the 'vsw-switch-mode' property from the specified MD node.
 *
 * Returns 0 on success and the number of modes found in 'found',
 * otherwise returns 1.
 */
static int
vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
	uint8_t *modes, int *found)
{
	int		len = 0;
	int		smode_num = 0;
	char		*smode = NULL;
	char		*curr_mode = NULL;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Get the switch-mode property. The modes are listed in
	 * decreasing order of preference, i.e. preferred mode is
	 * first item in list.
	 */
	len = 0;
	smode_num = 0;
	if (md_get_prop_data(mdp, node, smode_propname,
	    (uint8_t **)(&smode), &len) != 0) {
		/*
		 * Unable to get switch-mode property from MD, nothing
		 * more we can do.
		 */
		cmn_err(CE_WARN, "!vsw%d: Unable to get switch mode property"
		    " from the MD", vswp->instance);
		*found = 0;
		return (1);
	}

	curr_mode = smode;
	/*
	 * Modes of operation:
	 * 'switched'	 - layer 2 switching, underlying HW in
	 *		   programmed mode.
	 * 'promiscuous' - layer 2 switching, underlying HW in
	 *		   promiscuous mode.
	 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
	 *		   in non-promiscuous mode.
	 */
	while ((curr_mode < (smode + len)) && (smode_num < NUM_SMODES)) {
		D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
		if (strcmp(curr_mode, "switched") == 0) {
			modes[smode_num++] = VSW_LAYER2;
		} else if (strcmp(curr_mode, "promiscuous") == 0) {
			modes[smode_num++] = VSW_LAYER2_PROMISC;
		} else if (strcmp(curr_mode, "routed") == 0) {
			modes[smode_num++] = VSW_LAYER3;
		} else {
			cmn_err(CE_WARN, "!vsw%d: Unknown switch mode %s, "
			    "setting to default switched mode",
			    vswp->instance, curr_mode);
			modes[smode_num++] = VSW_LAYER2;
		}
		curr_mode += strlen(curr_mode) + 1;
	}
	*found = smode_num;

	D2(vswp, "%s: %d modes found", __func__, smode_num);

	D1(vswp, "%s: exit", __func__);

	return (0);
}
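
/*
 * For illustration, a vsw MD node might specify (rendered here as a
 * hypothetical list; the real MD encodes the modes as nul-separated
 * strings in a data property):
 *
 *	vsw-switch-mode = "switched", "promiscuous"
 *
 * i.e. attempt programmed layer 2 switching first, and fall back to
 * promiscuous layer 2 switching if the device has no free unicast
 * address slots.
 */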

/*
 * Get the mac address of the physical device.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_get_physaddr(vsw_t *vswp)
{
	mac_handle_t	mh;
	char		drv[LIFNAMSIZ];
	uint_t		ddi_instance;

	D1(vswp, "%s: enter", __func__);

	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS)
		return (1);

	if (mac_open(vswp->physname, ddi_instance, &mh) != 0) {
		cmn_err(CE_WARN, "!vsw%d: mac_open %s failed",
		    vswp->instance, vswp->physname);
		return (1);
	}

	READ_ENTER(&vswp->if_lockrw);
	mac_unicst_get(mh, vswp->if_addr.ether_addr_octet);
	RW_EXIT(&vswp->if_lockrw);

	mac_close(mh);

	vswp->mdprops |= VSW_DEV_MACADDR;

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Check to see if the card supports the setting of multiple unicast
 * addresses.
 *
 * Returns 0 if card supports the programming of multiple unicast addresses
 * and there are free address slots available, otherwise returns 1.
 */
static int
vsw_get_hw_maddr(vsw_t *vswp)
{
	D1(vswp, "%s: enter", __func__);

	mutex_enter(&vswp->mac_lock);
	if (vswp->mh == NULL) {
		mutex_exit(&vswp->mac_lock);
		return (1);
	}

	if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
		DWARN(vswp, "Unable to get capabilities of"
		    " underlying device (%s)", vswp->physname);
		mutex_exit(&vswp->mac_lock);
		return (1);
	}
	mutex_exit(&vswp->mac_lock);

	if (vswp->maddr.maddr_naddrfree == 0) {
		cmn_err(CE_WARN,
		    "!vsw%d: device %s has no free unicast address slots",
		    vswp->instance, vswp->physname);
		return (1);
	}

	D2(vswp, "%s: %d addrs : %d free", __func__,
	    vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Setup the required switching mode.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_setup_switching(vsw_t *vswp)
{
	int	i, rv = 1;

	D1(vswp, "%s: enter", __func__);

	/* select best switching mode */
	for (i = 0; i < vswp->smode_num; i++) {
		vswp->smode_idx = i;
		switch (vswp->smode[i]) {
		case VSW_LAYER2:
		case VSW_LAYER2_PROMISC:
			rv = vsw_setup_layer2(vswp);
			break;

		case VSW_LAYER3:
			rv = vsw_setup_layer3(vswp);
			break;

		default:
			DERR(vswp, "unknown switch mode");
			rv = 1;
			break;
		}

		if (rv == 0)
			break;
	}

	if (rv == 1) {
		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
		    "switching mode", vswp->instance);
		return (rv);
	}

	D2(vswp, "%s: Operating in mode %d", __func__,
	    vswp->smode[vswp->smode_idx]);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Setup for layer 2 switching.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_setup_layer2(vsw_t *vswp)
{
	D1(vswp, "%s: enter", __func__);

	vswp->vsw_switch_frame = vsw_switch_l2_frame;

	/*
	 * Attempt to link into the MAC layer so we can get
	 * and send packets out over the physical adapter.
	 */
	if (vswp->mdprops & VSW_MD_PHYSNAME) {
		if (vsw_mac_attach(vswp) != 0) {
			/*
			 * Registration with the MAC layer has failed,
			 * so return 1 so that we can fall back to the
			 * next preferred switching method.
			 */
			cmn_err(CE_WARN, "!vsw%d: Unable to join as MAC layer "
			    "client", vswp->instance);
			return (1);
		}

		if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
			/*
			 * Verify that underlying device can support multiple
			 * unicast mac addresses, and has free capacity.
			 */
			if (vsw_get_hw_maddr(vswp) != 0) {
				cmn_err(CE_WARN, "!vsw%d: Unable to setup "
				    "switching", vswp->instance);
				vsw_mac_detach(vswp);
				return (1);
			}
		}

	} else {
		/*
		 * No physical device name found in MD which is
		 * required for layer 2.
		 */
		cmn_err(CE_WARN, "!vsw%d: no physical device name specified",
		    vswp->instance);
		return (1);
	}

	D1(vswp, "%s: exit", __func__);

	return (0);
}

static int
vsw_setup_layer3(vsw_t *vswp)
{
	D1(vswp, "%s: enter", __func__);

	D2(vswp, "%s: operating in layer 3 mode", __func__);
	vswp->vsw_switch_frame = vsw_switch_l3_frame;

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Link into the MAC layer to gain access to the services provided by
 * the underlying physical device driver (which should also have
 * registered with the MAC layer).
 *
 * Only when in layer 2 mode.
 */
static int
vsw_mac_attach(vsw_t *vswp)
{
	char	drv[LIFNAMSIZ];
	uint_t	ddi_instance;

	D1(vswp, "%s: enter", __func__);

	ASSERT(vswp->mh == NULL);
	ASSERT(vswp->mrh == NULL);
	ASSERT(vswp->mstarted == B_FALSE);
	ASSERT(vswp->mresources == B_FALSE);

	ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);

	mutex_enter(&vswp->mac_lock);
	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "!vsw%d: invalid device name: %s",
		    vswp->instance, vswp->physname);
		goto mac_fail_exit;
	}

	if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) {
		cmn_err(CE_WARN, "!vsw%d: mac_open %s failed",
		    vswp->instance, vswp->physname);
		goto mac_fail_exit;
	}

	ASSERT(vswp->mh != NULL);

	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);

	if (vsw_multi_ring_enable) {
		/*
		 * Initialize the ring table.
		 */
		vsw_mac_ring_tbl_init(vswp);

		/*
		 * Register our rx callback function.
		 */
		vswp->mrh = mac_rx_add(vswp->mh,
		    vsw_rx_queue_cb, (void *)vswp);
		ASSERT(vswp->mrh != NULL);

		/*
		 * Register our mac resource callback.
		 */
		mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
		vswp->mresources = B_TRUE;

		/*
		 * Get the ring resources available to us from
		 * the mac below us.
		 */
		mac_resources(vswp->mh);
	} else {
		/*
		 * Just register our rx callback function
		 */
		vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
		ASSERT(vswp->mrh != NULL);
	}

	/* Get the MAC tx fn */
	vswp->txinfo = mac_tx_get(vswp->mh);

	/* start the interface */
	if (mac_start(vswp->mh) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Could not start mac interface",
		    vswp->instance);
		goto mac_fail_exit;
	}

	mutex_exit(&vswp->mac_lock);

	vswp->mstarted = B_TRUE;

	D1(vswp, "%s: exit", __func__);
	return (0);

mac_fail_exit:
	mutex_exit(&vswp->mac_lock);
	vsw_mac_detach(vswp);

	D1(vswp, "%s: exit", __func__);
	return (1);
}

static void
vsw_mac_detach(vsw_t *vswp)
{
	D1(vswp, "vsw_mac_detach: enter");

	ASSERT(vswp != NULL);

	if (vsw_multi_ring_enable) {
		vsw_mac_ring_tbl_destroy(vswp);
	}

	mutex_enter(&vswp->mac_lock);

	if (vswp->mh != NULL) {
		if (vswp->mstarted)
			mac_stop(vswp->mh);
		if (vswp->mrh != NULL)
			mac_rx_remove(vswp->mh, vswp->mrh);
		if (vswp->mresources)
			mac_resource_set(vswp->mh, NULL, NULL);
		mac_close(vswp->mh);
	}

	vswp->mrh = NULL;
	vswp->mh = NULL;
	vswp->txinfo = NULL;
	vswp->mstarted = B_FALSE;

	mutex_exit(&vswp->mac_lock);

	D1(vswp, "vsw_mac_detach: exit");
}

/*
 * Depending on the mode specified and the capabilities and capacity
 * of the underlying device, set up the physical device.
 *
 * If in layer 3 mode, then do nothing.
 *
 * If in layer 2 programmed mode attempt to program the unicast address
 * associated with the port into the physical device. If this is not
 * possible due to resource exhaustion or simply because the device does
 * not support multiple unicast addresses then, if required, fall back
 * to putting the card into promisc mode.
 *
 * If in promisc mode then simply set the card into promisc mode.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_set_hw(vsw_t *vswp, vsw_port_t *port)
{
	mac_multi_addr_t	mac_addr;
	void			*mah;
	int			err;

	D1(vswp, "%s: enter", __func__);

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
		return (0);

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
		return (vsw_set_hw_promisc(vswp, port));
	}

	if (vswp->maddr.maddr_handle == NULL)
		return (1);

	mah = vswp->maddr.maddr_handle;

	/*
	 * Attempt to program the unicast address into the HW.
	 */
	mac_addr.mma_addrlen = ETHERADDRL;
	ether_copy(&port->p_macaddr, &mac_addr.mma_addr);

	err = vswp->maddr.maddr_add(mah, &mac_addr);
	if (err != 0) {
		cmn_err(CE_WARN, "!vsw%d: failed to program addr "
		    "%x:%x:%x:%x:%x:%x for port %d into device %s "
		    ": err %d", vswp->instance,
		    port->p_macaddr.ether_addr_octet[0],
		    port->p_macaddr.ether_addr_octet[1],
		    port->p_macaddr.ether_addr_octet[2],
		    port->p_macaddr.ether_addr_octet[3],
		    port->p_macaddr.ether_addr_octet[4],
		    port->p_macaddr.ether_addr_octet[5],
		    port->p_instance, vswp->physname, err);

		/*
		 * Mark that an attempt should be made to re-config
		 * sometime in the future if a port is deleted.
		 */
		vswp->recfg_reqd = B_TRUE;

		/*
		 * Only 1 mode specified, nothing more to do.
		 */
		if (vswp->smode_num == 1)
			return (err);

		/*
		 * If promiscuous was the next mode specified try to
		 * set the card into that mode.
		 */
		if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
		    (vswp->smode[vswp->smode_idx + 1]
		    == VSW_LAYER2_PROMISC)) {
			vswp->smode_idx += 1;
			return (vsw_set_hw_promisc(vswp, port));
		}
		return (err);
	}

	port->addr_slot = mac_addr.mma_slot;
	port->addr_set = VSW_ADDR_HW;

	D2(vswp, "programmed addr %x:%x:%x:%x:%x:%x for port %d "
	    "into slot %d of device %s",
	    port->p_macaddr.ether_addr_octet[0],
	    port->p_macaddr.ether_addr_octet[1],
	    port->p_macaddr.ether_addr_octet[2],
	    port->p_macaddr.ether_addr_octet[3],
	    port->p_macaddr.ether_addr_octet[4],
	    port->p_macaddr.ether_addr_octet[5],
	    port->p_instance, port->addr_slot, vswp->physname);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * If in layer 3 mode do nothing.
 *
 * If in layer 2 switched mode remove the address from the physical
 * device.
 *
 * If in layer 2 promiscuous mode disable promisc mode.
 *
 * Returns 0 on success.
 */
static int
vsw_unset_hw(vsw_t *vswp, vsw_port_t *port)
{
	int		err;
	void		*mah;

	D1(vswp, "%s: enter", __func__);

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
		return (0);

	if (port->addr_set == VSW_ADDR_PROMISC) {
		return (vsw_unset_hw_promisc(vswp, port));
	}

	if (port->addr_set == VSW_ADDR_HW) {
		if (vswp->maddr.maddr_handle == NULL)
			return (1);

		mah = vswp->maddr.maddr_handle;

		err = vswp->maddr.maddr_remove(mah, port->addr_slot);
		if (err != 0) {
			cmn_err(CE_WARN, "!vsw%d: Unable to remove addr "
			    "%x:%x:%x:%x:%x:%x for port %d from device %s"
			    " : (err %d)", vswp->instance,
			    port->p_macaddr.ether_addr_octet[0],
			    port->p_macaddr.ether_addr_octet[1],
			    port->p_macaddr.ether_addr_octet[2],
			    port->p_macaddr.ether_addr_octet[3],
			    port->p_macaddr.ether_addr_octet[4],
			    port->p_macaddr.ether_addr_octet[5],
			    port->p_instance, vswp->physname, err);
			return (err);
		}

		port->addr_set = VSW_ADDR_UNSET;

		D2(vswp, "removed addr %x:%x:%x:%x:%x:%x for "
		    "port %d from device %s",
		    port->p_macaddr.ether_addr_octet[0],
		    port->p_macaddr.ether_addr_octet[1],
		    port->p_macaddr.ether_addr_octet[2],
		    port->p_macaddr.ether_addr_octet[3],
		    port->p_macaddr.ether_addr_octet[4],
		    port->p_macaddr.ether_addr_octet[5],
		    port->p_instance, vswp->physname);
	}

	D1(vswp, "%s: exit", __func__);
	return (0);
}
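
/*
 * The next two routines reference-count promiscuous mode through
 * promisc_cnt: the underlying device is switched into promiscuous
 * mode when the first port requires it and back out when the last
 * such port releases it.
 */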

/*
 * Set network card into promisc mode.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port)
{
	D1(vswp, "%s: enter", __func__);

	mutex_enter(&vswp->mac_lock);
	if (vswp->mh == NULL) {
		mutex_exit(&vswp->mac_lock);
		return (1);
	}

	if (vswp->promisc_cnt++ == 0) {
		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
			vswp->promisc_cnt--;
			mutex_exit(&vswp->mac_lock);
			return (1);
		}
		cmn_err(CE_NOTE, "!vsw%d: switching device %s into "
		    "promiscuous mode", vswp->instance, vswp->physname);
	}
	mutex_exit(&vswp->mac_lock);
	port->addr_set = VSW_ADDR_PROMISC;

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Turn off promiscuous mode on network card.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port)
{
	vsw_port_list_t	*plist = &vswp->plist;

	D2(vswp, "%s: enter", __func__);

	mutex_enter(&vswp->mac_lock);
	if (vswp->mh == NULL) {
		mutex_exit(&vswp->mac_lock);
		return (1);
	}

	ASSERT(port->addr_set == VSW_ADDR_PROMISC);

	if (--vswp->promisc_cnt == 0) {
		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
			vswp->promisc_cnt++;
			mutex_exit(&vswp->mac_lock);
			return (1);
		}

		/*
		 * We are exiting promisc mode either because we had
		 * failed over from switched mode due to HW resource
		 * issues, or because the user wanted the card in promisc
		 * mode for all the ports and the last port is now being
		 * deleted. Tweak the message accordingly.
		 */
		if (plist->num_ports != 0) {
			cmn_err(CE_NOTE, "!vsw%d: switching device %s back to "
			    "programmed mode", vswp->instance,
			    vswp->physname);
		} else {
			cmn_err(CE_NOTE, "!vsw%d: switching device %s out of "
			    "promiscuous mode", vswp->instance,
			    vswp->physname);
		}
	}
	mutex_exit(&vswp->mac_lock);
	port->addr_set = VSW_ADDR_UNSET;

	D1(vswp, "%s: exit", __func__);
	return (0);
}

/*
 * Determine whether or not we are operating in our preferred
 * mode and if not whether the physical resources now allow us
 * to operate in it.
 *
 * Should only be invoked after the port which is being deleted has been
 * removed from the port list.
 */
static int
vsw_reconfig_hw(vsw_t *vswp)
{
	vsw_port_list_t		*plist = &vswp->plist;
	mac_multi_addr_t	mac_addr;
	vsw_port_t		*tp;
	void			*mah;
	int			rv = 0;
	int			s_idx;

	D1(vswp, "%s: enter", __func__);

	if (vswp->maddr.maddr_handle == NULL)
		return (1);

	/*
	 * Check if there are now sufficient HW resources to
	 * attempt a re-config.
	 */
	if (plist->num_ports > vswp->maddr.maddr_naddrfree)
		return (1);

	/*
	 * If we are in layer 2 (i.e. switched) or would like to be
	 * in layer 2 then check if any ports need to be programmed
	 * into the HW.
	 *
	 * This can happen in two cases - switched was specified as
	 * the preferred mode of operation but we exhausted the HW
	 * resources and so failed over to the next specified mode,
	 * or switched was the only mode specified so after HW
	 * resources were exhausted there was nothing more we
	 * could do.
	 */
	if (vswp->smode_idx > 0)
		s_idx = vswp->smode_idx - 1;
	else
		s_idx = vswp->smode_idx;

	if (vswp->smode[s_idx] == VSW_LAYER2) {
		mah = vswp->maddr.maddr_handle;

		D2(vswp, "%s: attempting reconfig..", __func__);

		/*
		 * Scan the port list for any port whose address has not
		 * been programmed in HW - there should be a max of one.
		 */
		for (tp = plist->head; tp != NULL; tp = tp->p_next) {
			if (tp->addr_set != VSW_ADDR_HW) {
				mac_addr.mma_addrlen = ETHERADDRL;
				ether_copy(&tp->p_macaddr, &mac_addr.mma_addr);

				rv = vswp->maddr.maddr_add(mah, &mac_addr);
				if (rv != 0) {
					DWARN(vswp, "Error setting addr in "
					    "HW for port %d err %d",
					    tp->p_instance, rv);
					goto reconfig_err_exit;
				}
				tp->addr_slot = mac_addr.mma_slot;

				D2(vswp, "re-programmed port %d "
				    "addr %x:%x:%x:%x:%x:%x into slot %d"
				    " of device %s", tp->p_instance,
				    tp->p_macaddr.ether_addr_octet[0],
				    tp->p_macaddr.ether_addr_octet[1],
				    tp->p_macaddr.ether_addr_octet[2],
				    tp->p_macaddr.ether_addr_octet[3],
				    tp->p_macaddr.ether_addr_octet[4],
				    tp->p_macaddr.ether_addr_octet[5],
				    tp->addr_slot, vswp->physname);

				/*
				 * If up to now we had to put the card into
				 * promisc mode to see this address, we
				 * can now safely disable promisc mode.
				 */
				if (tp->addr_set == VSW_ADDR_PROMISC)
					(void) vsw_unset_hw_promisc(vswp, tp);

				tp->addr_set = VSW_ADDR_HW;
			}
		}

		/* no further re-config needed */
		vswp->recfg_reqd = B_FALSE;

		vswp->smode_idx = s_idx;

		return (0);
	}

reconfig_err_exit:
	return (rv);
}

static void
vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
{
	ringp->ring_state = VSW_MAC_RING_FREE;
	ringp->ring_arg = NULL;
	ringp->ring_blank = NULL;
	ringp->ring_vqp = NULL;
	ringp->ring_vswp = vswp;
}

static void
vsw_mac_ring_tbl_init(vsw_t *vswp)
{
	int		i;

	mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);

	vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
	vswp->mac_ring_tbl =
	    kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t),
	    KM_SLEEP);

	for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
		vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]);
}

static void
vsw_mac_ring_tbl_destroy(vsw_t *vswp)
{
	int		i;
	vsw_mac_ring_t	*ringp;

	mutex_enter(&vswp->mac_ring_lock);
	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
		ringp = &vswp->mac_ring_tbl[i];

		if (ringp->ring_state != VSW_MAC_RING_FREE) {
			/*
			 * Destroy the queue.
			 */
			vsw_queue_stop(ringp->ring_vqp);
			vsw_queue_destroy(ringp->ring_vqp);

			/*
			 * Re-initialize the structure.
			 */
			vsw_mac_ring_tbl_entry_init(vswp, ringp);
		}
	}
	mutex_exit(&vswp->mac_ring_lock);

	mutex_destroy(&vswp->mac_ring_lock);
	kmem_free(vswp->mac_ring_tbl,
	    vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));
	vswp->mac_ring_tbl_sz = 0;
}

/*
 * Handle resource add callbacks from the driver below.
 */
static mac_resource_handle_t
vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
{
	vsw_t		*vswp = (vsw_t *)arg;
	mac_rx_fifo_t	*mrfp = (mac_rx_fifo_t *)mrp;
	vsw_mac_ring_t	*ringp;
	vsw_queue_t	*vqp;
	int		i;

	ASSERT(vswp != NULL);
	ASSERT(mrp != NULL);
	ASSERT(vswp->mac_ring_tbl != NULL);

	D1(vswp, "%s: enter", __func__);

	/*
	 * Check to make sure we have the correct resource type.
	 */
	if (mrp->mr_type != MAC_RX_FIFO)
		return (NULL);

	/*
	 * Find an open entry in the ring table.
	 */
	mutex_enter(&vswp->mac_ring_lock);
	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
		ringp = &vswp->mac_ring_tbl[i];

		/*
		 * Check for an empty slot, if found, then setup queue
		 * and thread.
		 */
		if (ringp->ring_state == VSW_MAC_RING_FREE) {
			/*
			 * Create the queue for this ring.
			 */
			vqp = vsw_queue_create();

			/*
			 * Initialize the ring data structure.
			 */
			ringp->ring_vqp = vqp;
			ringp->ring_arg = mrfp->mrf_arg;
			ringp->ring_blank = mrfp->mrf_blank;
			ringp->ring_state = VSW_MAC_RING_INUSE;

			/*
			 * Create the worker thread.
			 */
			vqp->vq_worker = thread_create(NULL, 0,
			    vsw_queue_worker, ringp, 0, &p0,
			    TS_RUN, minclsyspri);
			if (vqp->vq_worker == NULL) {
				vsw_queue_destroy(vqp);
				vsw_mac_ring_tbl_entry_init(vswp, ringp);
				ringp = NULL;
			}

			if (ringp != NULL) {
				/*
				 * Make sure the thread gets to running
				 * state for this ring.
				 */
				mutex_enter(&vqp->vq_lock);
				while ((vqp->vq_state != VSW_QUEUE_RUNNING) &&
				    (vqp->vq_state != VSW_QUEUE_DRAINED)) {
					cv_wait(&vqp->vq_cv, &vqp->vq_lock);
				}

				/*
				 * If the thread is not running, cleanup.
				 * Drop the lock before destroying the
				 * queue, since vsw_queue_destroy() frees
				 * the queue along with its lock.
				 */
				if (vqp->vq_state == VSW_QUEUE_DRAINED) {
					mutex_exit(&vqp->vq_lock);
					vsw_queue_destroy(vqp);
					vsw_mac_ring_tbl_entry_init(vswp,
					    ringp);
					ringp = NULL;
				} else {
					mutex_exit(&vqp->vq_lock);
				}
			}

			mutex_exit(&vswp->mac_ring_lock);
			D1(vswp, "%s: exit", __func__);
			return ((mac_resource_handle_t)ringp);
		}
	}
	mutex_exit(&vswp->mac_ring_lock);

	/*
	 * No slots in the ring table available.
	 */
	D1(vswp, "%s: exit", __func__);
	return (NULL);
}
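
/*
 * Queue lifecycle (see vq_state): STOPPED on creation, RUNNING once the
 * worker thread has started, STOP when a halt has been requested, and
 * DRAINED when the worker has finished. vsw_queue_stop() waits for the
 * DRAINED transition before marking the queue STOPPED again.
 */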

static void
vsw_queue_stop(vsw_queue_t *vqp)
{
	mutex_enter(&vqp->vq_lock);

	if (vqp->vq_state == VSW_QUEUE_RUNNING) {
		vqp->vq_state = VSW_QUEUE_STOP;
		cv_signal(&vqp->vq_cv);

		while (vqp->vq_state != VSW_QUEUE_DRAINED)
			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
	}

	vqp->vq_state = VSW_QUEUE_STOPPED;

	mutex_exit(&vqp->vq_lock);
}

static vsw_queue_t *
vsw_queue_create(void)
{
	vsw_queue_t *vqp;

	vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP);

	mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL);
	vqp->vq_first = NULL;
	vqp->vq_last = NULL;
	vqp->vq_state = VSW_QUEUE_STOPPED;

	return (vqp);
}

static void
vsw_queue_destroy(vsw_queue_t *vqp)
{
	cv_destroy(&vqp->vq_cv);
	mutex_destroy(&vqp->vq_lock);
	kmem_free(vqp, sizeof (vsw_queue_t));
}

static void
vsw_queue_worker(vsw_mac_ring_t *rrp)
{
	mblk_t		*mp;
	vsw_queue_t	*vqp = rrp->ring_vqp;
	vsw_t		*vswp = rrp->ring_vswp;

	mutex_enter(&vqp->vq_lock);

	ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED);

	/*
	 * Set the state to running, since the thread is now active.
	 */
	vqp->vq_state = VSW_QUEUE_RUNNING;
	cv_signal(&vqp->vq_cv);

	while (vqp->vq_state == VSW_QUEUE_RUNNING) {
		/*
		 * Wait for work to do or for the state to change
		 * to not running.
		 */
		while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
		    (vqp->vq_first == NULL)) {
			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
		}

		/*
		 * Process packets that we received from the interface.
		 */
		if (vqp->vq_first != NULL) {
			mp = vqp->vq_first;

			vqp->vq_first = NULL;
			vqp->vq_last = NULL;

			mutex_exit(&vqp->vq_lock);

			/* switch the chain of packets received */
			vswp->vsw_switch_frame(vswp, mp,
			    VSW_PHYSDEV, NULL, NULL);

			mutex_enter(&vqp->vq_lock);
		}
	}

	/*
	 * We are drained and signal we are done.
	 */
	vqp->vq_state = VSW_QUEUE_DRAINED;
	cv_signal(&vqp->vq_cv);

	/*
	 * Exit lock and drain the remaining packets.
	 */
	mutex_exit(&vqp->vq_lock);

	/*
	 * Exit the thread
	 */
	thread_exit();
}

/*
 * vsw_rx_queue_cb() - Receive callback routine when
 * vsw_multi_ring_enable is non-zero. Queue the packets
 * to a packet queue for a worker thread to process.
 */
static void
vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
{
	vsw_mac_ring_t	*ringp = (vsw_mac_ring_t *)mrh;
	vsw_t		*vswp = (vsw_t *)arg;
	vsw_queue_t	*vqp;
	mblk_t		*bp, *last;

	ASSERT(mrh != NULL);
	ASSERT(vswp != NULL);
	ASSERT(mp != NULL);

	D1(vswp, "%s: enter", __func__);

	/*
	 * Find the last element in the mblk chain.
	 */
	bp = mp;
	do {
		last = bp;
		bp = bp->b_next;
	} while (bp != NULL);

	/* Get the queue for the packets */
	vqp = ringp->ring_vqp;

	/*
	 * Grab the lock so we can queue the packets.
	 */
	mutex_enter(&vqp->vq_lock);

	if (vqp->vq_state != VSW_QUEUE_RUNNING) {
		freemsg(mp);
		mutex_exit(&vqp->vq_lock);
		goto vsw_rx_queue_cb_exit;
	}

	/*
	 * Add the mblk chain to the queue. If there
	 * are already mblks in the queue, then add the new
	 * chain to the end.
	 */
	if (vqp->vq_first == NULL)
		vqp->vq_first = mp;
	else
		vqp->vq_last->b_next = mp;

	vqp->vq_last = last;

	/*
	 * Signal the worker thread that there is work to
	 * do.
	 */
	cv_signal(&vqp->vq_cv);

	/*
	 * Let go of the lock and exit.
	 */
	mutex_exit(&vqp->vq_lock);

vsw_rx_queue_cb_exit:
	D1(vswp, "%s: exit", __func__);
}

/*
 * Receive callback routine. Invoked by MAC layer when there
 * are pkts being passed up from physical device.
 *
 * PERF: It may be more efficient when the card is in promisc
 * mode to check the dest address of the pkts here (against
 * the FDB) rather than checking later. Needs to be investigated.
 */
static void
vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
{
	_NOTE(ARGUNUSED(mrh))

	vsw_t		*vswp = (vsw_t *)arg;

	ASSERT(vswp != NULL);

	D1(vswp, "vsw_rx_cb: enter");

	/* switch the chain of packets received */
	vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);

	D1(vswp, "vsw_rx_cb: exit");
}

/*
 * Send a message out over the physical device via the MAC layer.
 *
 * Returns any mblks that it was unable to transmit.
 */
static mblk_t *
vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
{
	const mac_txinfo_t	*mtp;
	mblk_t			*nextp;

	mutex_enter(&vswp->mac_lock);
	if (vswp->mh == NULL) {
		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
		mutex_exit(&vswp->mac_lock);
		return (mp);
	} else {
		for (;;) {
			nextp = mp->b_next;
			mp->b_next = NULL;

			mtp = vswp->txinfo;

			if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
				mp->b_next = nextp;
				break;
			}

			if ((mp = nextp) == NULL)
				break;
		}
	}
	mutex_exit(&vswp->mac_lock);

	return (mp);
}
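
/*
 * Registering below makes the vsw instance itself appear as a network
 * interface (so it can be plumbed); packets transmitted on that
 * interface enter the switch through vsw_m_tx() with VSW_LOCALDEV as
 * the caller.
 */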
2061 */ 2062 static int 2063 vsw_mac_register(vsw_t *vswp) 2064 { 2065 mac_register_t *macp; 2066 int rv; 2067 2068 D1(vswp, "%s: enter", __func__); 2069 2070 if ((macp = mac_alloc(MAC_VERSION)) == NULL) 2071 return (EINVAL); 2072 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 2073 macp->m_driver = vswp; 2074 macp->m_dip = vswp->dip; 2075 macp->m_src_addr = (uint8_t *)&vswp->if_addr; 2076 macp->m_callbacks = &vsw_m_callbacks; 2077 macp->m_min_sdu = 0; 2078 macp->m_max_sdu = ETHERMTU; 2079 rv = mac_register(macp, &vswp->if_mh); 2080 mac_free(macp); 2081 if (rv == 0) 2082 vswp->if_state |= VSW_IF_REG; 2083 2084 D1(vswp, "%s: exit", __func__); 2085 2086 return (rv); 2087 } 2088 2089 static int 2090 vsw_mac_unregister(vsw_t *vswp) 2091 { 2092 int rv = 0; 2093 2094 D1(vswp, "%s: enter", __func__); 2095 2096 WRITE_ENTER(&vswp->if_lockrw); 2097 2098 if (vswp->if_state & VSW_IF_REG) { 2099 rv = mac_unregister(vswp->if_mh); 2100 if (rv != 0) { 2101 DWARN(vswp, "%s: unable to unregister from MAC " 2102 "framework", __func__); 2103 2104 RW_EXIT(&vswp->if_lockrw); 2105 D1(vswp, "%s: fail exit", __func__); 2106 return (rv); 2107 } 2108 2109 /* mark i/f as down and unregistered */ 2110 vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG); 2111 } 2112 RW_EXIT(&vswp->if_lockrw); 2113 2114 D1(vswp, "%s: exit", __func__); 2115 2116 return (rv); 2117 } 2118 2119 static int 2120 vsw_m_stat(void *arg, uint_t stat, uint64_t *val) 2121 { 2122 vsw_t *vswp = (vsw_t *)arg; 2123 2124 D1(vswp, "%s: enter", __func__); 2125 2126 mutex_enter(&vswp->mac_lock); 2127 if (vswp->mh == NULL) { 2128 mutex_exit(&vswp->mac_lock); 2129 return (EINVAL); 2130 } 2131 2132 /* return stats from underlying device */ 2133 *val = mac_stat_get(vswp->mh, stat); 2134 2135 mutex_exit(&vswp->mac_lock); 2136 2137 return (0); 2138 } 2139 2140 static void 2141 vsw_m_stop(void *arg) 2142 { 2143 vsw_t *vswp = (vsw_t *)arg; 2144 2145 D1(vswp, "%s: enter", __func__); 2146 2147 WRITE_ENTER(&vswp->if_lockrw); 2148 vswp->if_state &= ~VSW_IF_UP; 2149 RW_EXIT(&vswp->if_lockrw); 2150 2151 D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state); 2152 } 2153 2154 static int 2155 vsw_m_start(void *arg) 2156 { 2157 vsw_t *vswp = (vsw_t *)arg; 2158 2159 D1(vswp, "%s: enter", __func__); 2160 2161 WRITE_ENTER(&vswp->if_lockrw); 2162 vswp->if_state |= VSW_IF_UP; 2163 RW_EXIT(&vswp->if_lockrw); 2164 2165 D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state); 2166 return (0); 2167 } 2168 2169 /* 2170 * Change the local interface address. 2171 */ 2172 static int 2173 vsw_m_unicst(void *arg, const uint8_t *macaddr) 2174 { 2175 vsw_t *vswp = (vsw_t *)arg; 2176 2177 D1(vswp, "%s: enter", __func__); 2178 2179 WRITE_ENTER(&vswp->if_lockrw); 2180 ether_copy(macaddr, &vswp->if_addr); 2181 RW_EXIT(&vswp->if_lockrw); 2182 2183 D1(vswp, "%s: exit", __func__); 2184 2185 return (0); 2186 } 2187 2188 static int 2189 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca) 2190 { 2191 vsw_t *vswp = (vsw_t *)arg; 2192 mcst_addr_t *mcst_p = NULL; 2193 uint64_t addr = 0x0; 2194 int i, ret = 0; 2195 2196 D1(vswp, "%s: enter", __func__); 2197 2198 /* 2199 * Convert address into form that can be used 2200 * as hash table key. 
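	 *
	 * For example, the all-hosts multicast address 01:00:5e:00:00:01
	 * folds into the 64-bit key 0x01005e000001, one octet at a
	 * time, most significant octet first:
	 *
	 *	addr = 0x0;
	 *	addr = (addr << 8) | 0x01;	addr now 0x01
	 *	addr = (addr << 8) | 0x00;	addr now 0x0100
	 *	addr = (addr << 8) | 0x5e;	addr now 0x01005e
	 *	...
	 *	addr = (addr << 8) | 0x01;	addr now 0x01005e000001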
2201 */ 2202 for (i = 0; i < ETHERADDRL; i++) { 2203 addr = (addr << 8) | mca[i]; 2204 } 2205 2206 D2(vswp, "%s: addr = 0x%llx", __func__, addr); 2207 2208 if (add) { 2209 D2(vswp, "%s: adding multicast", __func__); 2210 if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) { 2211 /* 2212 * Update the list of multicast addresses 2213 * contained within the vsw_t structure to 2214 * include this new one. 2215 */ 2216 mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP); 2217 if (mcst_p == NULL) { 2218 DERR(vswp, "%s unable to alloc mem", __func__); 2219 return (1); 2220 } 2221 mcst_p->addr = addr; 2222 2223 mutex_enter(&vswp->mca_lock); 2224 mcst_p->nextp = vswp->mcap; 2225 vswp->mcap = mcst_p; 2226 mutex_exit(&vswp->mca_lock); 2227 2228 /* 2229 * Call into the underlying driver to program the 2230 * address into HW. 2231 */ 2232 mutex_enter(&vswp->mac_lock); 2233 if (vswp->mh != NULL) { 2234 ret = mac_multicst_add(vswp->mh, mca); 2235 if (ret != 0) { 2236 cmn_err(CE_WARN, "!vsw%d: unable to " 2237 "add multicast address", 2238 vswp->instance); 2239 mutex_exit(&vswp->mac_lock); 2240 goto vsw_remove_addr; 2241 } 2242 } 2243 mutex_exit(&vswp->mac_lock); 2244 } else { 2245 cmn_err(CE_WARN, "!vsw%d: unable to add multicast " 2246 "address", vswp->instance); 2247 } 2248 return (ret); 2249 } 2250 2251 vsw_remove_addr: 2252 2253 D2(vswp, "%s: removing multicast", __func__); 2254 /* 2255 * Remove the address from the hash table.. 2256 */ 2257 if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) { 2258 2259 /* 2260 * ..and then from the list maintained in the 2261 * vsw_t structure. 2262 */ 2263 vsw_del_addr(VSW_LOCALDEV, vswp, addr); 2264 2265 mutex_enter(&vswp->mac_lock); 2266 if (vswp->mh != NULL) 2267 (void) mac_multicst_remove(vswp->mh, mca); 2268 mutex_exit(&vswp->mac_lock); 2269 } 2270 2271 D1(vswp, "%s: exit", __func__); 2272 2273 return (0); 2274 } 2275 2276 static int 2277 vsw_m_promisc(void *arg, boolean_t on) 2278 { 2279 vsw_t *vswp = (vsw_t *)arg; 2280 2281 D1(vswp, "%s: enter", __func__); 2282 2283 WRITE_ENTER(&vswp->if_lockrw); 2284 if (on) 2285 vswp->if_state |= VSW_IF_PROMISC; 2286 else 2287 vswp->if_state &= ~VSW_IF_PROMISC; 2288 RW_EXIT(&vswp->if_lockrw); 2289 2290 D1(vswp, "%s: exit", __func__); 2291 2292 return (0); 2293 } 2294 2295 static mblk_t * 2296 vsw_m_tx(void *arg, mblk_t *mp) 2297 { 2298 vsw_t *vswp = (vsw_t *)arg; 2299 2300 D1(vswp, "%s: enter", __func__); 2301 2302 vswp->vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL); 2303 2304 D1(vswp, "%s: exit", __func__); 2305 2306 return (NULL); 2307 } 2308 2309 /* 2310 * Register for machine description (MD) updates. 2311 * 2312 * Returns 0 on success, 1 on failure. 2313 */ 2314 static int 2315 vsw_mdeg_register(vsw_t *vswp) 2316 { 2317 mdeg_prop_spec_t *pspecp; 2318 mdeg_node_spec_t *inst_specp; 2319 mdeg_handle_t mdeg_hdl, mdeg_port_hdl; 2320 size_t templatesz; 2321 int inst, rv; 2322 2323 D1(vswp, "%s: enter", __func__); 2324 2325 /* 2326 * In each 'virtual-device' node in the MD there is a 2327 * 'cfg-handle' property which is the MD's concept of 2328 * an instance number (this may be completely different from 2329 * the device drivers instance #). OBP reads that value and 2330 * stores it in the 'reg' property of the appropriate node in 2331 * the device tree. So we use the 'reg' value when registering 2332 * with the mdeg framework, to ensure we get events for the 2333 * correct nodes. 
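	 *
	 * As an illustration (the property values here are hypothetical,
	 * the exact MD contents are machine specific), a MD node of the
	 * form:
	 *
	 *	virtual-device
	 *		name = "virtual-network-switch"
	 *		cfg-handle = 0x100
	 *
	 * would appear in the OBP device tree with a 'reg' value of
	 * 0x100, and 0x100 is then the instance value we hand to the
	 * mdeg framework below.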
2334 */ 2335 inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip, 2336 DDI_PROP_DONTPASS, reg_propname, -1); 2337 if (inst == -1) { 2338 cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from " 2339 "OBP device tree", vswp->instance, reg_propname); 2340 return (1); 2341 } 2342 2343 D2(vswp, "%s: instance %d registering with mdeg", __func__, inst); 2344 2345 /* 2346 * Allocate and initialize a per-instance copy 2347 * of the global property spec array that will 2348 * uniquely identify this vsw instance. 2349 */ 2350 templatesz = sizeof (vsw_prop_template); 2351 pspecp = kmem_zalloc(templatesz, KM_SLEEP); 2352 2353 bcopy(vsw_prop_template, pspecp, templatesz); 2354 2355 VSW_SET_MDEG_PROP_INST(pspecp, inst); 2356 2357 /* initialize the complete prop spec structure */ 2358 inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP); 2359 inst_specp->namep = "virtual-device"; 2360 inst_specp->specp = pspecp; 2361 2362 /* 2363 * Register an interest in 'virtual-device' nodes with a 2364 * 'name' property of 'virtual-network-switch' 2365 */ 2366 rv = mdeg_register(inst_specp, &vdev_match, vsw_mdeg_cb, 2367 (void *)vswp, &mdeg_hdl); 2368 if (rv != MDEG_SUCCESS) { 2369 DERR(vswp, "%s: mdeg_register failed (%d) for vsw node", 2370 __func__, rv); 2371 goto mdeg_reg_fail; 2372 } 2373 2374 /* 2375 * Register an interest in 'vsw-port' nodes. 2376 */ 2377 rv = mdeg_register(inst_specp, &vport_match, vsw_port_mdeg_cb, 2378 (void *)vswp, &mdeg_port_hdl); 2379 if (rv != MDEG_SUCCESS) { 2380 DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv); 2381 (void) mdeg_unregister(mdeg_hdl); 2382 goto mdeg_reg_fail; 2383 } 2384 2385 /* save off data that will be needed later */ 2386 vswp->inst_spec = inst_specp; 2387 vswp->mdeg_hdl = mdeg_hdl; 2388 vswp->mdeg_port_hdl = mdeg_port_hdl; 2389 2390 D1(vswp, "%s: exit", __func__); 2391 return (0); 2392 2393 mdeg_reg_fail: 2394 cmn_err(CE_WARN, "!vsw%d: Unable to register MDEG callbacks", 2395 vswp->instance); 2396 kmem_free(pspecp, templatesz); 2397 kmem_free(inst_specp, sizeof (mdeg_node_spec_t)); 2398 2399 vswp->mdeg_hdl = NULL; 2400 vswp->mdeg_port_hdl = NULL; 2401 2402 return (1); 2403 } 2404 2405 static void 2406 vsw_mdeg_unregister(vsw_t *vswp) 2407 { 2408 D1(vswp, "vsw_mdeg_unregister: enter"); 2409 2410 if (vswp->mdeg_hdl != NULL) 2411 (void) mdeg_unregister(vswp->mdeg_hdl); 2412 2413 if (vswp->mdeg_port_hdl != NULL) 2414 (void) mdeg_unregister(vswp->mdeg_port_hdl); 2415 2416 if (vswp->inst_spec != NULL) { 2417 if (vswp->inst_spec->specp != NULL) { 2418 (void) kmem_free(vswp->inst_spec->specp, 2419 sizeof (vsw_prop_template)); 2420 vswp->inst_spec->specp = NULL; 2421 } 2422 2423 (void) kmem_free(vswp->inst_spec, 2424 sizeof (mdeg_node_spec_t)); 2425 vswp->inst_spec = NULL; 2426 } 2427 2428 D1(vswp, "vsw_mdeg_unregister: exit"); 2429 } 2430 2431 /* 2432 * Mdeg callback invoked for the vsw node itself. 2433 */ 2434 static int 2435 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp) 2436 { 2437 vsw_t *vswp; 2438 int idx; 2439 md_t *mdp; 2440 mde_cookie_t node; 2441 uint64_t inst; 2442 char *node_name = NULL; 2443 2444 if (resp == NULL) 2445 return (MDEG_FAILURE); 2446 2447 vswp = (vsw_t *)cb_argp; 2448 2449 D1(vswp, "%s: added %d : removed %d : curr matched %d" 2450 " : prev matched %d", __func__, resp->added.nelem, 2451 resp->removed.nelem, resp->match_curr.nelem, 2452 resp->match_prev.nelem); 2453 2454 /* 2455 * Expect 'added' to be non-zero if virtual-network-switch 2456 * nodes exist in the MD when the driver attaches. 
2457  */
2458 	for (idx = 0; idx < resp->added.nelem; idx++) {
2459 		mdp = resp->added.mdp;
2460 		node = resp->added.mdep[idx];
2461
2462 		if (md_get_prop_str(mdp, node, "name", &node_name) != 0) {
2463 			DERR(vswp, "%s: unable to get node name for "
2464 			    "node(%d) 0x%lx", __func__, idx, node);
2465 			continue;
2466 		}
2467
2468 		if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) {
2469 			DERR(vswp, "%s: prop(cfg-handle) not found port(%d)",
2470 			    __func__, idx);
2471 			continue;
2472 		}
2473
2474 		D2(vswp, "%s: added node(%d) 0x%lx with name %s "
2475 		    "and inst %ld", __func__, idx, node, node_name, inst);
2476
2477 		vsw_get_initial_md_properties(vswp, mdp, node);
2478 	}
2479
2480 	/*
2481 	 * A non-zero 'match' value indicates that the MD has been
2482 	 * updated and that a virtual-network-switch node is present
2483 	 * which may or may not have been updated. It is up to the clients
2484 	 * to examine their own nodes and determine if they have changed.
2485 	 */
2486 	for (idx = 0; idx < resp->match_curr.nelem; idx++) {
2487 		mdp = resp->match_curr.mdp;
2488 		node = resp->match_curr.mdep[idx];
2489
2490 		if (md_get_prop_str(mdp, node, "name", &node_name) != 0) {
2491 			DERR(vswp, "%s: unable to get node name for "
2492 			    "node(%d) 0x%lx", __func__, idx, node);
2493 			continue;
2494 		}
2495
2496 		if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) {
2497 			DERR(vswp, "%s: prop(cfg-handle) not found port(%d)",
2498 			    __func__, idx);
2499 			continue;
2500 		}
2501
2502 		D2(vswp, "%s: changed node(%d) 0x%lx with name %s "
2503 		    "and inst %ld", __func__, idx, node, node_name, inst);
2504
2505 		vsw_update_md_prop(vswp, mdp, node);
2506 	}
2507
2508 	return (MDEG_SUCCESS);
2509 }
2510
2511 /*
2512  * Mdeg callback invoked for changes to the vsw-port nodes
2513  * under the vsw node.
2514  */
2515 static int
2516 vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
2517 {
2518 	vsw_t		*vswp;
2519 	int		idx;
2520 	md_t		*mdp;
2521 	mde_cookie_t	node;
2522 	uint64_t	inst;
2523
2524 	if ((resp == NULL) || (cb_argp == NULL))
2525 		return (MDEG_FAILURE);
2526
2527 	vswp = (vsw_t *)cb_argp;
2528
2529 	D2(vswp, "%s: added %d : removed %d : curr matched %d"
2530 	    " : prev matched %d", __func__, resp->added.nelem,
2531 	    resp->removed.nelem, resp->match_curr.nelem,
2532 	    resp->match_prev.nelem);
2533
2534 	/* process added ports */
2535 	for (idx = 0; idx < resp->added.nelem; idx++) {
2536 		mdp = resp->added.mdp;
2537 		node = resp->added.mdep[idx];
2538
2539 		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);
2540
2541 		if (vsw_port_add(vswp, mdp, &node) != 0) {
2542 			cmn_err(CE_WARN, "!vsw%d: Unable to add new port "
2543 			    "(0x%lx)", vswp->instance, node);
2544 		}
2545 	}
2546
2547 	/* process removed ports */
2548 	for (idx = 0; idx < resp->removed.nelem; idx++) {
2549 		mdp = resp->removed.mdp;
2550 		node = resp->removed.mdep[idx];
2551
2552 		if (md_get_prop_val(mdp, node, id_propname, &inst)) {
2553 			DERR(vswp, "%s: prop(%s) not found in port(%d)",
2554 			    __func__, id_propname, idx);
2555 			continue;
2556 		}
2557
2558 		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);
2559
2560 		if (vsw_port_detach(vswp, inst) != 0) {
2561 			cmn_err(CE_WARN, "!vsw%d: Unable to remove port %ld",
2562 			    vswp->instance, inst);
2563 		}
2564 	}
2565
2566 	/*
2567 	 * Currently no support for updating already active ports.
2568 	 * So, ignore the match_curr and match_prev arrays for now.
2569 	 */
2570
2571 	D1(vswp, "%s: exit", __func__);
2572
2573 	return (MDEG_SUCCESS);
2574 }
2575
2576 /*
2577  * Read the initial start-of-day values from the specified MD node.
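 *
 * Specifically we read the physical device name, the MAC address of
 * the vswitch itself and the requested switching mode(s), then attempt
 * to set up the switching layer and (if we have a usable MAC address)
 * register with the MAC framework.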
2578  */
2579 static void
2580 vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
2581 {
2582 	int		i;
2583 	uint64_t	macaddr = 0;
2584
2585 	D1(vswp, "%s: enter", __func__);
2586
2587 	if (vsw_get_md_physname(vswp, mdp, node, vswp->physname) == 0) {
2588 		/*
2589 		 * Note it is valid for the physname property to
2590 		 * be NULL so check the actual name length to determine
2591 		 * if we have an actual device name.
2592 		 */
2593 		if (strlen(vswp->physname) > 0)
2594 			vswp->mdprops |= VSW_MD_PHYSNAME;
2595 	} else {
2596 		cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical "
2597 		    "device from MD", vswp->instance);
2598 		return;
2599 	}
2600
2601 	/* mac address for vswitch device itself */
2602 	if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) {
2603 		cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD",
2604 		    vswp->instance);
2605
2606 		/*
2607 		 * Fallback to using the mac address of the physical
2608 		 * device.
2609 		 */
2610 		if (vsw_get_physaddr(vswp) == 0) {
2611 			cmn_err(CE_NOTE, "!vsw%d: Using MAC address from "
2612 			    "physical device (%s)", vswp->instance,
2613 			    vswp->physname);
2614 		} else {
2615 			cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address "
2616 			    "from device %s", vswp->instance,
2617 			    vswp->physname);
2618 		}
2619 	} else {
2620 		WRITE_ENTER(&vswp->if_lockrw);
2621 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2622 			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
2623 			macaddr >>= 8;
2624 		}
2625 		RW_EXIT(&vswp->if_lockrw);
2626 		vswp->mdprops |= VSW_MD_MACADDR;
2627 	}
2628
2629 	if (vsw_get_md_smodes(vswp, mdp, node,
2630 	    vswp->smode, &vswp->smode_num)) {
2631 		cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from "
2632 		    "MD, defaulting to programmed mode", vswp->instance,
2633 		    smode_propname);
2634
2635 		for (i = 0; i < NUM_SMODES; i++)
2636 			vswp->smode[i] = VSW_LAYER2;
2637
2638 		vswp->smode_num = NUM_SMODES;
2639 	} else {
2640 		ASSERT(vswp->smode_num != 0);
2641 		vswp->mdprops |= VSW_MD_SMODE;
2642 	}
2643
2644 	/*
2645 	 * If we are unable to set up any switching mode then there
2646 	 * is nothing more we can do.
2647 	 */
2648 	if (vsw_setup_switching(vswp))
2649 		return;
2650
2651 	WRITE_ENTER(&vswp->if_lockrw);
2652 	vswp->if_state &= ~VSW_IF_UP;
2653 	RW_EXIT(&vswp->if_lockrw);
2654 	if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
2655 		if (vsw_mac_register(vswp) != 0) {
2656 			/*
2657 			 * Treat this as a non-fatal error as we may be
2658 			 * able to operate in some other mode.
2659 			 */
2660 			cmn_err(CE_WARN, "!vsw%d: Unable to register as "
2661 			    "provider with MAC layer", vswp->instance);
2662 		}
2663 	}
2664
2665 	D1(vswp, "%s: exit", __func__);
2666 }
2667
2668 /*
2669  * Check to see if the relevant properties in the specified node have
2670  * changed, and if so take the appropriate action.
2671  *
2672  * If any of the properties are missing or invalid we don't take
2673  * any action, as this function should only be invoked when modifications
2674  * have been made to what we assume is a working configuration, which
2675  * we leave active.
2676  *
2677  * Note it is legal for this routine to be invoked even if none of the
2678  * properties in the port node within the MD have actually changed.
2679 */ 2680 static void 2681 vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) 2682 { 2683 char physname[LIFNAMSIZ]; 2684 char drv[LIFNAMSIZ]; 2685 uint_t ddi_instance; 2686 uint8_t new_smode[NUM_SMODES]; 2687 int i, smode_num = 0; 2688 uint64_t macaddr = 0; 2689 vsw_port_list_t *plist = &vswp->plist; 2690 vsw_port_t *port = NULL; 2691 enum {MD_init = 0x1, 2692 MD_physname = 0x2, 2693 MD_macaddr = 0x4, 2694 MD_smode = 0x8} updated; 2695 2696 updated = MD_init; 2697 2698 D1(vswp, "%s: enter", __func__); 2699 2700 /* 2701 * Check if name of physical device in MD has changed. 2702 */ 2703 if (vsw_get_md_physname(vswp, mdp, node, (char *)&physname) == 0) { 2704 /* 2705 * Do basic sanity check on new device name/instance, 2706 * if its non NULL. It is valid for the device name to 2707 * have changed from a non NULL to a NULL value, i.e. 2708 * the vsw is being changed to 'routed' mode. 2709 */ 2710 if ((strlen(physname) != 0) && 2711 (ddi_parse(physname, drv, 2712 &ddi_instance) != DDI_SUCCESS)) { 2713 cmn_err(CE_WARN, "!vsw%d: new device name %s is not" 2714 " a valid device name/instance", 2715 vswp->instance, physname); 2716 goto fail_reconf; 2717 } 2718 2719 if (strcmp(physname, vswp->physname)) { 2720 D2(vswp, "%s: device name changed from %s to %s", 2721 __func__, vswp->physname, physname); 2722 2723 updated |= MD_physname; 2724 } else { 2725 D2(vswp, "%s: device name unchanged at %s", 2726 __func__, vswp->physname); 2727 } 2728 } else { 2729 cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical " 2730 "device from updated MD.", vswp->instance); 2731 goto fail_reconf; 2732 } 2733 2734 /* 2735 * Check if MAC address has changed. 2736 */ 2737 if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) { 2738 cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD", 2739 vswp->instance); 2740 goto fail_reconf; 2741 } else { 2742 READ_ENTER(&vswp->if_lockrw); 2743 for (i = ETHERADDRL - 1; i >= 0; i--) { 2744 if (vswp->if_addr.ether_addr_octet[i] 2745 != (macaddr & 0xFF)) { 2746 D2(vswp, "%s: octet[%d] 0x%x != 0x%x", 2747 __func__, i, 2748 vswp->if_addr.ether_addr_octet[i], 2749 (macaddr & 0xFF)); 2750 updated |= MD_macaddr; 2751 break; 2752 } 2753 macaddr >>= 8; 2754 } 2755 RW_EXIT(&vswp->if_lockrw); 2756 } 2757 2758 /* 2759 * Check if switching modes have changed. 2760 */ 2761 if (vsw_get_md_smodes(vswp, mdp, node, 2762 new_smode, &smode_num)) { 2763 cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from MD", 2764 vswp->instance, smode_propname); 2765 goto fail_reconf; 2766 } else { 2767 ASSERT(smode_num != 0); 2768 if (smode_num != vswp->smode_num) { 2769 D2(vswp, "%s: number of modes changed from %d to %d", 2770 __func__, vswp->smode_num, smode_num); 2771 } 2772 2773 for (i = 0; i < smode_num; i++) { 2774 if (new_smode[i] != vswp->smode[i]) { 2775 D2(vswp, "%s: mode changed from %d to %d", 2776 __func__, vswp->smode[i], new_smode[i]); 2777 updated |= MD_smode; 2778 break; 2779 } 2780 } 2781 } 2782 2783 /* 2784 * Now make any changes which are needed... 2785 */ 2786 2787 if (updated & (MD_physname | MD_smode)) { 2788 /* 2789 * Disconnect all ports from the current card 2790 */ 2791 WRITE_ENTER(&plist->lockrw); 2792 for (port = plist->head; port != NULL; port = port->p_next) { 2793 /* Remove address if was programmed into HW. */ 2794 if (vsw_unset_hw(vswp, port)) { 2795 RW_EXIT(&plist->lockrw); 2796 goto fail_update; 2797 } 2798 } 2799 RW_EXIT(&plist->lockrw); 2800 2801 /* 2802 * Stop, detach the old device.. 
2803 */ 2804 vsw_mac_detach(vswp); 2805 2806 /* 2807 * Update phys name. 2808 */ 2809 if (updated & MD_physname) { 2810 cmn_err(CE_NOTE, "!vsw%d: changing from %s to %s", 2811 vswp->instance, vswp->physname, physname); 2812 (void) strncpy(vswp->physname, 2813 physname, strlen(physname) + 1); 2814 2815 if (strlen(vswp->physname) > 0) 2816 vswp->mdprops |= VSW_MD_PHYSNAME; 2817 } 2818 2819 /* 2820 * Update array with the new switch mode values. 2821 */ 2822 if (updated & MD_smode) { 2823 for (i = 0; i < smode_num; i++) 2824 vswp->smode[i] = new_smode[i]; 2825 2826 vswp->smode_num = smode_num; 2827 vswp->smode_idx = 0; 2828 } 2829 2830 /* 2831 * ..and attach, start the new device. 2832 */ 2833 if (vsw_setup_switching(vswp)) 2834 goto fail_update; 2835 2836 /* 2837 * Connect ports to new card. 2838 */ 2839 WRITE_ENTER(&plist->lockrw); 2840 for (port = plist->head; port != NULL; port = port->p_next) { 2841 if (vsw_set_hw(vswp, port)) { 2842 RW_EXIT(&plist->lockrw); 2843 goto fail_update; 2844 } 2845 } 2846 RW_EXIT(&plist->lockrw); 2847 } 2848 2849 if (updated & MD_macaddr) { 2850 cmn_err(CE_NOTE, "!vsw%d: changing mac address to 0x%lx", 2851 vswp->instance, macaddr); 2852 2853 WRITE_ENTER(&vswp->if_lockrw); 2854 for (i = ETHERADDRL - 1; i >= 0; i--) { 2855 vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF; 2856 macaddr >>= 8; 2857 } 2858 RW_EXIT(&vswp->if_lockrw); 2859 2860 /* 2861 * Notify the MAC layer of the changed address. 2862 */ 2863 mac_unicst_update(vswp->if_mh, (uint8_t *)&vswp->if_addr); 2864 } 2865 2866 return; 2867 2868 fail_reconf: 2869 cmn_err(CE_WARN, "!vsw%d: configuration unchanged", vswp->instance); 2870 return; 2871 2872 fail_update: 2873 cmn_err(CE_WARN, "!vsw%d: update of configuration failed", 2874 vswp->instance); 2875 } 2876 2877 /* 2878 * Add a new port to the system. 2879 * 2880 * Returns 0 on success, 1 on failure. 2881 */ 2882 int 2883 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node) 2884 { 2885 uint64_t ldc_id; 2886 uint8_t *addrp; 2887 int i, addrsz; 2888 int num_nodes = 0, nchan = 0; 2889 int listsz = 0; 2890 mde_cookie_t *listp = NULL; 2891 struct ether_addr ea; 2892 uint64_t macaddr; 2893 uint64_t inst = 0; 2894 vsw_port_t *port; 2895 2896 if (md_get_prop_val(mdp, *node, id_propname, &inst)) { 2897 DWARN(vswp, "%s: prop(%s) not found", __func__, 2898 id_propname); 2899 return (1); 2900 } 2901 2902 /* 2903 * Find the channel endpoint node(s) (which should be under this 2904 * port node) which contain the channel id(s). 
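	 *
	 * A sketch of the expected MD layout (property names abbreviated
	 * and all values hypothetical):
	 *
	 *	vsw-port
	 *		id = 0x1
	 *		remote-mac-address = [ 00 14 4f f8 00 01 ]
	 *		channel-endpoint
	 *			id = 0x5	<- the ldc id we want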
2905  */
2906 	if ((num_nodes = md_node_count(mdp)) <= 0) {
2907 		DERR(vswp, "%s: invalid number of nodes found (%d)",
2908 		    __func__, num_nodes);
2909 		return (1);
2910 	}
2911
2912 	D2(vswp, "%s: %d nodes found", __func__, num_nodes);
2913
2914 	/* allocate enough space for node list */
2915 	listsz = num_nodes * sizeof (mde_cookie_t);
2916 	listp = kmem_zalloc(listsz, KM_SLEEP);
2917
2918 	nchan = md_scan_dag(mdp, *node,
2919 	    md_find_name(mdp, chan_propname),
2920 	    md_find_name(mdp, "fwd"), listp);
2921
2922 	if (nchan <= 0) {
2923 		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
2924 		kmem_free(listp, listsz);
2925 		return (1);
2926 	}
2927
2928 	D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname);
2929
2930 	/* use property from first node found */
2931 	if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) {
2932 		DWARN(vswp, "%s: prop(%s) not found", __func__,
2933 		    id_propname);
2934 		kmem_free(listp, listsz);
2935 		return (1);
2936 	}
2937
2938 	/* don't need list any more */
2939 	kmem_free(listp, listsz);
2940
2941 	D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id);
2942
2943 	/* read mac-address property */
2944 	if (md_get_prop_data(mdp, *node, remaddr_propname,
2945 	    &addrp, &addrsz)) {
2946 		DWARN(vswp, "%s: prop(%s) not found",
2947 		    __func__, remaddr_propname);
2948 		return (1);
2949 	}
2950
2951 	if (addrsz < ETHERADDRL) {
2952 		DWARN(vswp, "%s: invalid address size", __func__);
2953 		return (1);
2954 	}
2955
2956 	macaddr = *((uint64_t *)addrp);
2957 	D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr);
2958
2959 	for (i = ETHERADDRL - 1; i >= 0; i--) {
2960 		ea.ether_addr_octet[i] = macaddr & 0xFF;
2961 		macaddr >>= 8;
2962 	}
2963
2964 	if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) {
2965 		DERR(vswp, "%s: failed to attach port", __func__);
2966 		return (1);
2967 	}
2968
2969 	port = vsw_lookup_port(vswp, (int)inst);
2970
2971 	/* we just successfully created the port, so it should exist */
2972 	ASSERT(port != NULL);
2973
2974 	return (0);
2975 }
2976
2977 /*
2978  * Attach the specified port.
2979  *
2980  * Returns 0 on success, 1 on failure.
2981  */
2982 static int
2983 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
2984     struct ether_addr *macaddr)
2985 {
2986 	vsw_port_list_t		*plist = &vswp->plist;
2987 	vsw_port_t		*port, **prev_port;
2988 	int			i;
2989
2990 	D1(vswp, "%s: enter : port %d", __func__, p_instance);
2991
2992 	/* port already exists?
*/ 2993 READ_ENTER(&plist->lockrw); 2994 for (port = plist->head; port != NULL; port = port->p_next) { 2995 if (port->p_instance == p_instance) { 2996 DWARN(vswp, "%s: port instance %d already attached", 2997 __func__, p_instance); 2998 RW_EXIT(&plist->lockrw); 2999 return (1); 3000 } 3001 } 3002 RW_EXIT(&plist->lockrw); 3003 3004 port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP); 3005 port->p_vswp = vswp; 3006 port->p_instance = p_instance; 3007 port->p_ldclist.num_ldcs = 0; 3008 port->p_ldclist.head = NULL; 3009 port->addr_set = VSW_ADDR_UNSET; 3010 3011 rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL); 3012 3013 mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL); 3014 mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL); 3015 3016 mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL); 3017 cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL); 3018 3019 mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL); 3020 cv_init(&port->state_cv, NULL, CV_DRIVER, NULL); 3021 port->state = VSW_PORT_INIT; 3022 3023 if (nids > VSW_PORT_MAX_LDCS) { 3024 D2(vswp, "%s: using first of %d ldc ids", 3025 __func__, nids); 3026 nids = VSW_PORT_MAX_LDCS; 3027 } 3028 3029 D2(vswp, "%s: %d nids", __func__, nids); 3030 for (i = 0; i < nids; i++) { 3031 D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]); 3032 if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) { 3033 DERR(vswp, "%s: ldc_attach failed", __func__); 3034 3035 rw_destroy(&port->p_ldclist.lockrw); 3036 3037 cv_destroy(&port->ref_cv); 3038 mutex_destroy(&port->ref_lock); 3039 3040 cv_destroy(&port->state_cv); 3041 mutex_destroy(&port->state_lock); 3042 3043 mutex_destroy(&port->tx_lock); 3044 mutex_destroy(&port->mca_lock); 3045 kmem_free(port, sizeof (vsw_port_t)); 3046 return (1); 3047 } 3048 } 3049 3050 ether_copy(macaddr, &port->p_macaddr); 3051 3052 WRITE_ENTER(&plist->lockrw); 3053 3054 /* create the fdb entry for this port/mac address */ 3055 (void) vsw_add_fdb(vswp, port); 3056 3057 (void) vsw_set_hw(vswp, port); 3058 3059 /* link it into the list of ports for this vsw instance */ 3060 prev_port = (vsw_port_t **)(&plist->head); 3061 port->p_next = *prev_port; 3062 *prev_port = port; 3063 plist->num_ports++; 3064 RW_EXIT(&plist->lockrw); 3065 3066 /* 3067 * Initialise the port and any ldc's under it. 3068 */ 3069 (void) vsw_init_ldcs(port); 3070 3071 D1(vswp, "%s: exit", __func__); 3072 return (0); 3073 } 3074 3075 /* 3076 * Detach the specified port. 3077 * 3078 * Returns 0 on success, 1 on failure. 3079 */ 3080 static int 3081 vsw_port_detach(vsw_t *vswp, int p_instance) 3082 { 3083 vsw_port_t *port = NULL; 3084 vsw_port_list_t *plist = &vswp->plist; 3085 3086 D1(vswp, "%s: enter: port id %d", __func__, p_instance); 3087 3088 WRITE_ENTER(&plist->lockrw); 3089 3090 if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) { 3091 RW_EXIT(&plist->lockrw); 3092 return (1); 3093 } 3094 3095 if (vsw_plist_del_node(vswp, port)) { 3096 RW_EXIT(&plist->lockrw); 3097 return (1); 3098 } 3099 3100 /* Remove address if was programmed into HW. */ 3101 (void) vsw_unset_hw(vswp, port); 3102 3103 /* Remove the fdb entry for this port/mac address */ 3104 (void) vsw_del_fdb(vswp, port); 3105 3106 /* Remove any multicast addresses.. */ 3107 vsw_del_mcst_port(port); 3108 3109 /* 3110 * No longer need to hold writer lock on port list now 3111 * that we have unlinked the target port from the list. 
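	 * (Below we re-acquire it briefly as a reader only, which is
	 * sufficient for the hardware reconfiguration check.)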
3112 	 */
3113 	RW_EXIT(&plist->lockrw);
3114
3115 	READ_ENTER(&plist->lockrw);
3116
3117 	if (vswp->recfg_reqd)
3118 		(void) vsw_reconfig_hw(vswp);
3119
3120 	RW_EXIT(&plist->lockrw);
3121
3122 	if (vsw_port_delete(port)) {
3123 		return (1);
3124 	}
3125
3126 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
3127 	return (0);
3128 }
3129
3130 /*
3131  * Detach all active ports.
3132  *
3133  * Returns 0 on success, 1 on failure.
3134  */
3135 static int
3136 vsw_detach_ports(vsw_t *vswp)
3137 {
3138 	vsw_port_list_t	*plist = &vswp->plist;
3139 	vsw_port_t	*port = NULL;
3140
3141 	D1(vswp, "%s: enter", __func__);
3142
3143 	WRITE_ENTER(&plist->lockrw);
3144
3145 	while ((port = plist->head) != NULL) {
3146 		if (vsw_plist_del_node(vswp, port)) {
3147 			DERR(vswp, "%s: Error deleting port %d"
3148 			    " from port list", __func__,
3149 			    port->p_instance);
3150 			RW_EXIT(&plist->lockrw);
3151 			return (1);
3152 		}
3153
3154 		/* Remove address if it was programmed into HW. */
3155 		(void) vsw_unset_hw(vswp, port);
3156
3157 		/* Remove the fdb entry for this port/mac address */
3158 		(void) vsw_del_fdb(vswp, port);
3159
3160 		/* Remove any multicast addresses.. */
3161 		vsw_del_mcst_port(port);
3162
3163 		/*
3164 		 * No longer need to hold the lock on the port list
3165 		 * now that we have unlinked the target port from the
3166 		 * list.
3167 		 */
3168 		RW_EXIT(&plist->lockrw);
3169 		if (vsw_port_delete(port)) {
3170 			DERR(vswp, "%s: Error deleting port %d",
3171 			    __func__, port->p_instance);
3172 			return (1);
3173 		}
3174 		WRITE_ENTER(&plist->lockrw);
3175 	}
3176 	RW_EXIT(&plist->lockrw);
3177
3178 	D1(vswp, "%s: exit", __func__);
3179
3180 	return (0);
3181 }
3182
3183 /*
3184  * Delete the specified port.
3185  *
3186  * Returns 0 on success, 1 on failure.
3187  */
3188 static int
3189 vsw_port_delete(vsw_port_t *port)
3190 {
3191 	vsw_ldc_list_t	*ldcl;
3192 	vsw_t		*vswp = port->p_vswp;
3193
3194 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
3195
3196 	(void) vsw_uninit_ldcs(port);
3197
3198 	/*
3199 	 * Wait for any pending ctrl msg tasks which reference this
3200 	 * port to finish.
3201 	 */
3202 	if (vsw_drain_port_taskq(port))
3203 		return (1);
3204
3205 	/*
3206 	 * Wait for port reference count to hit zero.
3207 	 */
3208 	mutex_enter(&port->ref_lock);
3209 	while (port->ref_cnt != 0)
3210 		cv_wait(&port->ref_cv, &port->ref_lock);
3211 	mutex_exit(&port->ref_lock);
3212
3213 	/*
3214 	 * Wait for any active callbacks to finish
3215 	 */
3216 	if (vsw_drain_ldcs(port))
3217 		return (1);
3218
3219 	ldcl = &port->p_ldclist;
3220 	WRITE_ENTER(&ldcl->lockrw);
3221 	while (ldcl->num_ldcs > 0) {
3222 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
3223 			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
3224 			    vswp->instance, ldcl->head->ldc_id);
3225 			RW_EXIT(&ldcl->lockrw);
3226 			return (1);
3227 		}
3228 	}
3229 	RW_EXIT(&ldcl->lockrw);
3230
3231 	rw_destroy(&port->p_ldclist.lockrw);
3232
3233 	mutex_destroy(&port->mca_lock);
3234 	mutex_destroy(&port->tx_lock);
3235 	cv_destroy(&port->ref_cv);
3236 	mutex_destroy(&port->ref_lock);
3237
3238 	cv_destroy(&port->state_cv);
3239 	mutex_destroy(&port->state_lock);
3240
3241 	kmem_free(port, sizeof (vsw_port_t));
3242
3243 	D1(vswp, "%s: exit", __func__);
3244
3245 	return (0);
3246 }
3247
3248 /*
3249  * Attach a logical domain channel (ldc) under a specified port.
3250  *
3251  * Returns 0 on success, 1 on failure.
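 *
 * In outline, the steps below (and the 'progress' bits which let the
 * failure path unwind only what was actually done) are:
 *
 *	vio_create_mblks()	- create rx mblk pool	(PROG_mblks)
 *	ldc_init()		- initialise the channel
 *	ldc_reg_callback()	- register vsw_ldc_cb	(PROG_callback)
 *	ldc_status()		- record initial channel status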
3252 */ 3253 static int 3254 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id) 3255 { 3256 vsw_t *vswp = port->p_vswp; 3257 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3258 vsw_ldc_t *ldcp = NULL; 3259 ldc_attr_t attr; 3260 ldc_status_t istatus; 3261 int status = DDI_FAILURE; 3262 int rv; 3263 enum { PROG_init = 0x0, PROG_mblks = 0x1, 3264 PROG_callback = 0x2} 3265 progress; 3266 3267 progress = PROG_init; 3268 3269 D1(vswp, "%s: enter", __func__); 3270 3271 ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP); 3272 if (ldcp == NULL) { 3273 DERR(vswp, "%s: kmem_zalloc failed", __func__); 3274 return (1); 3275 } 3276 ldcp->ldc_id = ldc_id; 3277 3278 /* allocate pool of receive mblks */ 3279 rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh)); 3280 if (rv) { 3281 DWARN(vswp, "%s: unable to create free mblk pool for" 3282 " channel %ld (rv %d)", __func__, ldc_id, rv); 3283 kmem_free(ldcp, sizeof (vsw_ldc_t)); 3284 return (1); 3285 } 3286 3287 progress |= PROG_mblks; 3288 3289 mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL); 3290 mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL); 3291 mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL); 3292 cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL); 3293 rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL); 3294 rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL); 3295 3296 /* required for handshake with peer */ 3297 ldcp->local_session = (uint64_t)ddi_get_lbolt(); 3298 ldcp->peer_session = 0; 3299 ldcp->session_status = 0; 3300 3301 mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL); 3302 ldcp->hss_id = 1; /* Initial handshake session id */ 3303 3304 /* only set for outbound lane, inbound set by peer */ 3305 mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL); 3306 mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL); 3307 vsw_set_lane_attr(vswp, &ldcp->lane_out); 3308 3309 attr.devclass = LDC_DEV_NT_SVC; 3310 attr.instance = ddi_get_instance(vswp->dip); 3311 attr.mode = LDC_MODE_UNRELIABLE; 3312 attr.mtu = VSW_LDC_MTU; 3313 status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle); 3314 if (status != 0) { 3315 DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)", 3316 __func__, ldc_id, status); 3317 goto ldc_attach_fail; 3318 } 3319 3320 status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp); 3321 if (status != 0) { 3322 DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)", 3323 __func__, ldc_id, status); 3324 (void) ldc_fini(ldcp->ldc_handle); 3325 goto ldc_attach_fail; 3326 } 3327 3328 progress |= PROG_callback; 3329 3330 mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL); 3331 3332 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 3333 DERR(vswp, "%s: ldc_status failed", __func__); 3334 mutex_destroy(&ldcp->status_lock); 3335 goto ldc_attach_fail; 3336 } 3337 3338 ldcp->ldc_status = istatus; 3339 ldcp->ldc_port = port; 3340 ldcp->ldc_vswp = vswp; 3341 3342 /* link it into the list of channels for this port */ 3343 WRITE_ENTER(&ldcl->lockrw); 3344 ldcp->ldc_next = ldcl->head; 3345 ldcl->head = ldcp; 3346 ldcl->num_ldcs++; 3347 RW_EXIT(&ldcl->lockrw); 3348 3349 D1(vswp, "%s: exit", __func__); 3350 return (0); 3351 3352 ldc_attach_fail: 3353 mutex_destroy(&ldcp->ldc_txlock); 3354 mutex_destroy(&ldcp->ldc_cblock); 3355 3356 cv_destroy(&ldcp->drain_cv); 3357 3358 rw_destroy(&ldcp->lane_in.dlistrw); 3359 rw_destroy(&ldcp->lane_out.dlistrw); 3360 3361 if (progress & PROG_callback) { 3362 (void) ldc_unreg_callback(ldcp->ldc_handle); 3363 } 3364 3365 if ((progress & PROG_mblks) && 
(ldcp->rxh != NULL)) {
3366 		if (vio_destroy_mblks(ldcp->rxh) != 0) {
3367 			/*
3368 			 * Something odd has happened, as the destroy
3369 			 * will only fail if some mblks have been allocated
3370 			 * from the pool already (which shouldn't happen)
3371 			 * and have not been returned.
3372 			 *
3373 			 * Add the pool pointer to a list maintained in
3374 			 * the device instance. Another attempt will be made
3375 			 * to free the pool when the device itself detaches.
3376 			 */
3377 			cmn_err(CE_WARN, "!vsw%d: Creation of ldc channel %ld "
3378 			    "failed and cannot destroy associated mblk "
3379 			    "pool", vswp->instance, ldc_id);
3380 			ldcp->rxh->nextp = vswp->rxh;
3381 			vswp->rxh = ldcp->rxh;
3382 		}
3383 	}
3384 	mutex_destroy(&ldcp->drain_cv_lock);
3385 	mutex_destroy(&ldcp->hss_lock);
3386
3387 	mutex_destroy(&ldcp->lane_in.seq_lock);
3388 	mutex_destroy(&ldcp->lane_out.seq_lock);
3389 	kmem_free(ldcp, sizeof (vsw_ldc_t));
3390
3391 	return (1);
3392 }
3393
3394 /*
3395  * Detach a logical domain channel (ldc) belonging to a
3396  * particular port.
3397  *
3398  * Returns 0 on success, 1 on failure.
3399  */
3400 static int
3401 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
3402 {
3403 	vsw_t		*vswp = port->p_vswp;
3404 	vsw_ldc_t	*ldcp, **prev_ldcp;
3405 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3406 	int		rv;
3407
3408 	prev_ldcp = &ldcl->head;
3409 	for (; (ldcp = *prev_ldcp) != NULL; prev_ldcp = &ldcp->ldc_next) {
3410 		if (ldcp->ldc_id == ldc_id) {
3411 			break;
3412 		}
3413 	}
3414
3415 	/* specified ldc id not found */
3416 	if (ldcp == NULL) {
3417 		DERR(vswp, "%s: ldcp = NULL", __func__);
3418 		return (1);
3419 	}
3420
3421 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
3422
3423 	/*
3424 	 * Before we can close the channel we must release any mapped
3425 	 * resources (e.g. drings).
3426 	 */
3427 	vsw_free_lane_resources(ldcp, INBOUND);
3428 	vsw_free_lane_resources(ldcp, OUTBOUND);
3429
3430 	/*
3431 	 * If the close fails we are in serious trouble, as we won't
3432 	 * be able to delete the parent port.
3433 	 */
3434 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
3435 		DERR(vswp, "%s: error %d closing channel %lld",
3436 		    __func__, rv, ldcp->ldc_id);
3437 		return (1);
3438 	}
3439
3440 	(void) ldc_fini(ldcp->ldc_handle);
3441
3442 	ldcp->ldc_status = LDC_INIT;
3443 	ldcp->ldc_handle = NULL;
3444 	ldcp->ldc_vswp = NULL;
3445
3446 	if (ldcp->rxh != NULL) {
3447 		if (vio_destroy_mblks(ldcp->rxh)) {
3448 			/*
3449 			 * Most likely some mblks are still in use and
3450 			 * have not been returned to the pool. Add the pool
3451 			 * to the list maintained in the device instance.
3452 			 * Another attempt will be made to destroy the pool
3453 			 * when the device detaches.
3454 			 */
3455 			ldcp->rxh->nextp = vswp->rxh;
3456 			vswp->rxh = ldcp->rxh;
3457 		}
3458 	}
3459
3460 	/* unlink it from the list */
3461 	*prev_ldcp = ldcp->ldc_next;
3462 	ldcl->num_ldcs--;
3463
3464 	mutex_destroy(&ldcp->ldc_txlock);
3465 	mutex_destroy(&ldcp->ldc_cblock);
3466 	cv_destroy(&ldcp->drain_cv);
3467 	mutex_destroy(&ldcp->drain_cv_lock);
3468 	mutex_destroy(&ldcp->hss_lock);
3469 	mutex_destroy(&ldcp->lane_in.seq_lock);
3470 	mutex_destroy(&ldcp->lane_out.seq_lock);
3471 	mutex_destroy(&ldcp->status_lock);
3472 	rw_destroy(&ldcp->lane_in.dlistrw);
3473 	rw_destroy(&ldcp->lane_out.dlistrw);
3474
3475 	kmem_free(ldcp, sizeof (vsw_ldc_t));
3476
3477 	return (0);
3478 }
3479
3480 /*
3481  * Open and attempt to bring up the channel. Note that channel
3482  * can only be brought up if peer has also opened channel.
3483  *
3484  * Returns 0 if we can open and bring up the channel, otherwise
3485  * returns 1.
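 *
 * A failure here is not treated as fatal by the caller;
 * vsw_init_ldcs() simply moves on to the next channel and the
 * handshake is retried when the peer's end point eventually
 * comes up.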
3486 */ 3487 static int 3488 vsw_ldc_init(vsw_ldc_t *ldcp) 3489 { 3490 vsw_t *vswp = ldcp->ldc_vswp; 3491 ldc_status_t istatus = 0; 3492 int rv; 3493 3494 D1(vswp, "%s: enter", __func__); 3495 3496 LDC_ENTER_LOCK(ldcp); 3497 3498 /* don't start at 0 in case clients don't like that */ 3499 ldcp->next_ident = 1; 3500 3501 rv = ldc_open(ldcp->ldc_handle); 3502 if (rv != 0) { 3503 DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)", 3504 __func__, ldcp->ldc_id, rv); 3505 LDC_EXIT_LOCK(ldcp); 3506 return (1); 3507 } 3508 3509 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 3510 DERR(vswp, "%s: unable to get status", __func__); 3511 LDC_EXIT_LOCK(ldcp); 3512 return (1); 3513 3514 } else if (istatus != LDC_OPEN && istatus != LDC_READY) { 3515 DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY", 3516 __func__, ldcp->ldc_id, istatus); 3517 LDC_EXIT_LOCK(ldcp); 3518 return (1); 3519 } 3520 3521 mutex_enter(&ldcp->status_lock); 3522 ldcp->ldc_status = istatus; 3523 mutex_exit(&ldcp->status_lock); 3524 3525 rv = ldc_up(ldcp->ldc_handle); 3526 if (rv != 0) { 3527 /* 3528 * Not a fatal error for ldc_up() to fail, as peer 3529 * end point may simply not be ready yet. 3530 */ 3531 D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__, 3532 ldcp->ldc_id, rv); 3533 LDC_EXIT_LOCK(ldcp); 3534 return (1); 3535 } 3536 3537 /* 3538 * ldc_up() call is non-blocking so need to explicitly 3539 * check channel status to see if in fact the channel 3540 * is UP. 3541 */ 3542 mutex_enter(&ldcp->status_lock); 3543 if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) { 3544 DERR(vswp, "%s: unable to get status", __func__); 3545 mutex_exit(&ldcp->status_lock); 3546 LDC_EXIT_LOCK(ldcp); 3547 return (1); 3548 3549 } 3550 3551 if (ldcp->ldc_status == LDC_UP) { 3552 D2(vswp, "%s: channel %ld now UP (%ld)", __func__, 3553 ldcp->ldc_id, istatus); 3554 mutex_exit(&ldcp->status_lock); 3555 LDC_EXIT_LOCK(ldcp); 3556 3557 vsw_process_conn_evt(ldcp, VSW_CONN_UP); 3558 return (0); 3559 } 3560 3561 mutex_exit(&ldcp->status_lock); 3562 LDC_EXIT_LOCK(ldcp); 3563 3564 D1(vswp, "%s: exit", __func__); 3565 return (0); 3566 } 3567 3568 /* disable callbacks on the channel */ 3569 static int 3570 vsw_ldc_uninit(vsw_ldc_t *ldcp) 3571 { 3572 vsw_t *vswp = ldcp->ldc_vswp; 3573 int rv; 3574 3575 D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id); 3576 3577 LDC_ENTER_LOCK(ldcp); 3578 3579 rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE); 3580 if (rv != 0) { 3581 DERR(vswp, "vsw_ldc_uninit(%lld): error disabling " 3582 "interrupts (rv = %d)\n", ldcp->ldc_id, rv); 3583 LDC_EXIT_LOCK(ldcp); 3584 return (1); 3585 } 3586 3587 mutex_enter(&ldcp->status_lock); 3588 ldcp->ldc_status = LDC_INIT; 3589 mutex_exit(&ldcp->status_lock); 3590 3591 LDC_EXIT_LOCK(ldcp); 3592 3593 D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id); 3594 3595 return (0); 3596 } 3597 3598 static int 3599 vsw_init_ldcs(vsw_port_t *port) 3600 { 3601 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3602 vsw_ldc_t *ldcp; 3603 3604 READ_ENTER(&ldcl->lockrw); 3605 ldcp = ldcl->head; 3606 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 3607 (void) vsw_ldc_init(ldcp); 3608 } 3609 RW_EXIT(&ldcl->lockrw); 3610 3611 return (0); 3612 } 3613 3614 static int 3615 vsw_uninit_ldcs(vsw_port_t *port) 3616 { 3617 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3618 vsw_ldc_t *ldcp; 3619 3620 D1(NULL, "vsw_uninit_ldcs: enter\n"); 3621 3622 READ_ENTER(&ldcl->lockrw); 3623 ldcp = ldcl->head; 3624 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 3625 (void) vsw_ldc_uninit(ldcp); 3626 } 3627 
RW_EXIT(&ldcl->lockrw);
3628
3629 	D1(NULL, "vsw_uninit_ldcs: exit\n");
3630
3631 	return (0);
3632 }
3633
3634 /*
3635  * Wait until the callback(s) associated with the ldcs under the specified
3636  * port have completed.
3637  *
3638  * Prior to this function being invoked each channel under this port
3639  * should have been quiesced via ldc_set_cb_mode(DISABLE).
3640  *
3641  * A short explanation of what we are doing below..
3642  *
3643  * The simplest approach would be to have a reference counter in
3644  * the ldc structure which is incremented/decremented by the callbacks as
3645  * they use the channel. The drain function could then simply disable any
3646  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
3647  * there is a tiny window here - before the callback is able to get the lock
3648  * on the channel it is interrupted and this function gets to execute. It
3649  * sees that the ref count is zero and believes it is free to delete the
3650  * associated data structures.
3651  *
3652  * We get around this by taking advantage of the fact that before the ldc
3653  * framework invokes a callback it sets a flag to indicate that there is a
3654  * callback active (or about to become active). If we attempt to
3655  * unregister a callback while this active flag is set then the unregister
3656  * will fail with EWOULDBLOCK.
3657  *
3658  * If the unregister fails we do a cv_timedwait. We will either be signaled
3659  * by the callback as it is exiting (note we have to wait a short period to
3660  * allow the callback to return fully to the ldc framework and it to clear
3661  * the active flag), or by the timer expiring. In either case we again attempt
3662  * the unregister. We repeat this until we can successfully unregister the
3663  * callback.
3664  *
3665  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
3666  * the case where the callback has finished but the ldc framework has not yet
3667  * cleared the active flag. In this case we would never get a cv_signal.
3668  */
3669 static int
3670 vsw_drain_ldcs(vsw_port_t *port)
3671 {
3672 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3673 	vsw_ldc_t	*ldcp;
3674 	vsw_t		*vswp = port->p_vswp;
3675
3676 	D1(vswp, "%s: enter", __func__);
3677
3678 	READ_ENTER(&ldcl->lockrw);
3679
3680 	ldcp = ldcl->head;
3681
3682 	for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
3683 		/*
3684 		 * If we can unregister the channel callback then we
3685 		 * know that there is no callback either running or
3686 		 * scheduled to run for this channel so move on to next
3687 		 * channel in the list.
3688 		 */
3689 		mutex_enter(&ldcp->drain_cv_lock);
3690
3691 		/* prompt active callbacks to quit */
3692 		ldcp->drain_state = VSW_LDC_DRAINING;
3693
3694 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
3695 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
3696 			    ldcp->ldc_id);
3697 			mutex_exit(&ldcp->drain_cv_lock);
3698 			continue;
3699 		} else {
3700 			/*
3701 			 * If we end up here we know that either 1) a callback
3702 			 * is currently executing, 2) is about to start (i.e.
3703 			 * the ldc framework has set the active flag but
3704 			 * has not actually invoked the callback yet), or 3)
3705 			 * has finished and has returned to the ldc framework
3706 			 * but the ldc framework has not yet cleared the
3707 			 * active bit.
3708 			 *
3709 			 * Wait for it to finish.
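			 *
			 * In other words the wait amounts to (a sketch of
			 * the code that follows):
			 *
			 *	while (ldc_unreg_callback(hdl) == EWOULDBLOCK)
			 *		cv_timedwait(..., lbolt + hz);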
3710 */ 3711 while (ldc_unreg_callback(ldcp->ldc_handle) 3712 == EWOULDBLOCK) 3713 (void) cv_timedwait(&ldcp->drain_cv, 3714 &ldcp->drain_cv_lock, lbolt + hz); 3715 3716 mutex_exit(&ldcp->drain_cv_lock); 3717 D2(vswp, "%s: unreg callback for chan %ld after " 3718 "timeout", __func__, ldcp->ldc_id); 3719 } 3720 } 3721 RW_EXIT(&ldcl->lockrw); 3722 3723 D1(vswp, "%s: exit", __func__); 3724 return (0); 3725 } 3726 3727 /* 3728 * Wait until all tasks which reference this port have completed. 3729 * 3730 * Prior to this function being invoked each channel under this port 3731 * should have been quiesced via ldc_set_cb_mode(DISABLE). 3732 */ 3733 static int 3734 vsw_drain_port_taskq(vsw_port_t *port) 3735 { 3736 vsw_t *vswp = port->p_vswp; 3737 3738 D1(vswp, "%s: enter", __func__); 3739 3740 /* 3741 * Mark the port as in the process of being detached, and 3742 * dispatch a marker task to the queue so we know when all 3743 * relevant tasks have completed. 3744 */ 3745 mutex_enter(&port->state_lock); 3746 port->state = VSW_PORT_DETACHING; 3747 3748 if ((vswp->taskq_p == NULL) || 3749 (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task, 3750 port, DDI_NOSLEEP) != DDI_SUCCESS)) { 3751 DERR(vswp, "%s: unable to dispatch marker task", 3752 __func__); 3753 mutex_exit(&port->state_lock); 3754 return (1); 3755 } 3756 3757 /* 3758 * Wait for the marker task to finish. 3759 */ 3760 while (port->state != VSW_PORT_DETACHABLE) 3761 cv_wait(&port->state_cv, &port->state_lock); 3762 3763 mutex_exit(&port->state_lock); 3764 3765 D1(vswp, "%s: exit", __func__); 3766 3767 return (0); 3768 } 3769 3770 static void 3771 vsw_marker_task(void *arg) 3772 { 3773 vsw_port_t *port = arg; 3774 vsw_t *vswp = port->p_vswp; 3775 3776 D1(vswp, "%s: enter", __func__); 3777 3778 mutex_enter(&port->state_lock); 3779 3780 /* 3781 * No further tasks should be dispatched which reference 3782 * this port so ok to mark it as safe to detach. 3783 */ 3784 port->state = VSW_PORT_DETACHABLE; 3785 3786 cv_signal(&port->state_cv); 3787 3788 mutex_exit(&port->state_lock); 3789 3790 D1(vswp, "%s: exit", __func__); 3791 } 3792 3793 static vsw_port_t * 3794 vsw_lookup_port(vsw_t *vswp, int p_instance) 3795 { 3796 vsw_port_list_t *plist = &vswp->plist; 3797 vsw_port_t *port; 3798 3799 for (port = plist->head; port != NULL; port = port->p_next) { 3800 if (port->p_instance == p_instance) { 3801 D2(vswp, "vsw_lookup_port: found p_instance\n"); 3802 return (port); 3803 } 3804 } 3805 3806 return (NULL); 3807 } 3808 3809 /* 3810 * Search for and remove the specified port from the port 3811 * list. Returns 0 if able to locate and remove port, otherwise 3812 * returns 1. 3813 */ 3814 static int 3815 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port) 3816 { 3817 vsw_port_list_t *plist = &vswp->plist; 3818 vsw_port_t *curr_p, *prev_p; 3819 3820 if (plist->head == NULL) 3821 return (1); 3822 3823 curr_p = prev_p = plist->head; 3824 3825 while (curr_p != NULL) { 3826 if (curr_p == port) { 3827 if (prev_p == curr_p) { 3828 plist->head = curr_p->p_next; 3829 } else { 3830 prev_p->p_next = curr_p->p_next; 3831 } 3832 plist->num_ports--; 3833 break; 3834 } else { 3835 prev_p = curr_p; 3836 curr_p = curr_p->p_next; 3837 } 3838 } 3839 return (0); 3840 } 3841 3842 /* 3843 * Interrupt handler for ldc messages. 
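 *
 * A single invocation may carry several events; they are handled in
 * the following order:
 *
 *	LDC_EVT_UP		-> vsw_process_conn_evt(VSW_CONN_UP)
 *	LDC_EVT_READ		-> vsw_process_pkt() (then return)
 *	LDC_EVT_DOWN/RESET	-> vsw_process_conn_evt(VSW_CONN_RESET)
 *
 * Anything else (e.g. LDC_EVT_WRITE) is unexpected and just logged.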
3844 */ 3845 static uint_t 3846 vsw_ldc_cb(uint64_t event, caddr_t arg) 3847 { 3848 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; 3849 vsw_t *vswp = ldcp->ldc_vswp; 3850 3851 D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); 3852 3853 mutex_enter(&ldcp->ldc_cblock); 3854 3855 mutex_enter(&ldcp->status_lock); 3856 if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) { 3857 mutex_exit(&ldcp->status_lock); 3858 mutex_exit(&ldcp->ldc_cblock); 3859 return (LDC_SUCCESS); 3860 } 3861 mutex_exit(&ldcp->status_lock); 3862 3863 if (event & LDC_EVT_UP) { 3864 /* 3865 * Channel has come up. 3866 */ 3867 D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)", 3868 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 3869 3870 vsw_process_conn_evt(ldcp, VSW_CONN_UP); 3871 3872 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); 3873 } 3874 3875 if (event & LDC_EVT_READ) { 3876 /* 3877 * Data available for reading. 3878 */ 3879 D2(vswp, "%s: id(ld) event(%llx) data READ", 3880 __func__, ldcp->ldc_id, event); 3881 3882 vsw_process_pkt(ldcp); 3883 3884 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); 3885 3886 goto vsw_cb_exit; 3887 } 3888 3889 if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) { 3890 D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)", 3891 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 3892 3893 vsw_process_conn_evt(ldcp, VSW_CONN_RESET); 3894 } 3895 3896 /* 3897 * Catch either LDC_EVT_WRITE which we don't support or any 3898 * unknown event. 3899 */ 3900 if (event & ~(LDC_EVT_UP | LDC_EVT_RESET 3901 | LDC_EVT_DOWN | LDC_EVT_READ)) { 3902 3903 DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)", 3904 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 3905 } 3906 3907 vsw_cb_exit: 3908 mutex_exit(&ldcp->ldc_cblock); 3909 3910 /* 3911 * Let the drain function know we are finishing if it 3912 * is waiting. 3913 */ 3914 mutex_enter(&ldcp->drain_cv_lock); 3915 if (ldcp->drain_state == VSW_LDC_DRAINING) 3916 cv_signal(&ldcp->drain_cv); 3917 mutex_exit(&ldcp->drain_cv_lock); 3918 3919 return (LDC_SUCCESS); 3920 } 3921 3922 /* 3923 * Reinitialise data structures associated with the channel. 3924 */ 3925 static void 3926 vsw_ldc_reinit(vsw_ldc_t *ldcp) 3927 { 3928 vsw_t *vswp = ldcp->ldc_vswp; 3929 vsw_port_t *port; 3930 vsw_ldc_list_t *ldcl; 3931 3932 D1(vswp, "%s: enter", __func__); 3933 3934 port = ldcp->ldc_port; 3935 ldcl = &port->p_ldclist; 3936 3937 READ_ENTER(&ldcl->lockrw); 3938 3939 D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__, 3940 ldcp->lane_in.lstate, ldcp->lane_out.lstate); 3941 3942 vsw_free_lane_resources(ldcp, INBOUND); 3943 vsw_free_lane_resources(ldcp, OUTBOUND); 3944 RW_EXIT(&ldcl->lockrw); 3945 3946 ldcp->lane_in.lstate = 0; 3947 ldcp->lane_out.lstate = 0; 3948 3949 /* 3950 * Remove parent port from any multicast groups 3951 * it may have registered with. Client must resend 3952 * multicast add command after handshake completes. 3953 */ 3954 (void) vsw_del_fdb(vswp, port); 3955 3956 vsw_del_mcst_port(port); 3957 3958 ldcp->peer_session = 0; 3959 ldcp->session_status = 0; 3960 ldcp->hcnt = 0; 3961 ldcp->hphase = VSW_MILESTONE0; 3962 3963 D1(vswp, "%s: exit", __func__); 3964 } 3965 3966 /* 3967 * Process a connection event. 3968 * 3969 * Note - care must be taken to ensure that this function is 3970 * not called with the dlistrw lock held. 
3971  */
3972 static void
3973 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
3974 {
3975 	vsw_t		*vswp = ldcp->ldc_vswp;
3976 	vsw_conn_evt_t	*conn = NULL;
3977
3978 	D1(vswp, "%s: enter", __func__);
3979
3980 	/*
3981 	 * Check if either a reset or restart event is pending
3982 	 * or in progress. If so just return.
3983 	 *
3984 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
3985 	 * being received by the callback handler, or an ECONNRESET error
3986 	 * code being returned from a ldc_read() or ldc_write() call.
3987 	 *
3988 	 * A VSW_CONN_RESTART event occurs when some error checking code
3989 	 * decides that there is a problem with data from the channel,
3990 	 * and that the handshake should be restarted.
3991 	 */
3992 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
3993 	    (ldstub((uint8_t *)&ldcp->reset_active)))
3994 		return;
3995
3996 	/*
3997 	 * If it is an LDC_UP event we first check the recorded
3998 	 * state of the channel. If this is UP then we know that
3999 	 * the channel moving to the UP state has already been dealt
4000 	 * with and don't need to dispatch a new task.
4001 	 *
4002 	 * The reason for this check is that when we do a ldc_up(),
4003 	 * depending on the state of the peer, we may or may not get
4004 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
4005 	 * every time we do ldc_up() we explicitly check the channel
4006 	 * status to see whether it has come up (ldc_up() is asynch and
4007 	 * will complete at some undefined time), and take the appropriate
4008 	 * action.
4009 	 *
4010 	 * The flip side of this is that we may get a LDC_UP event
4011 	 * when we have already seen that the channel is up and have
4012 	 * dealt with that.
4013 	 */
4014 	mutex_enter(&ldcp->status_lock);
4015 	if (evt == VSW_CONN_UP) {
4016 		if ((ldcp->ldc_status == LDC_UP) ||
4017 		    (ldcp->reset_active != 0)) {
4018 			mutex_exit(&ldcp->status_lock);
4019 			return;
4020 		}
4021 	}
4022 	mutex_exit(&ldcp->status_lock);
4023
4024 	/*
4025 	 * The transaction group id allows us to identify and discard
4026 	 * any tasks which are still pending on the taskq and refer
4027 	 * to the handshake session we are about to restart or reset.
4028 	 * These stale messages no longer have any real meaning.
4029 	 */
4030 	mutex_enter(&ldcp->hss_lock);
4031 	ldcp->hss_id++;
4032 	mutex_exit(&ldcp->hss_lock);
4033
4034 	ASSERT(vswp->taskq_p != NULL);
4035
4036 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
4037 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
4038 		    " connection event", vswp->instance);
4039 		goto err_exit;
4040 	}
4041
4042 	conn->evt = evt;
4043 	conn->ldcp = ldcp;
4044
4045 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
4046 	    DDI_NOSLEEP) != DDI_SUCCESS) {
4047 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
4048 		    vswp->instance);
4049
4050 		kmem_free(conn, sizeof (vsw_conn_evt_t));
4051 		goto err_exit;
4052 	}
4053
4054 	D1(vswp, "%s: exit", __func__);
4055 	return;
4056
4057 err_exit:
4058 	/*
4059 	 * We have most likely failed due to a memory shortage. Clear the
4060 	 * flag so that future requests will at least be attempted and
4061 	 * will hopefully succeed.
4062 	 */
4063 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
4064 		ldcp->reset_active = 0;
4065 }
4066
4067 /*
4068  * Deal with events relating to a connection. Invoked from a taskq.
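 *
 * In outline:
 *
 *	if (evt == VSW_CONN_RESTART && channel is UP)
 *		ldc_down()		- flush the underlying ldc queue
 *	vsw_ldc_reinit()		- reset handshake state
 *	ldc_up()			- harmless if already up
 *	if (channel is now UP)
 *		dispatch vsw_send_ver	- restart the handshake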
4069  */
4070 static void
4071 vsw_conn_task(void *arg)
4072 {
4073 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
4074 	vsw_ldc_t	*ldcp = NULL;
4075 	vsw_t		*vswp = NULL;
4076 	uint16_t	evt;
4077 	ldc_status_t	curr_status;
4078
4079 	ldcp = conn->ldcp;
4080 	evt = conn->evt;
4081 	vswp = ldcp->ldc_vswp;
4082
4083 	D1(vswp, "%s: enter", __func__);
4084
4085 	/* can safely free now we have copied out the data */
4086 	kmem_free(conn, sizeof (vsw_conn_evt_t));
4087
4088 	mutex_enter(&ldcp->status_lock);
4089 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
4090 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
4091 		    "channel %ld", vswp->instance, ldcp->ldc_id);
4092 		mutex_exit(&ldcp->status_lock);
4093 		return;
4094 	}
4095
4096 	/*
4097 	 * If we wish to restart the handshake on this channel, then if
4098 	 * the channel is UP we bring it DOWN to flush the underlying
4099 	 * ldc queue.
4100 	 */
4101 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
4102 		(void) ldc_down(ldcp->ldc_handle);
4103
4104 	/*
4105 	 * re-init all the associated data structures.
4106 	 */
4107 	vsw_ldc_reinit(ldcp);
4108
4109 	/*
4110 	 * Bring the channel back up (note it does no harm to
4111 	 * do this even if the channel is already UP, it just
4112 	 * becomes effectively a no-op).
4113 	 */
4114 	(void) ldc_up(ldcp->ldc_handle);
4115
4116 	/*
4117 	 * Check if channel is now UP. This will only happen if
4118 	 * peer has also done a ldc_up().
4119 	 */
4120 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
4121 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
4122 		    "channel %ld", vswp->instance, ldcp->ldc_id);
4123 		mutex_exit(&ldcp->status_lock);
4124 		return;
4125 	}
4126
4127 	ldcp->ldc_status = curr_status;
4128
4129 	/* channel UP so restart handshake by sending version info */
4130 	if (curr_status == LDC_UP) {
4131 		if (ldcp->hcnt++ > vsw_num_handshakes) {
4132 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
4133 			    " handshake attempts (%d) on channel %ld",
4134 			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
4135 			mutex_exit(&ldcp->status_lock);
4136 			return;
4137 		}
4138
4139 		if (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
4140 		    DDI_NOSLEEP) != DDI_SUCCESS) {
4141 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
4142 			    vswp->instance);
4143
4144 			/*
4145 			 * Don't count as valid restart attempt if couldn't
4146 			 * send version msg.
4147 			 */
4148 			if (ldcp->hcnt > 0)
4149 				ldcp->hcnt--;
4150 		}
4151 	}
4152
4153 	/*
4154 	 * Mark that the process is complete by clearing the flag.
4155 	 *
4156 	 * Note it is possible that the taskq dispatch above may have
4157 	 * failed, most likely due to memory shortage. We still clear the
4158 	 * flag so that future attempts will at least be made and will
4159 	 * hopefully succeed.
4160 	 */
4161 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
4162 		ldcp->reset_active = 0;
4163
4164 	mutex_exit(&ldcp->status_lock);
4165
4166 	D1(vswp, "%s: exit", __func__);
4167 }
4168
4169 /*
4170  * Returns 0 if it was legal for the event signified by flag to have
4171  * occurred at the time it did. Otherwise returns 1.
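 *
 * For example, a VER_ACK (VSW_VER_ACK_RECV) is only legal while a
 * VER_INFO of ours is outstanding, i.e. while VSW_VER_INFO_SENT is
 * still set in the relevant lane state; receiving one at any other
 * time is treated as a protocol error and restarts the handshake.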
4172 */ 4173 int 4174 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag) 4175 { 4176 vsw_t *vswp = ldcp->ldc_vswp; 4177 uint64_t state; 4178 uint64_t phase; 4179 4180 if (dir == INBOUND) 4181 state = ldcp->lane_in.lstate; 4182 else 4183 state = ldcp->lane_out.lstate; 4184 4185 phase = ldcp->hphase; 4186 4187 switch (flag) { 4188 case VSW_VER_INFO_RECV: 4189 if (phase > VSW_MILESTONE0) { 4190 DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV" 4191 " when in state %d\n", ldcp->ldc_id, phase); 4192 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4193 return (1); 4194 } 4195 break; 4196 4197 case VSW_VER_ACK_RECV: 4198 case VSW_VER_NACK_RECV: 4199 if (!(state & VSW_VER_INFO_SENT)) { 4200 DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK" 4201 " or VER_NACK when in state %d\n", 4202 ldcp->ldc_id, phase); 4203 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4204 return (1); 4205 } else 4206 state &= ~VSW_VER_INFO_SENT; 4207 break; 4208 4209 case VSW_ATTR_INFO_RECV: 4210 if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) { 4211 DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV" 4212 " when in state %d\n", ldcp->ldc_id, phase); 4213 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4214 return (1); 4215 } 4216 break; 4217 4218 case VSW_ATTR_ACK_RECV: 4219 case VSW_ATTR_NACK_RECV: 4220 if (!(state & VSW_ATTR_INFO_SENT)) { 4221 DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK" 4222 " or ATTR_NACK when in state %d\n", 4223 ldcp->ldc_id, phase); 4224 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4225 return (1); 4226 } else 4227 state &= ~VSW_ATTR_INFO_SENT; 4228 break; 4229 4230 case VSW_DRING_INFO_RECV: 4231 if (phase < VSW_MILESTONE1) { 4232 DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV" 4233 " when in state %d\n", ldcp->ldc_id, phase); 4234 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4235 return (1); 4236 } 4237 break; 4238 4239 case VSW_DRING_ACK_RECV: 4240 case VSW_DRING_NACK_RECV: 4241 if (!(state & VSW_DRING_INFO_SENT)) { 4242 DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK" 4243 " or DRING_NACK when in state %d\n", 4244 ldcp->ldc_id, phase); 4245 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4246 return (1); 4247 } else 4248 state &= ~VSW_DRING_INFO_SENT; 4249 break; 4250 4251 case VSW_RDX_INFO_RECV: 4252 if (phase < VSW_MILESTONE3) { 4253 DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV" 4254 " when in state %d\n", ldcp->ldc_id, phase); 4255 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4256 return (1); 4257 } 4258 break; 4259 4260 case VSW_RDX_ACK_RECV: 4261 case VSW_RDX_NACK_RECV: 4262 if (!(state & VSW_RDX_INFO_SENT)) { 4263 DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK" 4264 " or RDX_NACK when in state %d\n", 4265 ldcp->ldc_id, phase); 4266 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4267 return (1); 4268 } else 4269 state &= ~VSW_RDX_INFO_SENT; 4270 break; 4271 4272 case VSW_MCST_INFO_RECV: 4273 if (phase < VSW_MILESTONE3) { 4274 DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV" 4275 " when in state %d\n", ldcp->ldc_id, phase); 4276 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4277 return (1); 4278 } 4279 break; 4280 4281 default: 4282 DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)", 4283 ldcp->ldc_id, flag); 4284 return (1); 4285 } 4286 4287 if (dir == INBOUND) 4288 ldcp->lane_in.lstate = state; 4289 else 4290 ldcp->lane_out.lstate = state; 4291 4292 D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id); 4293 4294 return (0); 4295 } 4296 4297 void 4298 vsw_next_milestone(vsw_ldc_t *ldcp) 4299 { 4300 vsw_t *vswp = ldcp->ldc_vswp; 4301 4302 D1(vswp, "%s 
(chan %lld): enter (phase %ld)", __func__,
4303 	    ldcp->ldc_id, ldcp->hphase);
4304
4305 	DUMP_FLAGS(ldcp->lane_in.lstate);
4306 	DUMP_FLAGS(ldcp->lane_out.lstate);
4307
4308 	switch (ldcp->hphase) {
4309
4310 	case VSW_MILESTONE0:
4311 		/*
4312 		 * If we haven't started to handshake with our peer,
4313 		 * start to do so now.
4314 		 */
4315 		if (ldcp->lane_out.lstate == 0) {
4316 			D2(vswp, "%s: (chan %lld) starting handshake "
4317 			    "with peer", __func__, ldcp->ldc_id);
4318 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
4319 		}
4320
4321 		/*
4322 		 * Only way to pass this milestone is to have successfully
4323 		 * negotiated version info.
4324 		 */
4325 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
4326 		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
4327
4328 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
4329 			    __func__, ldcp->ldc_id);
4330
4331 			/*
4332 			 * Next milestone is passed when attribute
4333 			 * information has been successfully exchanged.
4334 			 */
4335 			ldcp->hphase = VSW_MILESTONE1;
4336 			vsw_send_attr(ldcp);
4337
4338 		}
4339 		break;
4340
4341 	case VSW_MILESTONE1:
4342 		/*
4343 		 * Only way to pass this milestone is to have successfully
4344 		 * negotiated attribute information.
4345 		 */
4346 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
4347
4348 			ldcp->hphase = VSW_MILESTONE2;
4349
4350 			/*
4351 			 * If the peer device has said it wishes to
4352 			 * use descriptor rings then we send it our ring
4353 			 * info, otherwise we just set up a private ring
4354 			 * backed by an internal buffer.
4355 			 */
4356 			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
4357 				vsw_send_dring_info(ldcp);
4358 		}
4359 		break;
4360
4361 	case VSW_MILESTONE2:
4362 		/*
4363 		 * If peer has indicated in its attribute message that
4364 		 * it wishes to use descriptor rings then the only way
4365 		 * to pass this milestone is for us to have received
4366 		 * valid dring info.
4367 		 *
4368 		 * If peer is not using descriptor rings then just fall
4369 		 * through.
4370 		 */
4371 		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
4372 		    (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
4373 			break;
4374
4375 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
4376 		    __func__, ldcp->ldc_id);
4377
4378 		ldcp->hphase = VSW_MILESTONE3;
4379 		vsw_send_rdx(ldcp);
4380 		break;
4381
4382 	case VSW_MILESTONE3:
4383 		/*
4384 		 * Pass this milestone when all parameters have been
4385 		 * successfully exchanged and RDX sent in both directions.
4386 		 *
4387 		 * Mark outbound lane as available to transmit data.
4388 		 */
4389 		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
4390 		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
4391
4392 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
4393 			    __func__, ldcp->ldc_id);
4394 			D2(vswp, "%s: ** handshake complete (0x%llx : "
4395 			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
4396 			    ldcp->lane_out.lstate);
4397 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
4398 			ldcp->hphase = VSW_MILESTONE4;
4399 			ldcp->hcnt = 0;
4400 			DISPLAY_STATE();
4401 		} else {
4402 			D2(vswp, "%s: still in milestone 3 (0x%llx :"
4403 			    " 0x%llx)", __func__, ldcp->lane_in.lstate,
4404 			    ldcp->lane_out.lstate);
4405 		}
4406 		break;
4407
4408 	case VSW_MILESTONE4:
4409 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
4410 		    ldcp->ldc_id);
4411 		break;
4412
4413 	default:
4414 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
4415 		    ldcp->ldc_id, ldcp->hphase);
4416 	}
4417
4418 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
4419 	    ldcp->hphase);
4420 }
4421
4422 /*
4423  * Check if major version is supported.
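 *
 * Note the search below appears to rely on the vsw_versions[] array
 * being sorted in decreasing order of major number, so that the first
 * entry with a smaller major is the next lowest version we can offer.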
4424  *
4425  * Returns 0 if it finds a supported major number, and if necessary
4426  * adjusts the minor field.
4427  *
4428  * Returns 1 if it can't match the major number exactly. Sets major/minor
4429  * to the next lowest supported values, or to zero if no other values are
4430  * possible.
4431  */
4431 static int
4432 vsw_supported_version(vio_ver_msg_t *vp)
4433 {
4434 	int	i;
4435
4436 	D1(NULL, "vsw_supported_version: enter");
4437
4438 	for (i = 0; i < VSW_NUM_VER; i++) {
4439 		if (vsw_versions[i].ver_major == vp->ver_major) {
4440 			/*
4441 			 * Matching major version found. Update
4442 			 * minor number if necessary.
4443 			 */
4444 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
4445 				D2(NULL, "%s: adjusting minor value"
4446 				    " from %d to %d", __func__,
4447 				    vp->ver_minor,
4448 				    vsw_versions[i].ver_minor);
4449 				vp->ver_minor = vsw_versions[i].ver_minor;
4450 			}
4451
4452 			return (0);
4453 		}
4454
4455 		if (vsw_versions[i].ver_major < vp->ver_major) {
			/* offer the next lowest major version we support */
			vp->ver_major = vsw_versions[i].ver_major;
4456 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
4457 				D2(NULL, "%s: adjusting minor value"
4458 				    " from %d to %d", __func__,
4459 				    vp->ver_minor,
4460 				    vsw_versions[i].ver_minor);
4461 				vp->ver_minor = vsw_versions[i].ver_minor;
4462 			}
4463 			return (1);
4464 		}
4465 	}
4466
4467 	/* No match was possible, zero out fields */
4468 	vp->ver_major = 0;
4469 	vp->ver_minor = 0;
4470
4471 	D1(NULL, "vsw_supported_version: exit");
4472
4473 	return (1);
4474 }
4475
4476 /*
4477  * Main routine for processing messages received over LDC.
4478  */
4479 static void
4480 vsw_process_pkt(void *arg)
4481 {
4482 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4483 	vsw_t		*vswp = ldcp->ldc_vswp;
4484 	size_t		msglen;
4485 	vio_msg_tag_t	tag;
4486 	def_msg_t	dmsg;
4487 	int		rv = 0;
4488
4489
4490 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
4491
4492 	/*
4493 	 * If channel is up read messages until channel is empty.
4494 	 */
4495 	do {
4496 		msglen = sizeof (dmsg);
4497 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);
4498
4499 		if (rv != 0) {
4500 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) "
4501 			    "len(%d)\n", __func__, ldcp->ldc_id,
4502 			    rv, msglen);
4503 		}
4504
4505 		/* channel has been reset */
4506 		if (rv == ECONNRESET) {
4507 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4508 			break;
4509 		}
4510
4511 		if (msglen == 0) {
4512 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
4513 			    ldcp->ldc_id);
4514 			break;
4515 		}
4516
4517 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
4518 		    ldcp->ldc_id, msglen);
4519
4520 		/*
4521 		 * Figure out what sort of packet we have gotten by
4522 		 * examining the msg tag, and then switch it appropriately.
4523 		 */
4524 		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));
4525
4526 		switch (tag.vio_msgtype) {
4527 		case VIO_TYPE_CTRL:
4528 			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
4529 			break;
4530 		case VIO_TYPE_DATA:
4531 			vsw_process_data_pkt(ldcp, &dmsg, tag);
4532 			break;
4533 		case VIO_TYPE_ERR:
4534 			vsw_process_err_pkt(ldcp, &dmsg, tag);
4535 			break;
4536 		default:
4537 			DERR(vswp, "%s: Unknown tag(%lx) id(%lx)\n",
4538 			    __func__, tag.vio_msgtype, ldcp->ldc_id);
4539 			break;
4540 		}
4541 	} while (msglen);
4542
4543 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
4544 }
4545
4546 /*
4547  * Dispatch a task to process a VIO control message.
4548 */ 4549 static void 4550 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag) 4551 { 4552 vsw_ctrl_task_t *ctaskp = NULL; 4553 vsw_port_t *port = ldcp->ldc_port; 4554 vsw_t *vswp = port->p_vswp; 4555 4556 D1(vswp, "%s: enter", __func__); 4557 4558 /* 4559 * We need to handle RDX ACK messages in-band as once they 4560 * are exchanged it is possible that we will get an 4561 * immediate (legitimate) data packet. 4562 */ 4563 if ((tag.vio_subtype_env == VIO_RDX) && 4564 (tag.vio_subtype == VIO_SUBTYPE_ACK)) { 4565 4566 if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV)) 4567 return; 4568 4569 ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV; 4570 D2(vswp, "%s (%ld) handling RDX_ACK in place " 4571 "(ostate 0x%llx : hphase %d)", __func__, 4572 ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase); 4573 vsw_next_milestone(ldcp); 4574 return; 4575 } 4576 4577 ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP); 4578 4579 if (ctaskp == NULL) { 4580 DERR(vswp, "%s: unable to alloc space for ctrl" 4581 " msg", __func__); 4582 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4583 return; 4584 } 4585 4586 ctaskp->ldcp = ldcp; 4587 bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t)); 4588 mutex_enter(&ldcp->hss_lock); 4589 ctaskp->hss_id = ldcp->hss_id; 4590 mutex_exit(&ldcp->hss_lock); 4591 4592 /* 4593 * Dispatch task to processing taskq if port is not in 4594 * the process of being detached. 4595 */ 4596 mutex_enter(&port->state_lock); 4597 if (port->state == VSW_PORT_INIT) { 4598 if ((vswp->taskq_p == NULL) || 4599 (ddi_taskq_dispatch(vswp->taskq_p, 4600 vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP) 4601 != DDI_SUCCESS)) { 4602 DERR(vswp, "%s: unable to dispatch task to taskq", 4603 __func__); 4604 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 4605 mutex_exit(&port->state_lock); 4606 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4607 return; 4608 } 4609 } else { 4610 DWARN(vswp, "%s: port %d detaching, not dispatching " 4611 "task", __func__, port->p_instance); 4612 } 4613 4614 mutex_exit(&port->state_lock); 4615 4616 D2(vswp, "%s: dispatched task to taskq for chan %d", __func__, 4617 ldcp->ldc_id); 4618 D1(vswp, "%s: exit", __func__); 4619 } 4620 4621 /* 4622 * Process a VIO ctrl message. Invoked from taskq. 4623 */ 4624 static void 4625 vsw_process_ctrl_pkt(void *arg) 4626 { 4627 vsw_ctrl_task_t *ctaskp = (vsw_ctrl_task_t *)arg; 4628 vsw_ldc_t *ldcp = ctaskp->ldcp; 4629 vsw_t *vswp = ldcp->ldc_vswp; 4630 vio_msg_tag_t tag; 4631 uint16_t env; 4632 4633 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4634 4635 bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t)); 4636 env = tag.vio_subtype_env; 4637 4638 /* stale pkt check */ 4639 mutex_enter(&ldcp->hss_lock); 4640 if (ctaskp->hss_id < ldcp->hss_id) { 4641 DWARN(vswp, "%s: discarding stale packet belonging to" 4642 " earlier (%ld) handshake session", __func__, 4643 ctaskp->hss_id); 4644 mutex_exit(&ldcp->hss_lock); 4645 return; 4646 } 4647 mutex_exit(&ldcp->hss_lock); 4648 4649 /* session id check */ 4650 if (ldcp->session_status & VSW_PEER_SESSION) { 4651 if (ldcp->peer_session != tag.vio_sid) { 4652 DERR(vswp, "%s (chan %d): invalid session id (%llx)", 4653 __func__, ldcp->ldc_id, tag.vio_sid); 4654 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 4655 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4656 return; 4657 } 4658 } 4659 4660 /* 4661 * Switch on vio_subtype envelope, then let lower routines 4662 * decide if its an INFO, ACK or NACK packet. 
4663 */
4664 	switch (env) {
4665 	case VIO_VER_INFO:
4666 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
4667 		break;
4668 	case VIO_DRING_REG:
4669 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
4670 		break;
4671 	case VIO_DRING_UNREG:
4672 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
4673 		break;
4674 	case VIO_ATTR_INFO:
4675 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
4676 		break;
4677 	case VNET_MCAST_INFO:
4678 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
4679 		break;
4680 	case VIO_RDX:
4681 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
4682 		break;
4683 	default:
4684 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
4685 		    __func__, env);
4686 	}
4687
4688 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
4689 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4690 }
4691
4692 /*
4693  * Version negotiation. We can end up here either because our peer
4694  * has responded to a handshake message we have sent it, or because our
4695  * peer has initiated a handshake with us. If it's the former then the msg
4696  * can only be an ACK or NACK; if it's the latter it can only be an INFO.
4697  *
4698  * If it's an ACK we move to the next stage of the handshake, namely
4699  * attribute exchange. If it's a NACK we see if we can specify another
4700  * version; if we can't we stop.
4701  *
4702  * If it is an INFO we reset all params associated with communication
4703  * in that direction over this channel (remember the connection is
4704  * essentially 2 independent simplex channels).
4705  */
4706 void
4707 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
4708 {
4709 	vio_ver_msg_t	*ver_pkt;
4710 	vsw_t		*vswp = ldcp->ldc_vswp;
4711
4712 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4713
4714 	/*
4715 	 * We know this is a ctrl/version packet so
4716 	 * cast it into the correct structure.
4717 	 */
4718 	ver_pkt = (vio_ver_msg_t *)pkt;
4719
4720 	switch (ver_pkt->tag.vio_subtype) {
4721 	case VIO_SUBTYPE_INFO:
4722 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
4723
4724 		/*
4725 		 * Record the session id, which we will use from now
4726 		 * until we see another VER_INFO msg. Even then the
4727 		 * session id in most cases will be unchanged, except
4728 		 * if the channel was reset.
4729 		 */
4730 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
4731 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
4732 			DERR(vswp, "%s: updating session id for chan %lld "
4733 			    "from %llx to %llx", __func__, ldcp->ldc_id,
4734 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
4735 		}
4736
4737 		ldcp->peer_session = ver_pkt->tag.vio_sid;
4738 		ldcp->session_status |= VSW_PEER_SESSION;
4739
4740 		/* Legal message at this time? */
4741 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
4742 			return;
4743
4744 		/*
4745 		 * First check the device class. Currently we only expect
4746 		 * to be talking to a network device. In the future we may
4747 		 * also talk to another switch.
4748 		 */
4749 		if (ver_pkt->dev_class != VDEV_NETWORK) {
4750 			DERR(vswp, "%s: illegal device class %d", __func__,
4751 			    ver_pkt->dev_class);
4752
4753 			ver_pkt->tag.vio_sid = ldcp->local_session;
4754 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4755
4756 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4757
4758 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
4759 			    sizeof (vio_ver_msg_t), B_TRUE);
4760
4761 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
4762 			vsw_next_milestone(ldcp);
4763 			return;
4764 		} else {
4765 			ldcp->dev_class = ver_pkt->dev_class;
4766 		}
4767
4768 		/*
4769 		 * Now check the version.
4770 		 */
4771 		if (vsw_supported_version(ver_pkt) == 0) {
4772 			/*
4773 			 * We support this major version and possibly an
4774 			 * adjusted minor version.
4775 			 */
4776
4777 			D2(vswp, "%s: accepted ver %d:%d", __func__,
4778 			    ver_pkt->ver_major, ver_pkt->ver_minor);
4779
4780 			/* Store accepted values */
4781 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
4782 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4783
4784 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4785
4786 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
4787 		} else {
4788 			/*
4789 			 * NACK back with the next lower major/minor
4790 			 * pairing we support (if we don't support any more
4791 			 * versions then they will be set to zero).
4792 			 */
4793
4794 			D2(vswp, "%s: replying with ver %d:%d", __func__,
4795 			    ver_pkt->ver_major, ver_pkt->ver_minor);
4796
4797 			/* Store updated values */
4798 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
4799 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4800
4801 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4802
4803 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
4804 		}
4805
4806 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4807 		ver_pkt->tag.vio_sid = ldcp->local_session;
4808 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
4809 		    sizeof (vio_ver_msg_t), B_TRUE);
4810
4811 		vsw_next_milestone(ldcp);
4812 		break;
4813
4814 	case VIO_SUBTYPE_ACK:
4815 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
4816
4817 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
4818 			return;
4819
4820 		/* Store updated values */
4821 		ldcp->lane_in.ver_major = ver_pkt->ver_major;
4822 		ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4823
4824
4825 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
4826 		vsw_next_milestone(ldcp);
4827
4828 		break;
4829
4830 	case VIO_SUBTYPE_NACK:
4831 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
4832
4833 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
4834 			return;
4835
4836 		/*
4837 		 * If our peer sent us a NACK with the ver fields set to
4838 		 * zero then there is nothing more we can do. Otherwise see
4839 		 * if we support either the version suggested, or a lesser
4840 		 * one.
4841 		 */
4842 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
4843 			DERR(vswp, "%s: peer unable to negotiate any "
4844 			    "further.", __func__);
4845 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
4846 			vsw_next_milestone(ldcp);
4847 			return;
4848 		}
4849
4850 		/*
4851 		 * Check to see if we support this major version or
4852 		 * a lower one. If we don't then maj/min will be set
4853 		 * to zero.
4854 */ 4855 (void) vsw_supported_version(ver_pkt); 4856 if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) { 4857 /* Nothing more we can do */ 4858 DERR(vswp, "%s: version negotiation failed.\n", 4859 __func__); 4860 ldcp->lane_out.lstate |= VSW_VER_NACK_RECV; 4861 vsw_next_milestone(ldcp); 4862 } else { 4863 /* found a supported major version */ 4864 ldcp->lane_out.ver_major = ver_pkt->ver_major; 4865 ldcp->lane_out.ver_minor = ver_pkt->ver_minor; 4866 4867 D2(vswp, "%s: resending with updated values (%x, %x)", 4868 __func__, ver_pkt->ver_major, 4869 ver_pkt->ver_minor); 4870 4871 ldcp->lane_out.lstate |= VSW_VER_INFO_SENT; 4872 ver_pkt->tag.vio_sid = ldcp->local_session; 4873 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; 4874 4875 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); 4876 4877 (void) vsw_send_msg(ldcp, (void *)ver_pkt, 4878 sizeof (vio_ver_msg_t), B_TRUE); 4879 4880 vsw_next_milestone(ldcp); 4881 4882 } 4883 break; 4884 4885 default: 4886 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 4887 ver_pkt->tag.vio_subtype); 4888 } 4889 4890 D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); 4891 } 4892 4893 /* 4894 * Process an attribute packet. We can end up here either because our peer 4895 * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our 4896 * peer has sent us an attribute INFO message 4897 * 4898 * If its an ACK we then move to the next stage of the handshake which 4899 * is to send our descriptor ring info to our peer. If its a NACK then 4900 * there is nothing more we can (currently) do. 4901 * 4902 * If we get a valid/acceptable INFO packet (and we have already negotiated 4903 * a version) we ACK back and set channel state to ATTR_RECV, otherwise we 4904 * NACK back and reset channel state to INACTIV. 4905 * 4906 * FUTURE: in time we will probably negotiate over attributes, but for 4907 * the moment unacceptable attributes are regarded as a fatal error. 4908 * 4909 */ 4910 void 4911 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt) 4912 { 4913 vnet_attr_msg_t *attr_pkt; 4914 vsw_t *vswp = ldcp->ldc_vswp; 4915 vsw_port_t *port = ldcp->ldc_port; 4916 uint64_t macaddr = 0; 4917 int i; 4918 4919 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 4920 4921 /* 4922 * We know this is a ctrl/attr packet so 4923 * cast it into the correct structure. 4924 */ 4925 attr_pkt = (vnet_attr_msg_t *)pkt; 4926 4927 switch (attr_pkt->tag.vio_subtype) { 4928 case VIO_SUBTYPE_INFO: 4929 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 4930 4931 if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) 4932 return; 4933 4934 /* 4935 * If the attributes are unacceptable then we NACK back. 4936 */ 4937 if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) { 4938 4939 DERR(vswp, "%s (chan %d): invalid attributes", 4940 __func__, ldcp->ldc_id); 4941 4942 vsw_free_lane_resources(ldcp, INBOUND); 4943 4944 attr_pkt->tag.vio_sid = ldcp->local_session; 4945 attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4946 4947 DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt); 4948 ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT; 4949 (void) vsw_send_msg(ldcp, (void *)attr_pkt, 4950 sizeof (vnet_attr_msg_t), B_TRUE); 4951 4952 vsw_next_milestone(ldcp); 4953 return; 4954 } 4955 4956 /* 4957 * Otherwise store attributes for this lane and update 4958 * lane state. 
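 *
 * The peer's MAC address arrives packed into a uint64_t (attr_pkt->addr);
 * the loop below unpacks it into the port's ether_addr structure, with
 * the most significant octet landing in ether_addr_octet[0].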
4959 		 */
4960 		ldcp->lane_in.mtu = attr_pkt->mtu;
4961 		ldcp->lane_in.addr = attr_pkt->addr;
4962 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
4963 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
4964 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
4965
4966 		macaddr = ldcp->lane_in.addr;
4967 		for (i = ETHERADDRL - 1; i >= 0; i--) {
4968 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
4969 			macaddr >>= 8;
4970 		}
4971
4972 		/* create the fdb entry for this port/mac address */
4973 		(void) vsw_add_fdb(vswp, port);
4974
4975 		/* set up device specific xmit routines */
4976 		mutex_enter(&port->tx_lock);
4977 		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
4978 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
4979 			port->transmit = vsw_dringsend;
4980 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
4981 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
4982 			vsw_create_privring(ldcp);
4983 			port->transmit = vsw_descrsend;
4984 		}
4985 		mutex_exit(&port->tx_lock);
4986
4987 		attr_pkt->tag.vio_sid = ldcp->local_session;
4988 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4989
4990 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
4991
4992 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
4993
4994 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
4995 		    sizeof (vnet_attr_msg_t), B_TRUE);
4996
4997 		vsw_next_milestone(ldcp);
4998 		break;
4999
5000 	case VIO_SUBTYPE_ACK:
5001 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5002
5003 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
5004 			return;
5005
5006 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
5007 		vsw_next_milestone(ldcp);
5008 		break;
5009
5010 	case VIO_SUBTYPE_NACK:
5011 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5012
5013 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
5014 			return;
5015
5016 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
5017 		vsw_next_milestone(ldcp);
5018 		break;
5019
5020 	default:
5021 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
5022 		    attr_pkt->tag.vio_subtype);
5023 	}
5024
5025 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5026 }
5027
5028 /*
5029  * Process a dring info packet. We can end up here either because our peer
5030  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or because
5031  * our peer has sent us a dring INFO message.
5032  *
5033  * If we get a valid/acceptable INFO packet (and we have already negotiated
5034  * a version) we ACK back and update the lane state, otherwise we NACK back.
5035  *
5036  * FUTURE: nothing to stop the client from sending us info on multiple
5037  * drings, but for the moment we will just use the first one we are given.
5038  *
5039  */
5040 void
5041 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
5042 {
5043 	vio_dring_reg_msg_t	*dring_pkt;
5044 	vsw_t			*vswp = ldcp->ldc_vswp;
5045 	ldc_mem_info_t		minfo;
5046 	dring_info_t		*dp, *dbp;
5047 	int			dring_found = 0;
5048
5049 	/*
5050 	 * We know this is a ctrl/dring packet so
5051 	 * cast it into the correct structure.
5052 	 */
5053 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
5054
5055 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
5056
5057 	switch (dring_pkt->tag.vio_subtype) {
5058 	case VIO_SUBTYPE_INFO:
5059 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5060
5061 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
5062 			return;
5063
5064 		/*
5065 		 * If the dring params are unacceptable then we NACK back.
5066 */ 5067 if (vsw_check_dring_info(dring_pkt)) { 5068 5069 DERR(vswp, "%s (%lld): invalid dring info", 5070 __func__, ldcp->ldc_id); 5071 5072 vsw_free_lane_resources(ldcp, INBOUND); 5073 5074 dring_pkt->tag.vio_sid = ldcp->local_session; 5075 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 5076 5077 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 5078 5079 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 5080 5081 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 5082 sizeof (vio_dring_reg_msg_t), B_TRUE); 5083 5084 vsw_next_milestone(ldcp); 5085 return; 5086 } 5087 5088 /* 5089 * Otherwise, attempt to map in the dring using the 5090 * cookie. If that succeeds we send back a unique dring 5091 * identifier that the sending side will use in future 5092 * to refer to this descriptor ring. 5093 */ 5094 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 5095 5096 dp->num_descriptors = dring_pkt->num_descriptors; 5097 dp->descriptor_size = dring_pkt->descriptor_size; 5098 dp->options = dring_pkt->options; 5099 dp->ncookies = dring_pkt->ncookies; 5100 5101 /* 5102 * Note: should only get one cookie. Enforced in 5103 * the ldc layer. 5104 */ 5105 bcopy(&dring_pkt->cookie[0], &dp->cookie[0], 5106 sizeof (ldc_mem_cookie_t)); 5107 5108 D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__, 5109 dp->num_descriptors, dp->descriptor_size); 5110 D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__, 5111 dp->options, dp->ncookies); 5112 5113 if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0], 5114 dp->ncookies, dp->num_descriptors, 5115 dp->descriptor_size, LDC_SHADOW_MAP, 5116 &(dp->handle))) != 0) { 5117 5118 DERR(vswp, "%s: dring_map failed\n", __func__); 5119 5120 kmem_free(dp, sizeof (dring_info_t)); 5121 vsw_free_lane_resources(ldcp, INBOUND); 5122 5123 dring_pkt->tag.vio_sid = ldcp->local_session; 5124 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 5125 5126 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 5127 5128 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 5129 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 5130 sizeof (vio_dring_reg_msg_t), B_TRUE); 5131 5132 vsw_next_milestone(ldcp); 5133 return; 5134 } 5135 5136 if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) { 5137 5138 DERR(vswp, "%s: dring_addr failed\n", __func__); 5139 5140 kmem_free(dp, sizeof (dring_info_t)); 5141 vsw_free_lane_resources(ldcp, INBOUND); 5142 5143 dring_pkt->tag.vio_sid = ldcp->local_session; 5144 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 5145 5146 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 5147 5148 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 5149 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 5150 sizeof (vio_dring_reg_msg_t), B_TRUE); 5151 5152 vsw_next_milestone(ldcp); 5153 return; 5154 } else { 5155 /* store the address of the pub part of ring */ 5156 dp->pub_addr = minfo.vaddr; 5157 } 5158 5159 /* no private section as we are importing */ 5160 dp->priv_addr = NULL; 5161 5162 /* 5163 * Using simple mono increasing int for ident at 5164 * the moment. 5165 */ 5166 dp->ident = ldcp->next_ident; 5167 ldcp->next_ident++; 5168 5169 dp->end_idx = 0; 5170 dp->next = NULL; 5171 5172 /* 5173 * Link it onto the end of the list of drings 5174 * for this lane. 
5175 */ 5176 if (ldcp->lane_in.dringp == NULL) { 5177 D2(vswp, "%s: adding first INBOUND dring", __func__); 5178 ldcp->lane_in.dringp = dp; 5179 } else { 5180 dbp = ldcp->lane_in.dringp; 5181 5182 while (dbp->next != NULL) 5183 dbp = dbp->next; 5184 5185 dbp->next = dp; 5186 } 5187 5188 /* acknowledge it */ 5189 dring_pkt->tag.vio_sid = ldcp->local_session; 5190 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5191 dring_pkt->dring_ident = dp->ident; 5192 5193 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 5194 sizeof (vio_dring_reg_msg_t), B_TRUE); 5195 5196 ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT; 5197 vsw_next_milestone(ldcp); 5198 break; 5199 5200 case VIO_SUBTYPE_ACK: 5201 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 5202 5203 if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV)) 5204 return; 5205 5206 /* 5207 * Peer is acknowledging our dring info and will have 5208 * sent us a dring identifier which we will use to 5209 * refer to this ring w.r.t. our peer. 5210 */ 5211 dp = ldcp->lane_out.dringp; 5212 if (dp != NULL) { 5213 /* 5214 * Find the ring this ident should be associated 5215 * with. 5216 */ 5217 if (vsw_dring_match(dp, dring_pkt)) { 5218 dring_found = 1; 5219 5220 } else while (dp != NULL) { 5221 if (vsw_dring_match(dp, dring_pkt)) { 5222 dring_found = 1; 5223 break; 5224 } 5225 dp = dp->next; 5226 } 5227 5228 if (dring_found == 0) { 5229 DERR(NULL, "%s: unrecognised ring cookie", 5230 __func__); 5231 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 5232 return; 5233 } 5234 5235 } else { 5236 DERR(vswp, "%s: DRING ACK received but no drings " 5237 "allocated", __func__); 5238 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 5239 return; 5240 } 5241 5242 /* store ident */ 5243 dp->ident = dring_pkt->dring_ident; 5244 ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV; 5245 vsw_next_milestone(ldcp); 5246 break; 5247 5248 case VIO_SUBTYPE_NACK: 5249 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5250 5251 if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV)) 5252 return; 5253 5254 ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV; 5255 vsw_next_milestone(ldcp); 5256 break; 5257 5258 default: 5259 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 5260 dring_pkt->tag.vio_subtype); 5261 } 5262 5263 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 5264 } 5265 5266 /* 5267 * Process a request from peer to unregister a dring. 5268 * 5269 * For the moment we just restart the handshake if our 5270 * peer endpoint attempts to unregister a dring. 5271 */ 5272 void 5273 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt) 5274 { 5275 vsw_t *vswp = ldcp->ldc_vswp; 5276 vio_dring_unreg_msg_t *dring_pkt; 5277 5278 /* 5279 * We know this is a ctrl/dring packet so 5280 * cast it into the correct structure. 
5281 */ 5282 dring_pkt = (vio_dring_unreg_msg_t *)pkt; 5283 5284 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5285 5286 switch (dring_pkt->tag.vio_subtype) { 5287 case VIO_SUBTYPE_INFO: 5288 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5289 5290 DWARN(vswp, "%s: restarting handshake..", __func__); 5291 break; 5292 5293 case VIO_SUBTYPE_ACK: 5294 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 5295 5296 DWARN(vswp, "%s: restarting handshake..", __func__); 5297 break; 5298 5299 case VIO_SUBTYPE_NACK: 5300 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5301 5302 DWARN(vswp, "%s: restarting handshake..", __func__); 5303 break; 5304 5305 default: 5306 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 5307 dring_pkt->tag.vio_subtype); 5308 } 5309 5310 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 5311 5312 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 5313 } 5314 5315 #define SND_MCST_NACK(ldcp, pkt) \ 5316 pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ 5317 pkt->tag.vio_sid = ldcp->local_session; \ 5318 (void) vsw_send_msg(ldcp, (void *)pkt, \ 5319 sizeof (vnet_mcast_msg_t), B_TRUE); 5320 5321 /* 5322 * Process a multicast request from a vnet. 5323 * 5324 * Vnet's specify a multicast address that they are interested in. This 5325 * address is used as a key into the hash table which forms the multicast 5326 * forwarding database (mFDB). 5327 * 5328 * The table keys are the multicast addresses, while the table entries 5329 * are pointers to lists of ports which wish to receive packets for the 5330 * specified multicast address. 5331 * 5332 * When a multicast packet is being switched we use the address as a key 5333 * into the hash table, and then walk the appropriate port list forwarding 5334 * the pkt to each port in turn. 5335 * 5336 * If a vnet is no longer interested in a particular multicast grouping 5337 * we simply find the correct location in the hash table and then delete 5338 * the relevant port from the port list. 5339 * 5340 * To deal with the case whereby a port is being deleted without first 5341 * removing itself from the lists in the hash table, we maintain a list 5342 * of multicast addresses the port has registered an interest in, within 5343 * the port structure itself. We then simply walk that list of addresses 5344 * using them as keys into the hash table and remove the port from the 5345 * appropriate lists. 5346 */ 5347 static void 5348 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt) 5349 { 5350 vnet_mcast_msg_t *mcst_pkt; 5351 vsw_port_t *port = ldcp->ldc_port; 5352 vsw_t *vswp = ldcp->ldc_vswp; 5353 int i; 5354 5355 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5356 5357 /* 5358 * We know this is a ctrl/mcast packet so 5359 * cast it into the correct structure. 5360 */ 5361 mcst_pkt = (vnet_mcast_msg_t *)pkt; 5362 5363 switch (mcst_pkt->tag.vio_subtype) { 5364 case VIO_SUBTYPE_INFO: 5365 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5366 5367 /* 5368 * Check if in correct state to receive a multicast 5369 * message (i.e. handshake complete). If not reset 5370 * the handshake. 5371 */ 5372 if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV)) 5373 return; 5374 5375 /* 5376 * Before attempting to add or remove address check 5377 * that they are valid multicast addresses. 5378 * If not, then NACK back. 
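 *
 * (A valid ethernet multicast address has the least significant bit of
 * its first octet set, e.g. 01:00:5E:00:00:01; that is the bit tested
 * in the loop below.)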
5379 */ 5380 for (i = 0; i < mcst_pkt->count; i++) { 5381 if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) { 5382 DERR(vswp, "%s: invalid multicast address", 5383 __func__); 5384 SND_MCST_NACK(ldcp, mcst_pkt); 5385 return; 5386 } 5387 } 5388 5389 /* 5390 * Now add/remove the addresses. If this fails we 5391 * NACK back. 5392 */ 5393 if (vsw_add_rem_mcst(mcst_pkt, port) != 0) { 5394 SND_MCST_NACK(ldcp, mcst_pkt); 5395 return; 5396 } 5397 5398 mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5399 mcst_pkt->tag.vio_sid = ldcp->local_session; 5400 5401 DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt); 5402 5403 (void) vsw_send_msg(ldcp, (void *)mcst_pkt, 5404 sizeof (vnet_mcast_msg_t), B_TRUE); 5405 break; 5406 5407 case VIO_SUBTYPE_ACK: 5408 DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 5409 5410 /* 5411 * We shouldn't ever get a multicast ACK message as 5412 * at the moment we never request multicast addresses 5413 * to be set on some other device. This may change in 5414 * the future if we have cascading switches. 5415 */ 5416 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV)) 5417 return; 5418 5419 /* Do nothing */ 5420 break; 5421 5422 case VIO_SUBTYPE_NACK: 5423 DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5424 5425 /* 5426 * We shouldn't get a multicast NACK packet for the 5427 * same reasons as we shouldn't get a ACK packet. 5428 */ 5429 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV)) 5430 return; 5431 5432 /* Do nothing */ 5433 break; 5434 5435 default: 5436 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 5437 mcst_pkt->tag.vio_subtype); 5438 } 5439 5440 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 5441 } 5442 5443 static void 5444 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt) 5445 { 5446 vio_rdx_msg_t *rdx_pkt; 5447 vsw_t *vswp = ldcp->ldc_vswp; 5448 5449 /* 5450 * We know this is a ctrl/rdx packet so 5451 * cast it into the correct structure. 5452 */ 5453 rdx_pkt = (vio_rdx_msg_t *)pkt; 5454 5455 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 5456 5457 switch (rdx_pkt->tag.vio_subtype) { 5458 case VIO_SUBTYPE_INFO: 5459 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5460 5461 if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV)) 5462 return; 5463 5464 rdx_pkt->tag.vio_sid = ldcp->local_session; 5465 rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5466 5467 DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt); 5468 5469 ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT; 5470 5471 (void) vsw_send_msg(ldcp, (void *)rdx_pkt, 5472 sizeof (vio_rdx_msg_t), B_TRUE); 5473 5474 vsw_next_milestone(ldcp); 5475 break; 5476 5477 case VIO_SUBTYPE_ACK: 5478 /* 5479 * Should be handled in-band by callback handler. 
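 * (RDX ACKs are consumed directly by vsw_dispatch_ctrl_task() before
 * control messages are handed to this taskq-based handler, so seeing
 * one here implies something has gone wrong.)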
5480 	 */
5481 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
5482 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5483 		break;
5484
5485 	case VIO_SUBTYPE_NACK:
5486 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5487
5488 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
5489 			return;
5490
5491 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
5492 		vsw_next_milestone(ldcp);
5493 		break;
5494
5495 	default:
5496 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
5497 		    rdx_pkt->tag.vio_subtype);
5498 	}
5499
5500 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5501 }
5502
5503 static void
5504 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
5505 {
5506 	uint16_t	env = tag.vio_subtype_env;
5507 	vsw_t		*vswp = ldcp->ldc_vswp;
5508
5509 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5510
5511 	/* session id check */
5512 	if (ldcp->session_status & VSW_PEER_SESSION) {
5513 		if (ldcp->peer_session != tag.vio_sid) {
5514 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
5515 			    __func__, ldcp->ldc_id, tag.vio_sid);
5516 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5517 			return;
5518 		}
5519 	}
5520
5521 	/*
5522 	 * It is an error for us to be getting data packets
5523 	 * before the handshake has completed.
5524 	 */
5525 	if (ldcp->hphase != VSW_MILESTONE4) {
5526 		DERR(vswp, "%s: got data packet before handshake complete "
5527 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
5528 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
5529 		DUMP_FLAGS(ldcp->lane_in.lstate);
5530 		DUMP_FLAGS(ldcp->lane_out.lstate);
5531 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5532 		return;
5533 	}
5534
5535 	/*
5536 	 * Switch on vio_subtype envelope, then let lower routines
5537 	 * decide if it's an INFO, ACK or NACK packet.
5538 	 */
5539 	if (env == VIO_DRING_DATA) {
5540 		vsw_process_data_dring_pkt(ldcp, dpkt);
5541 	} else if (env == VIO_PKT_DATA) {
5542 		vsw_process_data_raw_pkt(ldcp, dpkt);
5543 	} else if (env == VIO_DESC_DATA) {
5544 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
5545 	} else {
5546 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
5547 		    __func__, env);
5548 	}
5549
5550 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5551 }
5552
5553 #define	SND_DRING_NACK(ldcp, pkt) \
5554 	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
5555 	pkt->tag.vio_sid = ldcp->local_session; \
5556 	(void) vsw_send_msg(ldcp, (void *)pkt, \
5557 	    sizeof (vio_dring_msg_t), B_TRUE);
5558
5559 static void
5560 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
5561 {
5562 	vio_dring_msg_t		*dring_pkt;
5563 	vnet_public_desc_t	*pub_addr = NULL;
5564 	vsw_private_desc_t	*priv_addr = NULL;
5565 	dring_info_t		*dp = NULL;
5566 	vsw_t			*vswp = ldcp->ldc_vswp;
5567 	mblk_t			*mp = NULL;
5568 	mblk_t			*bp = NULL;
5569 	mblk_t			*bpt = NULL;
5570 	size_t			nbytes = 0;
5571 	size_t			off = 0;
5572 	uint64_t		ncookies = 0;
5573 	uint64_t		chain = 0;
5574 	uint64_t		j, len;
5575 	uint32_t		pos, start, datalen;
5576 	uint32_t		range_start, range_end;
5577 	int32_t			end, num, cnt = 0;
5578 	int			i, rv, msg_rv = 0;
5579 	boolean_t		ack_needed = B_FALSE;
5580 	boolean_t		prev_desc_ack = B_FALSE;
5581 	int			read_attempts = 0;
5582
5583 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5584
5585 	/*
5586 	 * We know this is a data/dring packet so
5587 	 * cast it into the correct structure.
5588 	 */
5589 	dring_pkt = (vio_dring_msg_t *)dpkt;
5590
5591 	/*
5592 	 * Switch on the vio_subtype. If it's INFO then we need to
5593 	 * process the data. If it's an ACK we need to make sure
5594 	 * it makes sense (i.e. did we send an earlier data/info msg),
5595 	 * and if it's a NACK then we may attempt a retry.
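 *
 * As a worked example of the wrap-around arithmetic used below, for a
 * hypothetical ring of len 512, start_idx 510 and end_idx 1 give
 * num = (512 - 510 + 1) + 1 = 4, i.e. descriptors 510, 511, 0 and 1.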
5596 	 */
5597 	switch (dring_pkt->tag.vio_subtype) {
5598 	case VIO_SUBTYPE_INFO:
5599 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
5600
5601 		READ_ENTER(&ldcp->lane_in.dlistrw);
5602 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
5603 		    dring_pkt->dring_ident)) == NULL) {
5604 			RW_EXIT(&ldcp->lane_in.dlistrw);
5605
5606 			DERR(vswp, "%s(%lld): unable to find dring from "
5607 			    "ident 0x%llx", __func__, ldcp->ldc_id,
5608 			    dring_pkt->dring_ident);
5609
5610 			SND_DRING_NACK(ldcp, dring_pkt);
5611 			return;
5612 		}
5613
5614 		start = pos = dring_pkt->start_idx;
5615 		end = dring_pkt->end_idx;
5616 		len = dp->num_descriptors;
5617
5618 		range_start = range_end = pos;
5619
5620 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
5621 		    __func__, ldcp->ldc_id, start, end);
5622
5623 		if (end == -1) {
5624 			num = -1;
5625 		} else if (end >= 0) {
5626 			num = end >= pos ?
5627 			    end - pos + 1 : (len - pos + 1) + end;
5628
5629 			/* basic sanity check */
5630 			if (end > len) {
5631 				RW_EXIT(&ldcp->lane_in.dlistrw);
5632 				DERR(vswp, "%s(%lld): endpoint %lld outside "
5633 				    "ring length %lld", __func__,
5634 				    ldcp->ldc_id, end, len);
5635
5636 				SND_DRING_NACK(ldcp, dring_pkt);
5637 				return;
5638 			}
5639 		} else {
5640 			RW_EXIT(&ldcp->lane_in.dlistrw);
5641 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
5642 			    __func__, ldcp->ldc_id, end);
5643 			SND_DRING_NACK(ldcp, dring_pkt);
5644 			return;
5645 		}
5646
5647 		while (cnt != num) {
5648 vsw_recheck_desc:
5649 			if ((rv = ldc_mem_dring_acquire(dp->handle,
5650 			    pos, pos)) != 0) {
5651 				RW_EXIT(&ldcp->lane_in.dlistrw);
5652 				DERR(vswp, "%s(%lld): unable to acquire "
5653 				    "descriptor at pos %d: err %d",
5654 				    __func__, ldcp->ldc_id, pos, rv);
5655 				SND_DRING_NACK(ldcp, dring_pkt);
5656 				return;
5657 			}
5658
5659 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
5660
5661 			/*
5662 			 * When given a bounded range of descriptors
5663 			 * to process, it's an error to hit a descriptor
5664 			 * which is not ready. In the non-bounded case
5665 			 * (end_idx == -1) this simply indicates we have
5666 			 * reached the end of the current active range.
5667 			 */
5668 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
5669 				/* unbound - no error */
5670 				if (end == -1) {
5671 					if (read_attempts == vsw_read_attempts)
5672 						break;
5673
5674 					delay(drv_usectohz(vsw_desc_delay));
5675 					read_attempts++;
5676 					goto vsw_recheck_desc;
5677 				}
5678
5679 				/* bounded - error - so NACK back */
5680 				RW_EXIT(&ldcp->lane_in.dlistrw);
5681 				DERR(vswp, "%s(%lld): descriptor not READY "
5682 				    "(%d)", __func__, ldcp->ldc_id,
5683 				    pub_addr->hdr.dstate);
5684 				SND_DRING_NACK(ldcp, dring_pkt);
5685 				return;
5686 			}
5687
5688 			DTRACE_PROBE1(read_attempts, int, read_attempts);
5689
5690 			range_end = pos;
5691
5692 			/*
5693 			 * If we ACK'd the previous descriptor then now
5694 			 * record the new range start position for later
5695 			 * ACK's.
5696 			 */
5697 			if (prev_desc_ack) {
5698 				range_start = pos;
5699
5700 				D2(vswp, "%s(%lld): updating range start "
5701 				    "to be %d", __func__, ldcp->ldc_id,
5702 				    range_start);
5703
5704 				prev_desc_ack = B_FALSE;
5705 			}
5706
5707 			/*
5708 			 * Data is padded to align on an 8 byte boundary,
5709 			 * datalen is the actual data length, i.e. minus that
5710 			 * padding.
5711 			 */
5712 			datalen = pub_addr->nbytes;
5713
5714 			/*
5715 			 * Does peer wish us to ACK when we have finished
5716 			 * with this descriptor?
5717 			 */
5718 			if (pub_addr->hdr.ack)
5719 				ack_needed = B_TRUE;
5720
5721 			D2(vswp, "%s(%lld): processing desc %lld at pos"
5722 			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
5723 			    __func__, ldcp->ldc_id, pos, pub_addr,
5724 			    pub_addr->hdr.dstate, datalen);
5725
5726 			/*
5727 			 * Mark that we are starting to process descriptor.
5728 			 */
5729 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
5730
5731 			mp = vio_allocb(ldcp->rxh);
5732 			if (mp == NULL) {
5733 				/*
5734 				 * No free receive buffers available, so
5735 				 * fall back onto allocb(9F). Make sure that
5736 				 * we get a data buffer which is a multiple
5737 				 * of 8 as this is required by ldc_mem_copy.
5738 				 */
5739 				DTRACE_PROBE(allocb);
5740 				mp = allocb(datalen + VNET_IPALIGN + 8,
5741 				    BPRI_MED);
5742 			}

			if (mp == NULL) {
				/*
				 * Still no buffer; mark the descriptor done
				 * (dropping the pkt), release it and give up
				 * on this range.
				 */
				DERR(vswp, "%s(%lld): allocb failed",
				    __func__, ldcp->ldc_id);
				pub_addr->hdr.dstate = VIO_DESC_DONE;
				(void) ldc_mem_dring_release(dp->handle,
				    pos, pos);
				break;
			}
5743
5744 			/*
5745 			 * Ensure that we ask ldc for an aligned
5746 			 * number of bytes.
5747 			 */
5748 			nbytes = datalen + VNET_IPALIGN;
5749 			if (nbytes & 0x7) {
5750 				off = 8 - (nbytes & 0x7);
5751 				nbytes += off;
5752 			}
5753
5754 			ncookies = pub_addr->ncookies;
5755 			rv = ldc_mem_copy(ldcp->ldc_handle,
5756 			    (caddr_t)mp->b_rptr, 0, &nbytes,
5757 			    pub_addr->memcookie, ncookies,
5758 			    LDC_COPY_IN);
5759
5760 			if (rv != 0) {
5761 				DERR(vswp, "%s(%d): unable to copy in "
5762 				    "data from %d cookies in desc %d"
5763 				    " (rv %d)", __func__, ldcp->ldc_id,
5764 				    ncookies, pos, rv);
5765 				freemsg(mp);
5766
5767 				pub_addr->hdr.dstate = VIO_DESC_DONE;
5768 				(void) ldc_mem_dring_release(dp->handle,
5769 				    pos, pos);
5770 				break;
5771 			} else {
5772 				D2(vswp, "%s(%d): copied in %ld bytes"
5773 				    " using %d cookies", __func__,
5774 				    ldcp->ldc_id, nbytes, ncookies);
5775 			}
5776
5777 			/* adjust the read pointer to skip over the padding */
5778 			mp->b_rptr += VNET_IPALIGN;
5779
5780 			/* point to the actual end of data */
5781 			mp->b_wptr = mp->b_rptr + datalen;
5782
5783 			/* build a chain of received packets */
5784 			if (bp == NULL) {
5785 				/* first pkt */
5786 				bp = mp;
5787 				bp->b_next = bp->b_prev = NULL;
5788 				bpt = bp;
5789 				chain = 1;
5790 			} else {
5791 				mp->b_next = NULL;
5792 				mp->b_prev = bpt;
5793 				bpt->b_next = mp;
5794 				bpt = mp;
5795 				chain++;
5796 			}
5797
5798 			/* mark we are finished with this descriptor */
5799 			pub_addr->hdr.dstate = VIO_DESC_DONE;
5800
5801 			(void) ldc_mem_dring_release(dp->handle, pos, pos);
5802
5803 			/*
5804 			 * Send an ACK back to peer if requested.
5805 			 */
5806 			if (ack_needed) {
5807 				ack_needed = B_FALSE;
5808
5809 				dring_pkt->start_idx = range_start;
5810 				dring_pkt->end_idx = range_end;
5811
5812 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
5813 				    " requested", __func__, ldcp->ldc_id,
5814 				    dring_pkt->start_idx,
5815 				    dring_pkt->end_idx);
5816
5817 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
5818 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5819 				dring_pkt->tag.vio_sid = ldcp->local_session;
5820 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
5821 				    sizeof (vio_dring_msg_t),
5822 				    B_FALSE);
5823
5824 				/*
5825 				 * Check if ACK was successfully sent. If not
5826 				 * we break and deal with that below.
5827 				 */
5828 				if (msg_rv != 0)
5829 					break;
5830
5831 				prev_desc_ack = B_TRUE;
5832 				range_start = pos;
5833 			}
5834
5835 			/* next descriptor */
5836 			pos = (pos + 1) % len;
5837 			cnt++;
5838
5839 			/*
5840 			 * Break out of loop here and stop processing to
5841 			 * allow some other network device (or disk) to
5842 			 * get access to the cpu.
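 * (vsw_chain_len bounds how many packets are chained up before being
 * passed on; presumably a tunable trade-off between throughput and
 * latency/fairness.)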
5843 */ 5844 if (chain > vsw_chain_len) { 5845 D3(vswp, "%s(%lld): switching chain of %d " 5846 "msgs", __func__, ldcp->ldc_id, chain); 5847 break; 5848 } 5849 } 5850 RW_EXIT(&ldcp->lane_in.dlistrw); 5851 5852 /* 5853 * If when we attempted to send the ACK we found that the 5854 * channel had been reset then now handle this. We deal with 5855 * it here as we cannot reset the channel while holding the 5856 * dlistrw lock, and we don't want to acquire/release it 5857 * continuously in the above loop, as a channel reset should 5858 * be a rare event. 5859 */ 5860 if (msg_rv == ECONNRESET) { 5861 vsw_process_conn_evt(ldcp, VSW_CONN_RESET); 5862 break; 5863 } 5864 5865 /* send the chain of packets to be switched */ 5866 if (bp != NULL) { 5867 D3(vswp, "%s(%lld): switching chain of %d msgs", 5868 __func__, ldcp->ldc_id, chain); 5869 vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT, 5870 ldcp->ldc_port, NULL); 5871 } 5872 5873 DTRACE_PROBE1(msg_cnt, int, cnt); 5874 5875 /* 5876 * We are now finished so ACK back with the state 5877 * set to STOPPING so our peer knows we are finished 5878 */ 5879 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5880 dring_pkt->tag.vio_sid = ldcp->local_session; 5881 5882 dring_pkt->dring_process_state = VIO_DP_STOPPED; 5883 5884 DTRACE_PROBE(stop_process_sent); 5885 5886 /* 5887 * We have not processed any more descriptors beyond 5888 * the last one we ACK'd. 5889 */ 5890 if (prev_desc_ack) 5891 range_start = range_end; 5892 5893 dring_pkt->start_idx = range_start; 5894 dring_pkt->end_idx = range_end; 5895 5896 D2(vswp, "%s(%lld) processed : %d : %d, now stopping", 5897 __func__, ldcp->ldc_id, dring_pkt->start_idx, 5898 dring_pkt->end_idx); 5899 5900 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 5901 sizeof (vio_dring_msg_t), B_TRUE); 5902 break; 5903 5904 case VIO_SUBTYPE_ACK: 5905 D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id); 5906 /* 5907 * Verify that the relevant descriptors are all 5908 * marked as DONE 5909 */ 5910 READ_ENTER(&ldcp->lane_out.dlistrw); 5911 if ((dp = vsw_ident2dring(&ldcp->lane_out, 5912 dring_pkt->dring_ident)) == NULL) { 5913 RW_EXIT(&ldcp->lane_out.dlistrw); 5914 DERR(vswp, "%s: unknown ident in ACK", __func__); 5915 return; 5916 } 5917 5918 pub_addr = (vnet_public_desc_t *)dp->pub_addr; 5919 priv_addr = (vsw_private_desc_t *)dp->priv_addr; 5920 5921 start = end = 0; 5922 start = dring_pkt->start_idx; 5923 end = dring_pkt->end_idx; 5924 len = dp->num_descriptors; 5925 5926 j = num = 0; 5927 /* calculate # descriptors taking into a/c wrap around */ 5928 num = end >= start ? end - start + 1: (len - start + 1) + end; 5929 5930 D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n", 5931 __func__, ldcp->ldc_id, start, end, num); 5932 5933 mutex_enter(&dp->dlock); 5934 dp->last_ack_recv = end; 5935 mutex_exit(&dp->dlock); 5936 5937 for (i = start; j < num; i = (i + 1) % len, j++) { 5938 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 5939 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 5940 5941 /* 5942 * If the last descriptor in a range has the ACK 5943 * bit set then we will get two messages from our 5944 * peer relating to it. The normal ACK msg and then 5945 * a subsequent STOP msg. The first message will have 5946 * resulted in the descriptor being reclaimed and 5947 * its state set to FREE so when we encounter a non 5948 * DONE descriptor we need to check to see if its 5949 * because we have just reclaimed it. 
5950 */ 5951 mutex_enter(&priv_addr->dstate_lock); 5952 if (pub_addr->hdr.dstate == VIO_DESC_DONE) { 5953 /* clear all the fields */ 5954 bzero(priv_addr->datap, priv_addr->datalen); 5955 priv_addr->datalen = 0; 5956 5957 pub_addr->hdr.dstate = VIO_DESC_FREE; 5958 pub_addr->hdr.ack = 0; 5959 5960 priv_addr->dstate = VIO_DESC_FREE; 5961 mutex_exit(&priv_addr->dstate_lock); 5962 5963 D3(vswp, "clearing descp %d : pub state " 5964 "0x%llx : priv state 0x%llx", i, 5965 pub_addr->hdr.dstate, 5966 priv_addr->dstate); 5967 5968 } else { 5969 mutex_exit(&priv_addr->dstate_lock); 5970 5971 if (dring_pkt->dring_process_state != 5972 VIO_DP_STOPPED) { 5973 DERR(vswp, "%s: descriptor %lld at pos " 5974 " 0x%llx not DONE (0x%lx)\n", 5975 __func__, i, pub_addr, 5976 pub_addr->hdr.dstate); 5977 RW_EXIT(&ldcp->lane_out.dlistrw); 5978 return; 5979 } 5980 } 5981 } 5982 5983 /* 5984 * If our peer is stopping processing descriptors then 5985 * we check to make sure it has processed all the descriptors 5986 * we have updated. If not then we send it a new message 5987 * to prompt it to restart. 5988 */ 5989 if (dring_pkt->dring_process_state == VIO_DP_STOPPED) { 5990 DTRACE_PROBE(stop_process_recv); 5991 D2(vswp, "%s(%lld): got stopping msg : %d : %d", 5992 __func__, ldcp->ldc_id, dring_pkt->start_idx, 5993 dring_pkt->end_idx); 5994 5995 /* 5996 * Check next descriptor in public section of ring. 5997 * If its marked as READY then we need to prompt our 5998 * peer to start processing the ring again. 5999 */ 6000 i = (end + 1) % len; 6001 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 6002 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 6003 6004 /* 6005 * Hold the restart lock across all of this to 6006 * make sure that its not possible for us to 6007 * decide that a msg needs to be sent in the future 6008 * but the sending code having already checked is 6009 * about to exit. 6010 */ 6011 mutex_enter(&dp->restart_lock); 6012 mutex_enter(&priv_addr->dstate_lock); 6013 if (pub_addr->hdr.dstate == VIO_DESC_READY) { 6014 6015 mutex_exit(&priv_addr->dstate_lock); 6016 6017 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; 6018 dring_pkt->tag.vio_sid = ldcp->local_session; 6019 6020 mutex_enter(&ldcp->lane_out.seq_lock); 6021 dring_pkt->seq_num = ldcp->lane_out.seq_num++; 6022 mutex_exit(&ldcp->lane_out.seq_lock); 6023 6024 dring_pkt->start_idx = (end + 1) % len; 6025 dring_pkt->end_idx = -1; 6026 6027 D2(vswp, "%s(%lld) : sending restart msg:" 6028 " %d : %d", __func__, ldcp->ldc_id, 6029 dring_pkt->start_idx, 6030 dring_pkt->end_idx); 6031 6032 msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt, 6033 sizeof (vio_dring_msg_t), B_FALSE); 6034 6035 } else { 6036 mutex_exit(&priv_addr->dstate_lock); 6037 dp->restart_reqd = B_TRUE; 6038 } 6039 mutex_exit(&dp->restart_lock); 6040 } 6041 RW_EXIT(&ldcp->lane_out.dlistrw); 6042 6043 /* only do channel reset after dropping dlistrw lock */ 6044 if (msg_rv == ECONNRESET) 6045 vsw_process_conn_evt(ldcp, VSW_CONN_RESET); 6046 6047 break; 6048 6049 case VIO_SUBTYPE_NACK: 6050 DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK", 6051 __func__, ldcp->ldc_id); 6052 /* 6053 * Something is badly wrong if we are getting NACK's 6054 * for our data pkts. So reset the channel. 
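 * (Strictly, VSW_CONN_RESTART is requested below rather than
 * VSW_CONN_RESET; vsw_conn_task() will first bring the channel down to
 * flush the underlying ldc queue before renegotiating.)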
6055 	 */
6056 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
6057
6058 		break;
6059
6060 	default:
6061 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
6062 		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
6063 	}
6064
6065 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
6066 }
6067
6068 /*
6069  * VIO_PKT_DATA (a.k.a. raw data mode)
6070  *
6071  * Note - currently not supported. Do nothing.
6072  */
6073 static void
6074 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
6075 {
6076 	_NOTE(ARGUNUSED(dpkt))
6077
6078 	D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
6079
6080 	DERR(NULL, "%s (%lld): currently not supported",
6081 	    __func__, ldcp->ldc_id);
6082
6083 	D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
6084 }
6085
6086 /*
6087  * Process an in-band descriptor message (most likely from
6088  * OBP).
6089  */
6090 static void
6091 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
6092 {
6093 	vnet_ibnd_desc_t	*ibnd_desc;
6094 	dring_info_t		*dp = NULL;
6095 	vsw_private_desc_t	*priv_addr = NULL;
6096 	vsw_t			*vswp = ldcp->ldc_vswp;
6097 	mblk_t			*mp = NULL;
6098 	size_t			nbytes = 0;
6099 	size_t			off = 0;
6100 	uint64_t		idx = 0;
6101 	uint32_t		num = 1, len, datalen = 0;
6102 	uint64_t		ncookies = 0;
6103 	int			i, rv;
6104 	int			j = 0;
6105
6106 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
6107
6108 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
6109
6110 	switch (ibnd_desc->hdr.tag.vio_subtype) {
6111 	case VIO_SUBTYPE_INFO:
6112 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
6113
6114 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
6115 			return;
6116
6117 		/*
6118 		 * Data is padded to align on an 8 byte boundary,
6119 		 * datalen is the actual data length, i.e. minus that
6120 		 * padding.
6121 		 */
6122 		datalen = ibnd_desc->nbytes;
6123
6124 		D2(vswp, "%s(%lld): processing inband desc : "
6125 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
6126
6127 		ncookies = ibnd_desc->ncookies;
6128
6129 		/*
6130 		 * allocb(9F) returns an aligned data block. We
6131 		 * need to ensure that we ask ldc for an aligned
6132 		 * number of bytes also.
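 *
 * For example, a hypothetical datalen of 1514 gives
 * nbytes = 1514 + (8 - (1514 & 0x7)) = 1520, a multiple of 8 as
 * required by ldc_mem_copy().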
6133 		 */
6134 		nbytes = datalen;
6135 		if (nbytes & 0x7) {
6136 			off = 8 - (nbytes & 0x7);
6137 			nbytes += off;
6138 		}
6139
6140 		mp = allocb(nbytes, BPRI_MED);
6141 		if (mp == NULL) {
6142 			DERR(vswp, "%s(%lld): allocb failed",
6143 			    __func__, ldcp->ldc_id);
6144 			return;
6145 		}
6146
6147 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
6148 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
6149 		    LDC_COPY_IN);
6150
6151 		if (rv != 0) {
6152 			DERR(vswp, "%s(%d): unable to copy in data from "
6153 			    "%d cookie(s)", __func__,
6154 			    ldcp->ldc_id, ncookies);
6155 			freemsg(mp);
6156 			return;
6157 		} else {
6158 			D2(vswp, "%s(%d): copied in %ld bytes using %d "
6159 			    "cookies", __func__, ldcp->ldc_id, nbytes,
6160 			    ncookies);
6161 		}
6162
6163 		/* point to the actual end of data */
6164 		mp->b_wptr = mp->b_rptr + datalen;
6165
6166 		/*
6167 		 * We ACK back every in-band descriptor message we process
6168 		 */
6169 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
6170 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
6171 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
6172 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
6173
6174 		/* send the packet to be switched */
6175 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
6176 		    ldcp->ldc_port, NULL);
6177
6178 		break;
6179
6180 	case VIO_SUBTYPE_ACK:
6181 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
6182
6183 		/* Verify the ACK is valid */
6184 		idx = ibnd_desc->hdr.desc_handle;
6185
6186 		if (idx >= VSW_RING_NUM_EL) {
6187 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
6188 			    "(idx %ld)", vswp->instance, idx);
6189 			return;
6190 		}
6191
6192 		if ((dp = ldcp->lane_out.dringp) == NULL) {
6193 			DERR(vswp, "%s: no dring found", __func__);
6194 			return;
6195 		}
6196
6197 		len = dp->num_descriptors;
6198 		/*
6199 		 * If the descriptor we are being ACK'ed for is not the
6200 		 * one we expected, then pkts were lost somewhere, either
6201 		 * when we tried to send a msg, or a previous ACK msg from
6202 		 * our peer. In either case we now reclaim the descriptors
6203 		 * in the range from the last ACK we received up to the
6204 		 * current ACK.
6205 		 */
6206 		if (idx != dp->last_ack_recv) {
6207 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
6208 			    __func__, dp->last_ack_recv, idx);
6209 			num = idx >= dp->last_ack_recv ?
6210 			    idx - dp->last_ack_recv + 1 :
6211 			    (len - dp->last_ack_recv + 1) + idx;
6212 		}
6213
6214 		/*
6215 		 * When we sent the in-band message to our peer we
6216 		 * marked the copy in our private ring as READY. We now
6217 		 * check that the descriptor we are being ACK'ed for is in
6218 		 * fact READY, i.e. it is one we have shared with our peer.
6219 		 *
6220 		 * If it's not we flag an error, but still reset the
6221 		 * descriptor back to FREE.
6222 		 */
6223 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
6224 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6225 			mutex_enter(&priv_addr->dstate_lock);
6226 			if (priv_addr->dstate != VIO_DESC_READY) {
6227 				DERR(vswp, "%s: (%ld) desc at index %ld not "
6228 				    "READY (0x%lx)", __func__,
6229 				    ldcp->ldc_id, idx, priv_addr->dstate);
6230 				DERR(vswp, "%s: bound %d: ncookies %ld : "
6231 				    "datalen %ld", __func__,
6232 				    priv_addr->bound, priv_addr->ncookies,
6233 				    priv_addr->datalen);
6234 			}
6235 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
6236 			    ldcp->ldc_id, idx);
6237 			/* release resources associated with sent msg */
6238 			bzero(priv_addr->datap, priv_addr->datalen);
6239 			priv_addr->datalen = 0;
6240 			priv_addr->dstate = VIO_DESC_FREE;
6241 			mutex_exit(&priv_addr->dstate_lock);
6242 		}
6243 		/* update to next expected value */
6244 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
6245 
6246 		break;
6247 
6248 	case VIO_SUBTYPE_NACK:
6249 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
6250 
6251 		/*
6252 		 * We should only get a NACK if our peer doesn't like
6253 		 * something about a message we have sent it. If this
6254 		 * happens we just release the resources associated with
6255 		 * the message. (We are relying on higher layers to decide
6256 		 * whether or not to resend.)
6257 		 */
6258 
6259 		/* limit check */
6260 		idx = ibnd_desc->hdr.desc_handle;
6261 
6262 		if (idx >= VSW_RING_NUM_EL) {
6263 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
6264 			    __func__, idx);
6265 			return;
6266 		}
6267 
6268 		if ((dp = ldcp->lane_out.dringp) == NULL) {
6269 			DERR(vswp, "%s: no dring found", __func__);
6270 			return;
6271 		}
6272 
6273 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
6274 
6275 		/* move to correct location in ring */
6276 		priv_addr += idx;
6277 
6278 		/* release resources associated with sent msg */
6279 		mutex_enter(&priv_addr->dstate_lock);
6280 		bzero(priv_addr->datap, priv_addr->datalen);
6281 		priv_addr->datalen = 0;
6282 		priv_addr->dstate = VIO_DESC_FREE;
6283 		mutex_exit(&priv_addr->dstate_lock);
6284 
6285 		break;
6286 
6287 	default:
6288 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
6289 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
6290 	}
6291 
6292 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
6293 }
6294 
6295 static void
6296 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
6297 {
6298 	_NOTE(ARGUNUSED(epkt))
6299 
6300 	vsw_t		*vswp = ldcp->ldc_vswp;
6301 	uint16_t	env = tag.vio_subtype_env;
6302 
6303 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
6304 
6305 	/*
6306 	 * Error vio_subtypes have yet to be defined. So for
6307 	 * the moment we can't do anything.
6308 	 */
6309 	D2(vswp, "%s: unhandled vio_subtype_env 0x%x", __func__, env);
6310 
6311 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
6312 }
6313 
6314 /*
6315  * Switch the given ethernet frame when operating in layer 2 mode.
6316  *
6317  * vswp: pointer to the vsw instance
6318  * mp: pointer to chain of ethernet frame(s) to be switched
6319  * caller: identifies the source of this frame as:
6320  *		1. VSW_VNETPORT - a vsw port (connected to a vnet).
6321  *		2. VSW_PHYSDEV - the physical ethernet device
6322  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
6323  * arg: argument provided by the caller.
6324  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
6325  *		2. for PHYSDEV - NULL
6326  *		3. for LOCALDEV - pointer to this vsw_t (self)
6327  */
6328 void
6329 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
6330 	vsw_port_t *arg, mac_resource_handle_t mrh)
6331 {
6332 	struct ether_header	*ehp;
6333 	vsw_port_t		*port = NULL;
6334 	mblk_t			*bp, *ret_m;
6335 	mblk_t			*nmp = NULL;
6336 	vsw_port_list_t		*plist = &vswp->plist;
6337 
6338 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
6339 
6340 	/*
6341 	 * PERF: rather than breaking up the chain here, scan it
6342 	 * to find all mblks heading to same destination and then
6343 	 * pass that sub-chain to the lower transmit functions.
6344 	 */
6345 
6346 	/* process the chain of packets */
6347 	bp = mp;
6348 	while (bp) {
6349 		mp = bp;
6350 		bp = bp->b_next;
6351 		mp->b_next = mp->b_prev = NULL;
6352 		ehp = (struct ether_header *)mp->b_rptr;
6353 
6354 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
6355 		    __func__, MBLKSIZE(mp), MBLKL(mp));
6356 
6357 		READ_ENTER(&vswp->if_lockrw);
6358 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
6359 			/*
6360 			 * If destination is VSW_LOCALDEV (vsw as an eth
6361 			 * interface) and if the device is up & running,
6362 			 * send the packet up the stack on this host.
6363 			 * If the virtual interface is down, drop the packet.
6364 			 */
6365 			if (caller != VSW_LOCALDEV) {
6366 				if (vswp->if_state & VSW_IF_UP) {
6367 					RW_EXIT(&vswp->if_lockrw);
6368 					mac_rx(vswp->if_mh, mrh, mp);
6369 				} else {
6370 					RW_EXIT(&vswp->if_lockrw);
6371 					/* Interface down, drop pkt */
6372 					freemsg(mp);
6373 				}
6374 			} else {
6375 				RW_EXIT(&vswp->if_lockrw);
6376 				freemsg(mp);
6377 			}
6378 			continue;
6379 		}
6380 		RW_EXIT(&vswp->if_lockrw);
6381 
6382 		READ_ENTER(&plist->lockrw);
6383 		port = vsw_lookup_fdb(vswp, ehp);
6384 		if (port) {
6385 			/*
6386 			 * Mark the port as in-use.
6387 			 */
6388 			mutex_enter(&port->ref_lock);
6389 			port->ref_cnt++;
6390 			mutex_exit(&port->ref_lock);
6391 			RW_EXIT(&plist->lockrw);
6392 
6393 			/*
6394 			 * If plumbed and in promisc mode then copy msg
6395 			 * and send up the stack.
6396 			 */
6397 			READ_ENTER(&vswp->if_lockrw);
6398 			if (VSW_U_P(vswp->if_state)) {
6399 				RW_EXIT(&vswp->if_lockrw);
6400 				nmp = copymsg(mp);
6401 				if (nmp)
6402 					mac_rx(vswp->if_mh, mrh, nmp);
6403 			} else {
6404 				RW_EXIT(&vswp->if_lockrw);
6405 			}
6406 
6407 			/*
6408 			 * If the destination is in FDB, the packet
6409 			 * should be forwarded to the corresponding
6410 			 * vsw_port (connected to a vnet device -
6411 			 * VSW_VNETPORT)
6412 			 */
6413 			(void) vsw_portsend(port, mp);
6414 
6415 			/*
6416 			 * Decrement use count in port and check if
6417 			 * should wake delete thread.
6418 			 */
6419 			mutex_enter(&port->ref_lock);
6420 			port->ref_cnt--;
6421 			if (port->ref_cnt == 0)
6422 				cv_signal(&port->ref_cv);
6423 			mutex_exit(&port->ref_lock);
6424 		} else {
6425 			RW_EXIT(&plist->lockrw);
6426 			/*
6427 			 * Destination not in FDB.
6428 			 *
6429 			 * If the destination is broadcast or
6430 			 * multicast forward the packet to all
6431 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
6432 			 * except the caller.
6433 			 */
6434 			if (IS_BROADCAST(ehp)) {
6435 				D3(vswp, "%s: BROADCAST pkt", __func__);
6436 				(void) vsw_forward_all(vswp, mp,
6437 				    caller, arg);
6438 			} else if (IS_MULTICAST(ehp)) {
6439 				D3(vswp, "%s: MULTICAST pkt", __func__);
6440 				(void) vsw_forward_grp(vswp, mp,
6441 				    caller, arg);
6442 			} else {
6443 				/*
6444 				 * If the destination is unicast, and came
6445 				 * from either a logical network device or
6446 				 * the switch itself when it is plumbed, then
6447 				 * send it out on the physical device and also
6448 				 * up the stack if the logical interface is
6449 				 * in promiscuous mode.
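				 *
				 * Illustrative refactoring sketch (not in
				 * the original driver): this "copy up the
				 * stack if plumbed and promiscuous" step,
				 * which also appears twice above and in
				 * vsw_forward_grp() below, could be factored
				 * into a hypothetical helper built solely
				 * from calls that appear in this file:
				 *
				 *	static void
				 *	vsw_promisc_copy_up(vsw_t *vswp,
				 *	    mac_resource_handle_t mrh,
				 *	    mblk_t *mp)
				 *	{
				 *		mblk_t *nmp;
				 *
				 *		READ_ENTER(&vswp->if_lockrw);
				 *		if (VSW_U_P(vswp->if_state)) {
				 *			RW_EXIT(&vswp->if_lockrw);
				 *			if ((nmp = copymsg(mp))
				 *			    != NULL)
				 *				mac_rx(vswp->if_mh,
				 *				    mrh, nmp);
				 *		} else {
				 *			RW_EXIT(&vswp->if_lockrw);
				 *		}
				 *	}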
6450 * 6451 * NOTE: The assumption here is that if we 6452 * cannot find the destination in our fdb, its 6453 * a unicast address, and came from either a 6454 * vnet or down the stack (when plumbed) it 6455 * must be destinded for an ethernet device 6456 * outside our ldoms. 6457 */ 6458 if (caller == VSW_VNETPORT) { 6459 READ_ENTER(&vswp->if_lockrw); 6460 if (VSW_U_P(vswp->if_state)) { 6461 RW_EXIT(&vswp->if_lockrw); 6462 nmp = copymsg(mp); 6463 if (nmp) 6464 mac_rx(vswp->if_mh, 6465 mrh, nmp); 6466 } else { 6467 RW_EXIT(&vswp->if_lockrw); 6468 } 6469 if ((ret_m = vsw_tx_msg(vswp, mp)) 6470 != NULL) { 6471 DERR(vswp, "%s: drop mblks to " 6472 "phys dev", __func__); 6473 freemsg(ret_m); 6474 } 6475 6476 } else if (caller == VSW_PHYSDEV) { 6477 /* 6478 * Pkt seen because card in promisc 6479 * mode. Send up stack if plumbed in 6480 * promisc mode, else drop it. 6481 */ 6482 READ_ENTER(&vswp->if_lockrw); 6483 if (VSW_U_P(vswp->if_state)) { 6484 RW_EXIT(&vswp->if_lockrw); 6485 mac_rx(vswp->if_mh, mrh, mp); 6486 } else { 6487 RW_EXIT(&vswp->if_lockrw); 6488 freemsg(mp); 6489 } 6490 6491 } else if (caller == VSW_LOCALDEV) { 6492 /* 6493 * Pkt came down the stack, send out 6494 * over physical device. 6495 */ 6496 if ((ret_m = vsw_tx_msg(vswp, mp)) 6497 != NULL) { 6498 DERR(vswp, "%s: drop mblks to " 6499 "phys dev", __func__); 6500 freemsg(ret_m); 6501 } 6502 } 6503 } 6504 } 6505 } 6506 D1(vswp, "%s: exit\n", __func__); 6507 } 6508 6509 /* 6510 * Switch ethernet frame when in layer 3 mode (i.e. using IP 6511 * layer to do the routing). 6512 * 6513 * There is a large amount of overlap between this function and 6514 * vsw_switch_l2_frame. At some stage we need to revisit and refactor 6515 * both these functions. 6516 */ 6517 void 6518 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, 6519 vsw_port_t *arg, mac_resource_handle_t mrh) 6520 { 6521 struct ether_header *ehp; 6522 vsw_port_t *port = NULL; 6523 mblk_t *bp = NULL; 6524 vsw_port_list_t *plist = &vswp->plist; 6525 6526 D1(vswp, "%s: enter (caller %d)", __func__, caller); 6527 6528 /* 6529 * In layer 3 mode should only ever be switching packets 6530 * between IP layer and vnet devices. So make sure thats 6531 * who is invoking us. 6532 */ 6533 if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) { 6534 DERR(vswp, "%s: unexpected caller (%d)", __func__, caller); 6535 freemsgchain(mp); 6536 return; 6537 } 6538 6539 /* process the chain of packets */ 6540 bp = mp; 6541 while (bp) { 6542 mp = bp; 6543 bp = bp->b_next; 6544 mp->b_next = mp->b_prev = NULL; 6545 ehp = (struct ether_header *)mp->b_rptr; 6546 6547 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 6548 __func__, MBLKSIZE(mp), MBLKL(mp)); 6549 6550 READ_ENTER(&plist->lockrw); 6551 port = vsw_lookup_fdb(vswp, ehp); 6552 if (port) { 6553 /* 6554 * Mark port as in-use. 6555 */ 6556 mutex_enter(&port->ref_lock); 6557 port->ref_cnt++; 6558 mutex_exit(&port->ref_lock); 6559 RW_EXIT(&plist->lockrw); 6560 6561 D2(vswp, "%s: sending to target port", __func__); 6562 (void) vsw_portsend(port, mp); 6563 6564 /* 6565 * Finished with port so decrement ref count and 6566 * check if should wake delete thread. 
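			 *
			 * Illustrative sketch (not in the original driver):
			 * the ref_cnt handling used here and in
			 * vsw_switch_l2_frame() could be wrapped in a pair
			 * of hypothetical helpers:
			 *
			 *	static void
			 *	vsw_port_hold(vsw_port_t *port)
			 *	{
			 *		mutex_enter(&port->ref_lock);
			 *		port->ref_cnt++;
			 *		mutex_exit(&port->ref_lock);
			 *	}
			 *
			 *	static void
			 *	vsw_port_release(vsw_port_t *port)
			 *	{
			 *		mutex_enter(&port->ref_lock);
			 *		if (--port->ref_cnt == 0)
			 *			cv_signal(&port->ref_cv);
			 *		mutex_exit(&port->ref_lock);
			 *	}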
6567 */ 6568 mutex_enter(&port->ref_lock); 6569 port->ref_cnt--; 6570 if (port->ref_cnt == 0) 6571 cv_signal(&port->ref_cv); 6572 mutex_exit(&port->ref_lock); 6573 } else { 6574 RW_EXIT(&plist->lockrw); 6575 /* 6576 * Destination not in FDB 6577 * 6578 * If the destination is broadcast or 6579 * multicast forward the packet to all 6580 * (VNETPORTs, PHYSDEV, LOCALDEV), 6581 * except the caller. 6582 */ 6583 if (IS_BROADCAST(ehp)) { 6584 D2(vswp, "%s: BROADCAST pkt", __func__); 6585 (void) vsw_forward_all(vswp, mp, 6586 caller, arg); 6587 } else if (IS_MULTICAST(ehp)) { 6588 D2(vswp, "%s: MULTICAST pkt", __func__); 6589 (void) vsw_forward_grp(vswp, mp, 6590 caller, arg); 6591 } else { 6592 /* 6593 * Unicast pkt from vnet that we don't have 6594 * an FDB entry for, so must be destinded for 6595 * the outside world. Attempt to send up to the 6596 * IP layer to allow it to deal with it. 6597 */ 6598 if (caller == VSW_VNETPORT) { 6599 READ_ENTER(&vswp->if_lockrw); 6600 if (vswp->if_state & VSW_IF_UP) { 6601 RW_EXIT(&vswp->if_lockrw); 6602 D2(vswp, "%s: sending up", 6603 __func__); 6604 mac_rx(vswp->if_mh, mrh, mp); 6605 } else { 6606 RW_EXIT(&vswp->if_lockrw); 6607 /* Interface down, drop pkt */ 6608 D2(vswp, "%s I/F down", 6609 __func__); 6610 freemsg(mp); 6611 } 6612 } 6613 } 6614 } 6615 } 6616 6617 D1(vswp, "%s: exit", __func__); 6618 } 6619 6620 /* 6621 * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV), 6622 * except the caller (port on which frame arrived). 6623 */ 6624 static int 6625 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 6626 { 6627 vsw_port_list_t *plist = &vswp->plist; 6628 vsw_port_t *portp; 6629 mblk_t *nmp = NULL; 6630 mblk_t *ret_m = NULL; 6631 int skip_port = 0; 6632 6633 D1(vswp, "vsw_forward_all: enter\n"); 6634 6635 /* 6636 * Broadcast message from inside ldoms so send to outside 6637 * world if in either of layer 2 modes. 6638 */ 6639 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 6640 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 6641 ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) { 6642 6643 nmp = dupmsg(mp); 6644 if (nmp) { 6645 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 6646 DERR(vswp, "%s: dropping pkt(s) " 6647 "consisting of %ld bytes of data for" 6648 " physical device", __func__, MBLKL(ret_m)); 6649 freemsg(ret_m); 6650 } 6651 } 6652 } 6653 6654 if (caller == VSW_VNETPORT) 6655 skip_port = 1; 6656 6657 /* 6658 * Broadcast message from other vnet (layer 2 or 3) or outside 6659 * world (layer 2 only), send up stack if plumbed. 6660 */ 6661 if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) { 6662 READ_ENTER(&vswp->if_lockrw); 6663 if (vswp->if_state & VSW_IF_UP) { 6664 RW_EXIT(&vswp->if_lockrw); 6665 nmp = copymsg(mp); 6666 if (nmp) 6667 mac_rx(vswp->if_mh, NULL, nmp); 6668 } else { 6669 RW_EXIT(&vswp->if_lockrw); 6670 } 6671 } 6672 6673 /* send it to all VNETPORTs */ 6674 READ_ENTER(&plist->lockrw); 6675 for (portp = plist->head; portp != NULL; portp = portp->p_next) { 6676 D2(vswp, "vsw_forward_all: port %d", portp->p_instance); 6677 /* 6678 * Caution ! - don't reorder these two checks as arg 6679 * will be NULL if the caller is PHYSDEV. skip_port is 6680 * only set if caller is VNETPORT. 
6681 */ 6682 if ((skip_port) && (portp == arg)) 6683 continue; 6684 else { 6685 nmp = dupmsg(mp); 6686 if (nmp) { 6687 (void) vsw_portsend(portp, nmp); 6688 } else { 6689 DERR(vswp, "vsw_forward_all: nmp NULL"); 6690 } 6691 } 6692 } 6693 RW_EXIT(&plist->lockrw); 6694 6695 freemsg(mp); 6696 6697 D1(vswp, "vsw_forward_all: exit\n"); 6698 return (0); 6699 } 6700 6701 /* 6702 * Forward pkts to any devices or interfaces which have registered 6703 * an interest in them (i.e. multicast groups). 6704 */ 6705 static int 6706 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 6707 { 6708 struct ether_header *ehp = (struct ether_header *)mp->b_rptr; 6709 mfdb_ent_t *entp = NULL; 6710 mfdb_ent_t *tpp = NULL; 6711 vsw_port_t *port; 6712 uint64_t key = 0; 6713 mblk_t *nmp = NULL; 6714 mblk_t *ret_m = NULL; 6715 boolean_t check_if = B_TRUE; 6716 6717 /* 6718 * Convert address to hash table key 6719 */ 6720 KEY_HASH(key, ehp->ether_dhost); 6721 6722 D1(vswp, "%s: key 0x%llx", __func__, key); 6723 6724 /* 6725 * If pkt came from either a vnet or down the stack (if we are 6726 * plumbed) and we are in layer 2 mode, then we send the pkt out 6727 * over the physical adapter, and then check to see if any other 6728 * vnets are interested in it. 6729 */ 6730 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 6731 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 6732 ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) { 6733 nmp = dupmsg(mp); 6734 if (nmp) { 6735 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 6736 DERR(vswp, "%s: dropping pkt(s) " 6737 "consisting of %ld bytes of " 6738 "data for physical device", 6739 __func__, MBLKL(ret_m)); 6740 freemsg(ret_m); 6741 } 6742 } 6743 } 6744 6745 READ_ENTER(&vswp->mfdbrw); 6746 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key, 6747 (mod_hash_val_t *)&entp) != 0) { 6748 D3(vswp, "%s: no table entry found for addr 0x%llx", 6749 __func__, key); 6750 } else { 6751 /* 6752 * Send to list of devices associated with this address... 6753 */ 6754 for (tpp = entp; tpp != NULL; tpp = tpp->nextp) { 6755 6756 /* dont send to ourselves */ 6757 if ((caller == VSW_VNETPORT) && 6758 (tpp->d_addr == (void *)arg)) { 6759 port = (vsw_port_t *)tpp->d_addr; 6760 D3(vswp, "%s: not sending to ourselves" 6761 " : port %d", __func__, 6762 port->p_instance); 6763 continue; 6764 6765 } else if ((caller == VSW_LOCALDEV) && 6766 (tpp->d_type == VSW_LOCALDEV)) { 6767 D3(vswp, "%s: not sending back up stack", 6768 __func__); 6769 continue; 6770 } 6771 6772 if (tpp->d_type == VSW_VNETPORT) { 6773 port = (vsw_port_t *)tpp->d_addr; 6774 D3(vswp, "%s: sending to port %ld for " 6775 " addr 0x%llx", __func__, 6776 port->p_instance, key); 6777 6778 nmp = dupmsg(mp); 6779 if (nmp) 6780 (void) vsw_portsend(port, nmp); 6781 } else { 6782 if (vswp->if_state & VSW_IF_UP) { 6783 nmp = copymsg(mp); 6784 if (nmp) 6785 mac_rx(vswp->if_mh, NULL, nmp); 6786 check_if = B_FALSE; 6787 D3(vswp, "%s: sending up stack" 6788 " for addr 0x%llx", __func__, 6789 key); 6790 } 6791 } 6792 } 6793 } 6794 6795 RW_EXIT(&vswp->mfdbrw); 6796 6797 /* 6798 * If the pkt came from either a vnet or from physical device, 6799 * and if we havent already sent the pkt up the stack then we 6800 * check now if we can/should (i.e. the interface is plumbed 6801 * and in promisc mode). 
6802 */ 6803 if ((check_if) && 6804 ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) { 6805 READ_ENTER(&vswp->if_lockrw); 6806 if (VSW_U_P(vswp->if_state)) { 6807 RW_EXIT(&vswp->if_lockrw); 6808 D3(vswp, "%s: (caller %d) finally sending up stack" 6809 " for addr 0x%llx", __func__, caller, key); 6810 nmp = copymsg(mp); 6811 if (nmp) 6812 mac_rx(vswp->if_mh, NULL, nmp); 6813 } else { 6814 RW_EXIT(&vswp->if_lockrw); 6815 } 6816 } 6817 6818 freemsg(mp); 6819 6820 D1(vswp, "%s: exit", __func__); 6821 6822 return (0); 6823 } 6824 6825 /* transmit the packet over the given port */ 6826 static int 6827 vsw_portsend(vsw_port_t *port, mblk_t *mp) 6828 { 6829 vsw_ldc_list_t *ldcl = &port->p_ldclist; 6830 vsw_ldc_t *ldcp; 6831 int status = 0; 6832 6833 6834 READ_ENTER(&ldcl->lockrw); 6835 /* 6836 * Note for now, we have a single channel. 6837 */ 6838 ldcp = ldcl->head; 6839 if (ldcp == NULL) { 6840 DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n"); 6841 freemsg(mp); 6842 RW_EXIT(&ldcl->lockrw); 6843 return (1); 6844 } 6845 6846 /* 6847 * Send the message out using the appropriate 6848 * transmit function which will free mblock when it 6849 * is finished with it. 6850 */ 6851 mutex_enter(&port->tx_lock); 6852 if (port->transmit != NULL) 6853 status = (*port->transmit)(ldcp, mp); 6854 else { 6855 freemsg(mp); 6856 } 6857 mutex_exit(&port->tx_lock); 6858 6859 RW_EXIT(&ldcl->lockrw); 6860 6861 return (status); 6862 } 6863 6864 /* 6865 * Send packet out via descriptor ring to a logical device. 6866 */ 6867 static int 6868 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp) 6869 { 6870 vio_dring_msg_t dring_pkt; 6871 dring_info_t *dp = NULL; 6872 vsw_private_desc_t *priv_desc = NULL; 6873 vnet_public_desc_t *pub = NULL; 6874 vsw_t *vswp = ldcp->ldc_vswp; 6875 mblk_t *bp; 6876 size_t n, size; 6877 caddr_t bufp; 6878 int idx; 6879 int status = LDC_TX_SUCCESS; 6880 6881 D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id); 6882 6883 /* TODO: make test a macro */ 6884 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 6885 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 6886 DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping " 6887 "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status, 6888 ldcp->lane_out.lstate); 6889 freemsg(mp); 6890 return (LDC_TX_FAILURE); 6891 } 6892 6893 /* 6894 * Note - using first ring only, this may change 6895 * in the future. 6896 */ 6897 READ_ENTER(&ldcp->lane_out.dlistrw); 6898 if ((dp = ldcp->lane_out.dringp) == NULL) { 6899 RW_EXIT(&ldcp->lane_out.dlistrw); 6900 DERR(vswp, "%s(%lld): no dring for outbound lane on" 6901 " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id); 6902 freemsg(mp); 6903 return (LDC_TX_FAILURE); 6904 } 6905 6906 size = msgsize(mp); 6907 if (size > (size_t)ETHERMAX) { 6908 RW_EXIT(&ldcp->lane_out.dlistrw); 6909 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 6910 ldcp->ldc_id, size); 6911 freemsg(mp); 6912 return (LDC_TX_FAILURE); 6913 } 6914 6915 /* 6916 * Find a free descriptor 6917 * 6918 * Note: for the moment we are assuming that we will only 6919 * have one dring going from the switch to each of its 6920 * peers. This may change in the future. 
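	 *
	 * Illustrative note (not in the original driver): the
	 * "TODO: make test a macro" at the top of this routine could be
	 * satisfied with something like
	 *
	 *	#define	VSW_LANE_TX_READY(ldcp)				\
	 *		(((ldcp)->lane_out.lstate & VSW_LANE_ACTIVE) &&	\
	 *		((ldcp)->ldc_status == LDC_UP) &&		\
	 *		((ldcp)->ldc_handle != NULL))
	 *
	 * i.e. the negation of the drop test used both here and in
	 * vsw_descrsend() below.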
6921 */ 6922 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 6923 D2(vswp, "%s(%lld): no descriptor available for ring " 6924 "at 0x%llx", __func__, ldcp->ldc_id, dp); 6925 6926 /* nothing more we can do */ 6927 status = LDC_TX_NORESOURCES; 6928 goto vsw_dringsend_free_exit; 6929 } else { 6930 D2(vswp, "%s(%lld): free private descriptor found at pos " 6931 "%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx, 6932 priv_desc); 6933 } 6934 6935 /* copy data into the descriptor */ 6936 bufp = priv_desc->datap; 6937 bufp += VNET_IPALIGN; 6938 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 6939 n = MBLKL(bp); 6940 bcopy(bp->b_rptr, bufp, n); 6941 bufp += n; 6942 } 6943 6944 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 6945 6946 pub = priv_desc->descp; 6947 pub->nbytes = priv_desc->datalen; 6948 6949 mutex_enter(&priv_desc->dstate_lock); 6950 pub->hdr.dstate = VIO_DESC_READY; 6951 mutex_exit(&priv_desc->dstate_lock); 6952 6953 /* 6954 * Determine whether or not we need to send a message to our 6955 * peer prompting them to read our newly updated descriptor(s). 6956 */ 6957 mutex_enter(&dp->restart_lock); 6958 if (dp->restart_reqd) { 6959 dp->restart_reqd = B_FALSE; 6960 mutex_exit(&dp->restart_lock); 6961 6962 /* 6963 * Send a vio_dring_msg to peer to prompt them to read 6964 * the updated descriptor ring. 6965 */ 6966 dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA; 6967 dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 6968 dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA; 6969 dring_pkt.tag.vio_sid = ldcp->local_session; 6970 6971 /* Note - for now using first ring */ 6972 dring_pkt.dring_ident = dp->ident; 6973 6974 mutex_enter(&ldcp->lane_out.seq_lock); 6975 dring_pkt.seq_num = ldcp->lane_out.seq_num++; 6976 mutex_exit(&ldcp->lane_out.seq_lock); 6977 6978 /* 6979 * If last_ack_recv is -1 then we know we've not 6980 * received any ack's yet, so this must be the first 6981 * msg sent, so set the start to the begining of the ring. 6982 */ 6983 mutex_enter(&dp->dlock); 6984 if (dp->last_ack_recv == -1) { 6985 dring_pkt.start_idx = 0; 6986 } else { 6987 dring_pkt.start_idx = (dp->last_ack_recv + 1) % 6988 dp->num_descriptors; 6989 } 6990 dring_pkt.end_idx = -1; 6991 mutex_exit(&dp->dlock); 6992 6993 D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__, 6994 ldcp->ldc_id, dp, dring_pkt.dring_ident); 6995 D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n", 6996 __func__, ldcp->ldc_id, dring_pkt.start_idx, 6997 dring_pkt.end_idx, dring_pkt.seq_num); 6998 6999 RW_EXIT(&ldcp->lane_out.dlistrw); 7000 7001 (void) vsw_send_msg(ldcp, (void *)&dring_pkt, 7002 sizeof (vio_dring_msg_t), B_TRUE); 7003 7004 /* free the message block */ 7005 freemsg(mp); 7006 return (status); 7007 7008 } else { 7009 mutex_exit(&dp->restart_lock); 7010 D2(vswp, "%s(%lld): updating descp %d", __func__, 7011 ldcp->ldc_id, idx); 7012 } 7013 7014 vsw_dringsend_free_exit: 7015 7016 RW_EXIT(&ldcp->lane_out.dlistrw); 7017 7018 /* free the message block */ 7019 freemsg(mp); 7020 7021 D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); 7022 return (status); 7023 } 7024 7025 /* 7026 * Send an in-band descriptor message over ldc. 
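 *
 * (An illustrative dispatch sketch follows; it is not part of the
 * original driver.)
 */

/*
 * Illustrative sketch only: vsw_portsend() above dispatches through
 * port->transmit, which the handshake code (outside this extract)
 * presumably binds to one of the two transmit routines according to
 * the negotiated transfer mode. The helper name below is
 * hypothetical.
 */
static void
vsw_bind_transmit(vsw_port_t *port, vsw_ldc_t *ldcp)
{
	mutex_enter(&port->tx_lock);
	if (ldcp->lane_out.xfer_mode == VIO_DRING_MODE)
		port->transmit = vsw_dringsend;		/* dring mode */
	else if (ldcp->lane_out.xfer_mode == VIO_DESC_MODE)
		port->transmit = vsw_descrsend;		/* in-band mode */
	mutex_exit(&port->tx_lock);
}

/*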
7027 */ 7028 static int 7029 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp) 7030 { 7031 vsw_t *vswp = ldcp->ldc_vswp; 7032 vnet_ibnd_desc_t ibnd_msg; 7033 vsw_private_desc_t *priv_desc = NULL; 7034 dring_info_t *dp = NULL; 7035 size_t n, size = 0; 7036 caddr_t bufp; 7037 mblk_t *bp; 7038 int idx, i; 7039 int status = LDC_TX_SUCCESS; 7040 static int warn_msg = 1; 7041 7042 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 7043 7044 ASSERT(mp != NULL); 7045 7046 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 7047 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 7048 DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt", 7049 __func__, ldcp->ldc_id, ldcp->ldc_status, 7050 ldcp->lane_out.lstate); 7051 freemsg(mp); 7052 return (LDC_TX_FAILURE); 7053 } 7054 7055 /* 7056 * only expect single dring to exist, which we use 7057 * as an internal buffer, rather than a transfer channel. 7058 */ 7059 READ_ENTER(&ldcp->lane_out.dlistrw); 7060 if ((dp = ldcp->lane_out.dringp) == NULL) { 7061 DERR(vswp, "%s(%lld): no dring for outbound lane", 7062 __func__, ldcp->ldc_id); 7063 DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", 7064 __func__, ldcp->ldc_id, ldcp->ldc_status, 7065 ldcp->lane_out.lstate); 7066 RW_EXIT(&ldcp->lane_out.dlistrw); 7067 freemsg(mp); 7068 return (LDC_TX_FAILURE); 7069 } 7070 7071 size = msgsize(mp); 7072 if (size > (size_t)ETHERMAX) { 7073 RW_EXIT(&ldcp->lane_out.dlistrw); 7074 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 7075 ldcp->ldc_id, size); 7076 freemsg(mp); 7077 return (LDC_TX_FAILURE); 7078 } 7079 7080 /* 7081 * Find a free descriptor in our buffer ring 7082 */ 7083 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 7084 RW_EXIT(&ldcp->lane_out.dlistrw); 7085 if (warn_msg) { 7086 DERR(vswp, "%s(%lld): no descriptor available for ring " 7087 "at 0x%llx", __func__, ldcp->ldc_id, dp); 7088 warn_msg = 0; 7089 } 7090 7091 /* nothing more we can do */ 7092 status = LDC_TX_NORESOURCES; 7093 goto vsw_descrsend_free_exit; 7094 } else { 7095 D2(vswp, "%s(%lld): free private descriptor found at pos " 7096 "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, 7097 priv_desc); 7098 warn_msg = 1; 7099 } 7100 7101 /* copy data into the descriptor */ 7102 bufp = priv_desc->datap; 7103 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 7104 n = MBLKL(bp); 7105 bcopy(bp->b_rptr, bufp, n); 7106 bufp += n; 7107 } 7108 7109 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 7110 7111 /* create and send the in-band descp msg */ 7112 ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA; 7113 ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO; 7114 ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA; 7115 ibnd_msg.hdr.tag.vio_sid = ldcp->local_session; 7116 7117 mutex_enter(&ldcp->lane_out.seq_lock); 7118 ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++; 7119 mutex_exit(&ldcp->lane_out.seq_lock); 7120 7121 /* 7122 * Copy the mem cookies describing the data from the 7123 * private region of the descriptor ring into the inband 7124 * descriptor. 
7125 	 */
7126 	for (i = 0; i < priv_desc->ncookies; i++) {
7127 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
7128 		    sizeof (ldc_mem_cookie_t));
7129 	}
7130 
7131 	ibnd_msg.hdr.desc_handle = idx;
7132 	ibnd_msg.ncookies = priv_desc->ncookies;
7133 	ibnd_msg.nbytes = size;
7134 
7135 	RW_EXIT(&ldcp->lane_out.dlistrw);
7136 
7137 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
7138 	    sizeof (vnet_ibnd_desc_t), B_TRUE);
7139 
7140 vsw_descrsend_free_exit:
7141 
7142 	/* free the allocated message blocks */
7143 	freemsg(mp);
7144 
7145 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
7146 	return (status);
7147 }
7148 
7149 static void
7150 vsw_send_ver(void *arg)
7151 {
7152 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
7153 	vsw_t		*vswp = ldcp->ldc_vswp;
7154 	lane_t		*lp = &ldcp->lane_out;
7155 	vio_ver_msg_t	ver_msg;
7156 
7157 	D1(vswp, "%s enter", __func__);
7158 
7159 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7160 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7161 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
7162 	ver_msg.tag.vio_sid = ldcp->local_session;
7163 
7164 	ver_msg.ver_major = vsw_versions[0].ver_major;
7165 	ver_msg.ver_minor = vsw_versions[0].ver_minor;
7166 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
7167 
7168 	lp->lstate |= VSW_VER_INFO_SENT;
7169 	lp->ver_major = ver_msg.ver_major;
7170 	lp->ver_minor = ver_msg.ver_minor;
7171 
7172 	DUMP_TAG(ver_msg.tag);
7173 
7174 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
7175 
7176 	D1(vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
7177 }
7178 
7179 static void
7180 vsw_send_attr(vsw_ldc_t *ldcp)
7181 {
7182 	vsw_t		*vswp = ldcp->ldc_vswp;
7183 	lane_t		*lp = &ldcp->lane_out;
7184 	vnet_attr_msg_t	attr_msg;
7185 
7186 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
7187 
7188 	/*
7189 	 * Subtype is set to INFO by default
7190 	 */
7191 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7192 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7193 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
7194 	attr_msg.tag.vio_sid = ldcp->local_session;
7195 
7196 	/* payload copied from default settings for lane */
7197 	attr_msg.mtu = lp->mtu;
7198 	attr_msg.addr_type = lp->addr_type;
7199 	attr_msg.xfer_mode = lp->xfer_mode;
7200 	attr_msg.ack_freq = lp->ack_freq;
7201 
7202 	READ_ENTER(&vswp->if_lockrw);
7203 	bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL);
7204 	RW_EXIT(&vswp->if_lockrw);
7205 
7206 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
7207 
7208 	DUMP_TAG(attr_msg.tag);
7209 
7210 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
7211 
7212 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
7213 }
7214 
7215 /*
7216  * Create dring info msg (which also results in the creation of
7217  * a dring).
7218  */
7219 static vio_dring_reg_msg_t *
7220 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
7221 {
7222 	vio_dring_reg_msg_t	*mp;
7223 	dring_info_t		*dp;
7224 	vsw_t			*vswp = ldcp->ldc_vswp;
7225 
7226 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
7227 
7228 	/*
7229 	 * If we can't create a dring, obviously no point sending
7230 	 * a message.
7231 */ 7232 if ((dp = vsw_create_dring(ldcp)) == NULL) 7233 return (NULL); 7234 7235 mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP); 7236 7237 mp->tag.vio_msgtype = VIO_TYPE_CTRL; 7238 mp->tag.vio_subtype = VIO_SUBTYPE_INFO; 7239 mp->tag.vio_subtype_env = VIO_DRING_REG; 7240 mp->tag.vio_sid = ldcp->local_session; 7241 7242 /* payload */ 7243 mp->num_descriptors = dp->num_descriptors; 7244 mp->descriptor_size = dp->descriptor_size; 7245 mp->options = dp->options; 7246 mp->ncookies = dp->ncookies; 7247 bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t)); 7248 7249 mp->dring_ident = 0; 7250 7251 D1(vswp, "vsw_create_dring_info_pkt exit\n"); 7252 7253 return (mp); 7254 } 7255 7256 static void 7257 vsw_send_dring_info(vsw_ldc_t *ldcp) 7258 { 7259 vio_dring_reg_msg_t *dring_msg; 7260 vsw_t *vswp = ldcp->ldc_vswp; 7261 7262 D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id); 7263 7264 dring_msg = vsw_create_dring_info_pkt(ldcp); 7265 if (dring_msg == NULL) { 7266 cmn_err(CE_WARN, "!vsw%d: %s: error creating msg", 7267 vswp->instance, __func__); 7268 return; 7269 } 7270 7271 ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT; 7272 7273 DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg); 7274 7275 (void) vsw_send_msg(ldcp, dring_msg, 7276 sizeof (vio_dring_reg_msg_t), B_TRUE); 7277 7278 kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t)); 7279 7280 D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id); 7281 } 7282 7283 static void 7284 vsw_send_rdx(vsw_ldc_t *ldcp) 7285 { 7286 vsw_t *vswp = ldcp->ldc_vswp; 7287 vio_rdx_msg_t rdx_msg; 7288 7289 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); 7290 7291 rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 7292 rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 7293 rdx_msg.tag.vio_subtype_env = VIO_RDX; 7294 rdx_msg.tag.vio_sid = ldcp->local_session; 7295 7296 ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT; 7297 7298 DUMP_TAG(rdx_msg.tag); 7299 7300 (void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE); 7301 7302 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); 7303 } 7304 7305 /* 7306 * Generic routine to send message out over ldc channel. 7307 * 7308 * It is possible that when we attempt to write over the ldc channel 7309 * that we get notified that it has been reset. Depending on the value 7310 * of the handle_reset flag we either handle that event here or simply 7311 * notify the caller that the channel was reset. 7312 */ 7313 static int 7314 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset) 7315 { 7316 int rv; 7317 size_t msglen = size; 7318 vio_msg_tag_t *tag = (vio_msg_tag_t *)msgp; 7319 vsw_t *vswp = ldcp->ldc_vswp; 7320 7321 D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes", 7322 ldcp->ldc_id, size); 7323 7324 D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype); 7325 D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype); 7326 D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env); 7327 7328 mutex_enter(&ldcp->ldc_txlock); 7329 do { 7330 msglen = size; 7331 rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen); 7332 } while (rv == EWOULDBLOCK && --vsw_wretries > 0); 7333 7334 if ((rv != 0) || (msglen != size)) { 7335 DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) " 7336 "rv(%d) size (%d) msglen(%d)\n", ldcp->ldc_id, 7337 rv, size, msglen); 7338 } 7339 mutex_exit(&ldcp->ldc_txlock); 7340 7341 /* 7342 * If channel has been reset we either handle it here or 7343 * simply report back that it has been reset and let caller 7344 * decide what to do. 
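	 *
	 * Usage sketch (assembled from calls that appear in this file;
	 * not part of the original driver): a caller passing
	 * handle_reset == B_FALSE is expected to act on the reset
	 * itself, e.g.
	 *
	 *	rv = vsw_send_msg(ldcp, &msg, sizeof (msg), B_FALSE);
	 *	if (rv == ECONNRESET)
	 *		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
	 *
	 * whereas handle_reset == B_TRUE lets this routine handle it
	 * below.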
7345 	 */
7346 	if (rv == ECONNRESET) {
7347 		DWARN(vswp, "%s (%lld) channel reset",
7348 		    __func__, ldcp->ldc_id);
7349 
7350 		/*
7351 		 * N.B - must never be holding the dlistrw lock when
7352 		 * we do a reset of the channel.
7353 		 */
7354 		if (handle_reset) {
7355 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
7356 		}
7357 	}
7358 
7359 	return (rv);
7360 }
7361 
7362 /*
7363  * Add an entry into FDB, for the given mac address and port_id.
7364  * Returns 0 on success, 1 on failure.
7365  *
7366  * Lock protecting FDB must be held by calling process.
7367  */
7368 static int
7369 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port)
7370 {
7371 	uint64_t	addr = 0;
7372 
7373 	D1(vswp, "%s: enter", __func__);
7374 
7375 	KEY_HASH(addr, port->p_macaddr);
7376 
7377 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
7378 
7379 	/*
7380 	 * Note: duplicate keys will be rejected by mod_hash.
7381 	 */
7382 	if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr,
7383 	    (mod_hash_val_t)port) != 0) {
7384 		DERR(vswp, "%s: unable to add entry into fdb.", __func__);
7385 		return (1);
7386 	}
7387 
7388 	D1(vswp, "%s: exit", __func__);
7389 	return (0);
7390 }
7391 
7392 /*
7393  * Remove an entry from FDB.
7394  * Returns 0 on success, 1 on failure.
7395  */
7396 static int
7397 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port)
7398 {
7399 	uint64_t	addr = 0;
7400 
7401 	D1(vswp, "%s: enter", __func__);
7402 
7403 	KEY_HASH(addr, port->p_macaddr);
7404 
7405 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
7406 
7407 	(void) mod_hash_destroy(vswp->fdb, (mod_hash_key_t)addr);
7408 
7409 	D1(vswp, "%s: exit", __func__);
7410 
7411 	return (0);
7412 }
7413 
7414 /*
7415  * Search fdb for a given mac address.
7416  * Returns pointer to the entry if found, else returns NULL.
7417  */
7418 static vsw_port_t *
7419 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp)
7420 {
7421 	uint64_t	key = 0;
7422 	vsw_port_t	*port = NULL;
7423 
7424 	D1(vswp, "%s: enter", __func__);
7425 
7426 	KEY_HASH(key, ehp->ether_dhost);
7427 
7428 	D2(vswp, "%s: key = 0x%llx", __func__, key);
7429 
7430 	if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key,
7431 	    (mod_hash_val_t *)&port) != 0) {
7432 		D2(vswp, "%s: no port found", __func__);
7433 		return (NULL);
7434 	}
7435 
7436 	D1(vswp, "%s: exit", __func__);
7437 
7438 	return (port);
7439 }
7440 
7441 /*
7442  * Add or remove multicast address(es).
7443  *
7444  * Returns 0 on success, 1 on failure.
7445  */
7446 static int
7447 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
7448 {
7449 	mcst_addr_t	*mcst_p = NULL;
7450 	vsw_t		*vswp = port->p_vswp;
7451 	uint64_t	addr = 0x0;
7452 	int		i;
7453 
7454 	D1(vswp, "%s: enter", __func__);
7455 
7456 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
7457 
7458 	mutex_enter(&vswp->mac_lock);
7459 	if (vswp->mh == NULL) {
7460 		mutex_exit(&vswp->mac_lock);
7461 		return (1);
7462 	}
7463 	mutex_exit(&vswp->mac_lock);
7464 
7465 	for (i = 0; i < mcst_pkt->count; i++) {
7466 		/*
7467 		 * Convert address into form that can be used
7468 		 * as hash table key.
7469 		 */
7470 		KEY_HASH(addr, mcst_pkt->mca[i]);
7471 
7472 		/*
7473 		 * Add or delete the specified address/port combination.
7474 		 */
7475 		if (mcst_pkt->set == 0x1) {
7476 			D3(vswp, "%s: adding multicast address 0x%llx for "
7477 			    "port %ld", __func__, addr, port->p_instance);
7478 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
7479 				/*
7480 				 * Update the list of multicast
7481 				 * addresses contained within the
7482 				 * port structure to include this new
7483 				 * one.
7484 */ 7485 mcst_p = kmem_alloc(sizeof (mcst_addr_t), 7486 KM_NOSLEEP); 7487 if (mcst_p == NULL) { 7488 DERR(vswp, "%s: unable to alloc mem", 7489 __func__); 7490 return (1); 7491 } 7492 7493 mcst_p->nextp = NULL; 7494 mcst_p->addr = addr; 7495 7496 mutex_enter(&port->mca_lock); 7497 mcst_p->nextp = port->mcap; 7498 port->mcap = mcst_p; 7499 mutex_exit(&port->mca_lock); 7500 7501 /* 7502 * Program the address into HW. If the addr 7503 * has already been programmed then the MAC 7504 * just increments a ref counter (which is 7505 * used when the address is being deleted) 7506 */ 7507 mutex_enter(&vswp->mac_lock); 7508 if ((vswp->mh == NULL) || 7509 mac_multicst_add(vswp->mh, 7510 (uchar_t *)&mcst_pkt->mca[i])) { 7511 mutex_exit(&vswp->mac_lock); 7512 cmn_err(CE_WARN, "!vsw%d: unable to " 7513 "add multicast address", 7514 vswp->instance); 7515 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 7516 addr, port); 7517 vsw_del_addr(VSW_VNETPORT, port, addr); 7518 return (1); 7519 } 7520 mutex_exit(&vswp->mac_lock); 7521 7522 } else { 7523 DERR(vswp, "%s: error adding multicast " 7524 "address 0x%llx for port %ld", 7525 __func__, addr, port->p_instance); 7526 return (1); 7527 } 7528 } else { 7529 /* 7530 * Delete an entry from the multicast hash 7531 * table and update the address list 7532 * appropriately. 7533 */ 7534 if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 7535 D3(vswp, "%s: deleting multicast address " 7536 "0x%llx for port %ld", __func__, addr, 7537 port->p_instance); 7538 7539 vsw_del_addr(VSW_VNETPORT, port, addr); 7540 7541 /* 7542 * Remove the address from HW. The address 7543 * will actually only be removed once the ref 7544 * count within the MAC layer has dropped to 7545 * zero. I.e. we can safely call this fn even 7546 * if other ports are interested in this 7547 * address. 7548 */ 7549 mutex_enter(&vswp->mac_lock); 7550 if ((vswp->mh == NULL) || 7551 mac_multicst_remove(vswp->mh, 7552 (uchar_t *)&mcst_pkt->mca[i])) { 7553 mutex_exit(&vswp->mac_lock); 7554 cmn_err(CE_WARN, "!vsw%d: unable to " 7555 "remove multicast address", 7556 vswp->instance); 7557 return (1); 7558 } 7559 mutex_exit(&vswp->mac_lock); 7560 7561 } else { 7562 DERR(vswp, "%s: error deleting multicast " 7563 "addr 0x%llx for port %ld", 7564 __func__, addr, port->p_instance); 7565 return (1); 7566 } 7567 } 7568 } 7569 D1(vswp, "%s: exit", __func__); 7570 return (0); 7571 } 7572 7573 /* 7574 * Add a new multicast entry. 7575 * 7576 * Search hash table based on address. If match found then 7577 * update associated val (which is chain of ports), otherwise 7578 * create new key/val (addr/port) pair and insert into table. 7579 */ 7580 static int 7581 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 7582 { 7583 int dup = 0; 7584 int rv = 0; 7585 mfdb_ent_t *ment = NULL; 7586 mfdb_ent_t *tmp_ent = NULL; 7587 mfdb_ent_t *new_ent = NULL; 7588 void *tgt = NULL; 7589 7590 if (devtype == VSW_VNETPORT) { 7591 /* 7592 * Being invoked from a vnet. 7593 */ 7594 ASSERT(arg != NULL); 7595 tgt = arg; 7596 D2(NULL, "%s: port %d : address 0x%llx", __func__, 7597 ((vsw_port_t *)arg)->p_instance, addr); 7598 } else { 7599 /* 7600 * We are being invoked via the m_multicst mac entry 7601 * point. 
7602 */ 7603 D2(NULL, "%s: address 0x%llx", __func__, addr); 7604 tgt = (void *)vswp; 7605 } 7606 7607 WRITE_ENTER(&vswp->mfdbrw); 7608 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 7609 (mod_hash_val_t *)&ment) != 0) { 7610 7611 /* address not currently in table */ 7612 ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 7613 ment->d_addr = (void *)tgt; 7614 ment->d_type = devtype; 7615 ment->nextp = NULL; 7616 7617 if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr, 7618 (mod_hash_val_t)ment) != 0) { 7619 DERR(vswp, "%s: hash table insertion failed", __func__); 7620 kmem_free(ment, sizeof (mfdb_ent_t)); 7621 rv = 1; 7622 } else { 7623 D2(vswp, "%s: added initial entry for 0x%llx to " 7624 "table", __func__, addr); 7625 } 7626 } else { 7627 /* 7628 * Address in table. Check to see if specified port 7629 * is already associated with the address. If not add 7630 * it now. 7631 */ 7632 tmp_ent = ment; 7633 while (tmp_ent != NULL) { 7634 if (tmp_ent->d_addr == (void *)tgt) { 7635 if (devtype == VSW_VNETPORT) { 7636 DERR(vswp, "%s: duplicate port entry " 7637 "found for portid %ld and key " 7638 "0x%llx", __func__, 7639 ((vsw_port_t *)arg)->p_instance, 7640 addr); 7641 } else { 7642 DERR(vswp, "%s: duplicate entry found" 7643 "for key 0x%llx", 7644 __func__, addr); 7645 } 7646 rv = 1; 7647 dup = 1; 7648 break; 7649 } 7650 tmp_ent = tmp_ent->nextp; 7651 } 7652 7653 /* 7654 * Port not on list so add it to end now. 7655 */ 7656 if (0 == dup) { 7657 D2(vswp, "%s: added entry for 0x%llx to table", 7658 __func__, addr); 7659 new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 7660 new_ent->d_addr = (void *)tgt; 7661 new_ent->d_type = devtype; 7662 new_ent->nextp = NULL; 7663 7664 tmp_ent = ment; 7665 while (tmp_ent->nextp != NULL) 7666 tmp_ent = tmp_ent->nextp; 7667 7668 tmp_ent->nextp = new_ent; 7669 } 7670 } 7671 7672 RW_EXIT(&vswp->mfdbrw); 7673 return (rv); 7674 } 7675 7676 /* 7677 * Remove a multicast entry from the hashtable. 7678 * 7679 * Search hash table based on address. If match found, scan 7680 * list of ports associated with address. If specified port 7681 * found remove it from list. 7682 */ 7683 static int 7684 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 7685 { 7686 mfdb_ent_t *ment = NULL; 7687 mfdb_ent_t *curr_p, *prev_p; 7688 void *tgt = NULL; 7689 7690 D1(vswp, "%s: enter", __func__); 7691 7692 if (devtype == VSW_VNETPORT) { 7693 tgt = (vsw_port_t *)arg; 7694 D2(vswp, "%s: removing port %d from mFDB for address" 7695 " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, 7696 addr); 7697 } else { 7698 D2(vswp, "%s: removing entry", __func__); 7699 tgt = (void *)vswp; 7700 } 7701 7702 WRITE_ENTER(&vswp->mfdbrw); 7703 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 7704 (mod_hash_val_t *)&ment) != 0) { 7705 D2(vswp, "%s: address 0x%llx not in table", __func__, addr); 7706 RW_EXIT(&vswp->mfdbrw); 7707 return (1); 7708 } 7709 7710 prev_p = curr_p = ment; 7711 7712 while (curr_p != NULL) { 7713 if (curr_p->d_addr == (void *)tgt) { 7714 if (devtype == VSW_VNETPORT) { 7715 D2(vswp, "%s: port %d found", __func__, 7716 ((vsw_port_t *)tgt)->p_instance); 7717 } else { 7718 D2(vswp, "%s: instance found", __func__); 7719 } 7720 7721 if (prev_p == curr_p) { 7722 /* 7723 * head of list, if no other element is in 7724 * list then destroy this entry, otherwise 7725 * just replace it with updated value. 
7726 */ 7727 ment = curr_p->nextp; 7728 kmem_free(curr_p, sizeof (mfdb_ent_t)); 7729 if (ment == NULL) { 7730 (void) mod_hash_destroy(vswp->mfdb, 7731 (mod_hash_val_t)addr); 7732 } else { 7733 (void) mod_hash_replace(vswp->mfdb, 7734 (mod_hash_key_t)addr, 7735 (mod_hash_val_t)ment); 7736 } 7737 } else { 7738 /* 7739 * Not head of list, no need to do 7740 * replacement, just adjust list pointers. 7741 */ 7742 prev_p->nextp = curr_p->nextp; 7743 kmem_free(curr_p, sizeof (mfdb_ent_t)); 7744 } 7745 break; 7746 } 7747 7748 prev_p = curr_p; 7749 curr_p = curr_p->nextp; 7750 } 7751 7752 RW_EXIT(&vswp->mfdbrw); 7753 7754 D1(vswp, "%s: exit", __func__); 7755 7756 return (0); 7757 } 7758 7759 /* 7760 * Port is being deleted, but has registered an interest in one 7761 * or more multicast groups. Using the list of addresses maintained 7762 * within the port structure find the appropriate entry in the hash 7763 * table and remove this port from the list of interested ports. 7764 */ 7765 static void 7766 vsw_del_mcst_port(vsw_port_t *port) 7767 { 7768 mcst_addr_t *mcst_p = NULL; 7769 vsw_t *vswp = port->p_vswp; 7770 7771 D1(vswp, "%s: enter", __func__); 7772 7773 mutex_enter(&port->mca_lock); 7774 while (port->mcap != NULL) { 7775 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 7776 port->mcap->addr, port); 7777 7778 mcst_p = port->mcap->nextp; 7779 kmem_free(port->mcap, sizeof (mcst_addr_t)); 7780 port->mcap = mcst_p; 7781 } 7782 mutex_exit(&port->mca_lock); 7783 7784 D1(vswp, "%s: exit", __func__); 7785 } 7786 7787 /* 7788 * This vsw instance is detaching, but has registered an interest in one 7789 * or more multicast groups. Using the list of addresses maintained 7790 * within the vsw structure find the appropriate entry in the hash 7791 * table and remove this instance from the list of interested ports. 7792 */ 7793 static void 7794 vsw_del_mcst_vsw(vsw_t *vswp) 7795 { 7796 mcst_addr_t *next_p = NULL; 7797 7798 D1(vswp, "%s: enter", __func__); 7799 7800 mutex_enter(&vswp->mca_lock); 7801 7802 while (vswp->mcap != NULL) { 7803 DERR(vswp, "%s: deleting addr 0x%llx", 7804 __func__, vswp->mcap->addr); 7805 (void) vsw_del_mcst(vswp, VSW_LOCALDEV, 7806 vswp->mcap->addr, NULL); 7807 7808 next_p = vswp->mcap->nextp; 7809 kmem_free(vswp->mcap, sizeof (mcst_addr_t)); 7810 vswp->mcap = next_p; 7811 } 7812 7813 vswp->mcap = NULL; 7814 mutex_exit(&vswp->mca_lock); 7815 7816 D1(vswp, "%s: exit", __func__); 7817 } 7818 7819 7820 /* 7821 * Remove the specified address from the list of address maintained 7822 * in this port node. 
7823 */ 7824 static void 7825 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr) 7826 { 7827 vsw_t *vswp = NULL; 7828 vsw_port_t *port = NULL; 7829 mcst_addr_t *prev_p = NULL; 7830 mcst_addr_t *curr_p = NULL; 7831 7832 D1(NULL, "%s: enter : devtype %d : addr 0x%llx", 7833 __func__, devtype, addr); 7834 7835 if (devtype == VSW_VNETPORT) { 7836 port = (vsw_port_t *)arg; 7837 mutex_enter(&port->mca_lock); 7838 prev_p = curr_p = port->mcap; 7839 } else { 7840 vswp = (vsw_t *)arg; 7841 mutex_enter(&vswp->mca_lock); 7842 prev_p = curr_p = vswp->mcap; 7843 } 7844 7845 while (curr_p != NULL) { 7846 if (curr_p->addr == addr) { 7847 D2(NULL, "%s: address found", __func__); 7848 /* match found */ 7849 if (prev_p == curr_p) { 7850 /* list head */ 7851 if (devtype == VSW_VNETPORT) 7852 port->mcap = curr_p->nextp; 7853 else 7854 vswp->mcap = curr_p->nextp; 7855 } else { 7856 prev_p->nextp = curr_p->nextp; 7857 } 7858 kmem_free(curr_p, sizeof (mcst_addr_t)); 7859 break; 7860 } else { 7861 prev_p = curr_p; 7862 curr_p = curr_p->nextp; 7863 } 7864 } 7865 7866 if (devtype == VSW_VNETPORT) 7867 mutex_exit(&port->mca_lock); 7868 else 7869 mutex_exit(&vswp->mca_lock); 7870 7871 D1(NULL, "%s: exit", __func__); 7872 } 7873 7874 /* 7875 * Creates a descriptor ring (dring) and links it into the 7876 * link of outbound drings for this channel. 7877 * 7878 * Returns NULL if creation failed. 7879 */ 7880 static dring_info_t * 7881 vsw_create_dring(vsw_ldc_t *ldcp) 7882 { 7883 vsw_private_desc_t *priv_addr = NULL; 7884 vsw_t *vswp = ldcp->ldc_vswp; 7885 ldc_mem_info_t minfo; 7886 dring_info_t *dp, *tp; 7887 int i; 7888 7889 dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 7890 7891 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 7892 7893 /* create public section of ring */ 7894 if ((ldc_mem_dring_create(VSW_RING_NUM_EL, 7895 VSW_PUB_SIZE, &dp->handle)) != 0) { 7896 7897 DERR(vswp, "vsw_create_dring(%lld): ldc dring create " 7898 "failed", ldcp->ldc_id); 7899 goto create_fail_exit; 7900 } 7901 7902 ASSERT(dp->handle != NULL); 7903 7904 /* 7905 * Get the base address of the public section of the ring. 7906 */ 7907 if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) { 7908 DERR(vswp, "vsw_create_dring(%lld): dring info failed\n", 7909 ldcp->ldc_id); 7910 goto dring_fail_exit; 7911 } else { 7912 ASSERT(minfo.vaddr != 0); 7913 dp->pub_addr = minfo.vaddr; 7914 } 7915 7916 dp->num_descriptors = VSW_RING_NUM_EL; 7917 dp->descriptor_size = VSW_PUB_SIZE; 7918 dp->options = VIO_TX_DRING; 7919 dp->ncookies = 1; /* guaranteed by ldc */ 7920 7921 /* 7922 * create private portion of ring 7923 */ 7924 dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc( 7925 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP); 7926 7927 if (vsw_setup_ring(ldcp, dp)) { 7928 DERR(vswp, "%s: unable to setup ring", __func__); 7929 goto dring_fail_exit; 7930 } 7931 7932 /* haven't used any descriptors yet */ 7933 dp->end_idx = 0; 7934 dp->last_ack_recv = -1; 7935 7936 /* bind dring to the channel */ 7937 if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle, 7938 LDC_SHADOW_MAP, LDC_MEM_RW, 7939 &dp->cookie[0], &dp->ncookies)) != 0) { 7940 DERR(vswp, "vsw_create_dring: unable to bind to channel " 7941 "%lld", ldcp->ldc_id); 7942 goto dring_fail_exit; 7943 } 7944 7945 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 7946 dp->restart_reqd = B_TRUE; 7947 7948 /* 7949 * Only ever create rings for outgoing lane. Link it onto 7950 * end of list. 
7951 */ 7952 WRITE_ENTER(&ldcp->lane_out.dlistrw); 7953 if (ldcp->lane_out.dringp == NULL) { 7954 D2(vswp, "vsw_create_dring: adding first outbound ring"); 7955 ldcp->lane_out.dringp = dp; 7956 } else { 7957 tp = ldcp->lane_out.dringp; 7958 while (tp->next != NULL) 7959 tp = tp->next; 7960 7961 tp->next = dp; 7962 } 7963 RW_EXIT(&ldcp->lane_out.dlistrw); 7964 7965 return (dp); 7966 7967 dring_fail_exit: 7968 (void) ldc_mem_dring_destroy(dp->handle); 7969 7970 create_fail_exit: 7971 if (dp->priv_addr != NULL) { 7972 priv_addr = dp->priv_addr; 7973 for (i = 0; i < VSW_RING_NUM_EL; i++) { 7974 if (priv_addr->memhandle != NULL) 7975 (void) ldc_mem_free_handle( 7976 priv_addr->memhandle); 7977 priv_addr++; 7978 } 7979 kmem_free(dp->priv_addr, 7980 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 7981 } 7982 mutex_destroy(&dp->dlock); 7983 7984 kmem_free(dp, sizeof (dring_info_t)); 7985 return (NULL); 7986 } 7987 7988 /* 7989 * Create a ring consisting of just a private portion and link 7990 * it into the list of rings for the outbound lane. 7991 * 7992 * These type of rings are used primarily for temporary data 7993 * storage (i.e. as data buffers). 7994 */ 7995 void 7996 vsw_create_privring(vsw_ldc_t *ldcp) 7997 { 7998 dring_info_t *dp, *tp; 7999 vsw_t *vswp = ldcp->ldc_vswp; 8000 8001 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 8002 8003 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 8004 8005 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 8006 8007 /* no public section */ 8008 dp->pub_addr = NULL; 8009 8010 dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) * 8011 VSW_RING_NUM_EL), KM_SLEEP); 8012 8013 dp->num_descriptors = VSW_RING_NUM_EL; 8014 8015 if (vsw_setup_ring(ldcp, dp)) { 8016 DERR(vswp, "%s: setup of ring failed", __func__); 8017 kmem_free(dp->priv_addr, 8018 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 8019 mutex_destroy(&dp->dlock); 8020 kmem_free(dp, sizeof (dring_info_t)); 8021 return; 8022 } 8023 8024 /* haven't used any descriptors yet */ 8025 dp->end_idx = 0; 8026 8027 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 8028 dp->restart_reqd = B_TRUE; 8029 8030 /* 8031 * Only ever create rings for outgoing lane. Link it onto 8032 * end of list. 8033 */ 8034 WRITE_ENTER(&ldcp->lane_out.dlistrw); 8035 if (ldcp->lane_out.dringp == NULL) { 8036 D2(vswp, "%s: adding first outbound privring", __func__); 8037 ldcp->lane_out.dringp = dp; 8038 } else { 8039 tp = ldcp->lane_out.dringp; 8040 while (tp->next != NULL) 8041 tp = tp->next; 8042 8043 tp->next = dp; 8044 } 8045 RW_EXIT(&ldcp->lane_out.dlistrw); 8046 8047 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 8048 } 8049 8050 /* 8051 * Setup the descriptors in the dring. Returns 0 on success, 1 on 8052 * failure. 8053 */ 8054 int 8055 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp) 8056 { 8057 vnet_public_desc_t *pub_addr = NULL; 8058 vsw_private_desc_t *priv_addr = NULL; 8059 vsw_t *vswp = ldcp->ldc_vswp; 8060 uint64_t *tmpp; 8061 uint64_t offset = 0; 8062 uint32_t ncookies = 0; 8063 static char *name = "vsw_setup_ring"; 8064 int i, j, nc, rv; 8065 8066 priv_addr = dp->priv_addr; 8067 pub_addr = dp->pub_addr; 8068 8069 /* public section may be null but private should never be */ 8070 ASSERT(priv_addr != NULL); 8071 8072 /* 8073 * Allocate the region of memory which will be used to hold 8074 * the data the descriptors will refer to. 
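	 *
	 * Layout note (derived from the code below): the region is
	 * VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ bytes, carved into one
	 * fixed-size buffer per descriptor, so descriptor i's buffer is
	 *
	 *	(caddr_t)dp->data_addr + (i * VSW_RING_EL_DATA_SZ)
	 *
	 * The tmpp/offset pointer walk below advances by exactly that
	 * amount per iteration; it relies on sizeof (tmpp) (a pointer
	 * to uint64_t) equaling sizeof (uint64_t), which holds in the
	 * LP64 kernel.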
8075 */ 8076 dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ); 8077 dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP); 8078 8079 D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name, 8080 dp->data_sz, dp->data_addr); 8081 8082 tmpp = (uint64_t *)dp->data_addr; 8083 offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp); 8084 8085 /* 8086 * Initialise some of the private and public (if they exist) 8087 * descriptor fields. 8088 */ 8089 for (i = 0; i < VSW_RING_NUM_EL; i++) { 8090 mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL); 8091 8092 if ((ldc_mem_alloc_handle(ldcp->ldc_handle, 8093 &priv_addr->memhandle)) != 0) { 8094 DERR(vswp, "%s: alloc mem handle failed", name); 8095 goto setup_ring_cleanup; 8096 } 8097 8098 priv_addr->datap = (void *)tmpp; 8099 8100 rv = ldc_mem_bind_handle(priv_addr->memhandle, 8101 (caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ, 8102 LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W, 8103 &(priv_addr->memcookie[0]), &ncookies); 8104 if (rv != 0) { 8105 DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed " 8106 "(rv %d)", name, ldcp->ldc_id, rv); 8107 goto setup_ring_cleanup; 8108 } 8109 priv_addr->bound = 1; 8110 8111 D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx", 8112 name, i, priv_addr->memcookie[0].addr, 8113 priv_addr->memcookie[0].size); 8114 8115 if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) { 8116 DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned " 8117 "invalid num of cookies (%d) for size 0x%llx", 8118 name, ldcp->ldc_id, ncookies, 8119 VSW_RING_EL_DATA_SZ); 8120 8121 goto setup_ring_cleanup; 8122 } else { 8123 for (j = 1; j < ncookies; j++) { 8124 rv = ldc_mem_nextcookie(priv_addr->memhandle, 8125 &(priv_addr->memcookie[j])); 8126 if (rv != 0) { 8127 DERR(vswp, "%s: ldc_mem_nextcookie " 8128 "failed rv (%d)", name, rv); 8129 goto setup_ring_cleanup; 8130 } 8131 D3(vswp, "%s: memcookie %d : addr 0x%llx : " 8132 "size 0x%llx", name, j, 8133 priv_addr->memcookie[j].addr, 8134 priv_addr->memcookie[j].size); 8135 } 8136 8137 } 8138 priv_addr->ncookies = ncookies; 8139 priv_addr->dstate = VIO_DESC_FREE; 8140 8141 if (pub_addr != NULL) { 8142 8143 /* link pub and private sides */ 8144 priv_addr->descp = pub_addr; 8145 8146 pub_addr->ncookies = priv_addr->ncookies; 8147 8148 for (nc = 0; nc < pub_addr->ncookies; nc++) { 8149 bcopy(&priv_addr->memcookie[nc], 8150 &pub_addr->memcookie[nc], 8151 sizeof (ldc_mem_cookie_t)); 8152 } 8153 8154 pub_addr->hdr.dstate = VIO_DESC_FREE; 8155 pub_addr++; 8156 } 8157 8158 /* 8159 * move to next element in the dring and the next 8160 * position in the data buffer. 8161 */ 8162 priv_addr++; 8163 tmpp += offset; 8164 } 8165 8166 return (0); 8167 8168 setup_ring_cleanup: 8169 priv_addr = dp->priv_addr; 8170 8171 for (j = 0; j < i; j++) { 8172 (void) ldc_mem_unbind_handle(priv_addr->memhandle); 8173 (void) ldc_mem_free_handle(priv_addr->memhandle); 8174 8175 mutex_destroy(&priv_addr->dstate_lock); 8176 8177 priv_addr++; 8178 } 8179 kmem_free(dp->data_addr, dp->data_sz); 8180 8181 return (1); 8182 } 8183 8184 /* 8185 * Searches the private section of a ring for a free descriptor, 8186 * starting at the location of the last free descriptor found 8187 * previously. 8188 * 8189 * Returns 0 if free descriptor is available, and updates state 8190 * of private descriptor to VIO_DESC_READY, otherwise returns 1. 8191 * 8192 * FUTURE: might need to return contiguous range of descriptors 8193 * as dring info msg assumes all will be contiguous. 
8194 */ 8195 static int 8196 vsw_dring_find_free_desc(dring_info_t *dringp, 8197 vsw_private_desc_t **priv_p, int *idx) 8198 { 8199 vsw_private_desc_t *addr = NULL; 8200 int num = VSW_RING_NUM_EL; 8201 int ret = 1; 8202 8203 D1(NULL, "%s enter\n", __func__); 8204 8205 ASSERT(dringp->priv_addr != NULL); 8206 8207 D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld", 8208 __func__, dringp, dringp->end_idx); 8209 8210 addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx; 8211 8212 mutex_enter(&addr->dstate_lock); 8213 if (addr->dstate == VIO_DESC_FREE) { 8214 addr->dstate = VIO_DESC_READY; 8215 *priv_p = addr; 8216 *idx = dringp->end_idx; 8217 dringp->end_idx = (dringp->end_idx + 1) % num; 8218 ret = 0; 8219 8220 } 8221 mutex_exit(&addr->dstate_lock); 8222 8223 /* ring full */ 8224 if (ret == 1) { 8225 D2(NULL, "%s: no desp free: started at %d", __func__, 8226 dringp->end_idx); 8227 } 8228 8229 D1(NULL, "%s: exit\n", __func__); 8230 8231 return (ret); 8232 } 8233 8234 /* 8235 * Map from a dring identifier to the ring itself. Returns 8236 * pointer to ring or NULL if no match found. 8237 * 8238 * Should be called with dlistrw rwlock held as reader. 8239 */ 8240 static dring_info_t * 8241 vsw_ident2dring(lane_t *lane, uint64_t ident) 8242 { 8243 dring_info_t *dp = NULL; 8244 8245 if ((dp = lane->dringp) == NULL) { 8246 return (NULL); 8247 } else { 8248 if (dp->ident == ident) 8249 return (dp); 8250 8251 while (dp != NULL) { 8252 if (dp->ident == ident) 8253 break; 8254 dp = dp->next; 8255 } 8256 } 8257 8258 return (dp); 8259 } 8260 8261 /* 8262 * Set the default lane attributes. These are copied into 8263 * the attr msg we send to our peer. If they are not acceptable 8264 * then (currently) the handshake ends. 8265 */ 8266 static void 8267 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp) 8268 { 8269 bzero(lp, sizeof (lane_t)); 8270 8271 READ_ENTER(&vswp->if_lockrw); 8272 ether_copy(&(vswp->if_addr), &(lp->addr)); 8273 RW_EXIT(&vswp->if_lockrw); 8274 8275 lp->mtu = VSW_MTU; 8276 lp->addr_type = ADDR_TYPE_MAC; 8277 lp->xfer_mode = VIO_DRING_MODE; 8278 lp->ack_freq = 0; /* for shared mode */ 8279 8280 mutex_enter(&lp->seq_lock); 8281 lp->seq_num = VNET_ISS; 8282 mutex_exit(&lp->seq_lock); 8283 } 8284 8285 /* 8286 * Verify that the attributes are acceptable. 8287 * 8288 * FUTURE: If some attributes are not acceptable, change them 8289 * our desired values. 8290 */ 8291 static int 8292 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port) 8293 { 8294 int ret = 0; 8295 8296 D1(NULL, "vsw_check_attr enter\n"); 8297 8298 /* 8299 * Note we currently only support in-band descriptors 8300 * and descriptor rings, not packet based transfer (VIO_PKT_MODE) 8301 */ 8302 if ((pkt->xfer_mode != VIO_DESC_MODE) && 8303 (pkt->xfer_mode != VIO_DRING_MODE)) { 8304 D2(NULL, "vsw_check_attr: unknown mode %x\n", 8305 pkt->xfer_mode); 8306 ret = 1; 8307 } 8308 8309 /* Only support MAC addresses at moment. */ 8310 if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) { 8311 D2(NULL, "vsw_check_attr: invalid addr_type %x, " 8312 "or address 0x%llx\n", pkt->addr_type, 8313 pkt->addr); 8314 ret = 1; 8315 } 8316 8317 /* 8318 * MAC address supplied by device should match that stored 8319 * in the vsw-port OBP node. Need to decide what to do if they 8320 * don't match, for the moment just warn but don't fail. 

/*
 * Map from a dring identifier to the ring itself. Returns
 * a pointer to the ring or NULL if no match is found.
 *
 * Should be called with dlistrw rwlock held as reader.
 */
static dring_info_t *
vsw_ident2dring(lane_t *lane, uint64_t ident)
{
	dring_info_t	*dp;

	/* walk the lane's list of drings looking for a matching ident */
	for (dp = lane->dringp; dp != NULL; dp = dp->next) {
		if (dp->ident == ident)
			break;
	}

	return (dp);
}

/*
 * Set the default lane attributes. These are copied into
 * the attr msg we send to our peer. If they are not acceptable
 * to the peer then (currently) the handshake ends.
 */
static void
vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
{
	bzero(lp, sizeof (lane_t));

	READ_ENTER(&vswp->if_lockrw);
	ether_copy(&(vswp->if_addr), &(lp->addr));
	RW_EXIT(&vswp->if_lockrw);

	lp->mtu = VSW_MTU;
	lp->addr_type = ADDR_TYPE_MAC;
	lp->xfer_mode = VIO_DRING_MODE;
	lp->ack_freq = 0;	/* for shared mode */

	mutex_enter(&lp->seq_lock);
	lp->seq_num = VNET_ISS;
	mutex_exit(&lp->seq_lock);
}

/*
 * Verify that the attributes are acceptable.
 *
 * FUTURE: If some attributes are not acceptable, change them
 * to our desired values.
 */
static int
vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
{
	int	ret = 0;

	D1(NULL, "vsw_check_attr enter\n");

	/*
	 * Note we currently only support in-band descriptors
	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE).
	 */
	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
	    (pkt->xfer_mode != VIO_DRING_MODE)) {
		D2(NULL, "vsw_check_attr: unknown mode %x\n",
		    pkt->xfer_mode);
		ret = 1;
	}

	/* Only support MAC addresses at the moment. */
	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
		    "or address 0x%llx\n", pkt->addr_type,
		    pkt->addr);
		ret = 1;
	}

	/*
	 * The MAC address supplied by the device should match that stored
	 * in the vsw-port OBP node. Need to decide what to do if they
	 * don't match; for the moment just warn but don't fail.
	 */
	if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) {
		DERR(NULL, "vsw_check_attr: device supplied address "
		    "0x%llx doesn't match node address 0x%llx\n",
		    pkt->addr, port->p_macaddr);
	}

	/*
	 * Ack freq only makes sense in pkt mode; in shared
	 * mode the ring descriptors say whether or not to
	 * send back an ACK.
	 */
	if ((pkt->xfer_mode == VIO_DRING_MODE) &&
	    (pkt->ack_freq > 0)) {
		D2(NULL, "vsw_check_attr: non zero ack freq "
		    "in SHM mode\n");
		ret = 1;
	}

	/*
	 * Note: for the moment we only support ETHER
	 * frames. This may change in the future.
	 */
	if ((pkt->mtu > VSW_MTU) || (pkt->mtu == 0)) {
		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
		    pkt->mtu);
		ret = 1;
	}

	D1(NULL, "vsw_check_attr exit\n");

	return (ret);
}
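
/*
 * Illustrative sketch only, following the FUTURE note above and not part
 * of the original driver: rather than failing the handshake outright,
 * an unacceptable MTU could be clamped to our supported value before
 * the attr msg is acknowledged. The function name is hypothetical.
 */
static void
vsw_adjust_attr(vnet_attr_msg_t *pkt)
{
	/* clamp an out-of-range MTU rather than NACKing the handshake */
	if ((pkt->mtu > VSW_MTU) || (pkt->mtu == 0))
		pkt->mtu = VSW_MTU;
}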

/*
 * Returns 1 if there is a problem, 0 otherwise.
 */
static int
vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
{
	int	ret = 0;

	D1(NULL, "vsw_check_dring_info enter\n");

	if ((pkt->num_descriptors == 0) ||
	    (pkt->descriptor_size == 0) ||
	    (pkt->ncookies != 1)) {
		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
		ret = 1;
	}

	D1(NULL, "vsw_check_dring_info exit\n");

	return (ret);
}

/*
 * Returns 1 if the two memory cookies match. Otherwise returns 0.
 */
static int
vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
{
	if ((m1->addr != m2->addr) ||
	    (m1->size != m2->size)) {
		return (0);
	} else {
		return (1);
	}
}

/*
 * Returns 1 if the ring described in the reg message matches that
 * described by the dring_info structure. Otherwise returns 0.
 */
static int
vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
{
	if ((msg->descriptor_size != dp->descriptor_size) ||
	    (msg->num_descriptors != dp->num_descriptors) ||
	    (msg->ncookies != dp->ncookies) ||
	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
		return (0);
	} else {
		return (1);
	}
}

static caddr_t
vsw_print_ethaddr(uint8_t *a, char *ebuf)
{
	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
	    a[0], a[1], a[2], a[3], a[4], a[5]);
	return (ebuf);
}

/*
 * Reset and free all the resources associated with
 * the channel.
 */
static void
vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
{
	dring_info_t	*dp, *dpp;
	lane_t		*lp = NULL;
	int		rv = 0;

	ASSERT(ldcp != NULL);

	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);

	if (dir == INBOUND) {
		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
		    " of channel %lld", __func__, ldcp->ldc_id);
		lp = &ldcp->lane_in;
	} else {
		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
		    " of channel %lld", __func__, ldcp->ldc_id);
		lp = &ldcp->lane_out;
	}

	lp->lstate = VSW_LANE_INACTIV;
	mutex_enter(&lp->seq_lock);
	lp->seq_num = VNET_ISS;
	mutex_exit(&lp->seq_lock);
	if (lp->dringp) {
		if (dir == INBOUND) {
			WRITE_ENTER(&lp->dlistrw);
			dp = lp->dringp;
			while (dp != NULL) {
				dpp = dp->next;
				if (dp->handle != NULL)
					(void) ldc_mem_dring_unmap(dp->handle);
				kmem_free(dp, sizeof (dring_info_t));
				dp = dpp;
			}
			RW_EXIT(&lp->dlistrw);
		} else {
			/*
			 * Unbind and destroy the exported dring, and
			 * free the dring structure itself.
			 */
			WRITE_ENTER(&lp->dlistrw);
			dp = lp->dringp;
			rv = vsw_free_ring(dp);
			RW_EXIT(&lp->dlistrw);
		}
		if (rv == 0) {
			lp->dringp = NULL;
		}
	}

	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
}
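
/*
 * Illustrative usage sketch only, not part of the original driver: a
 * channel reset path would typically free both lanes before restarting
 * the handshake, along these lines (compiled out):
 */
#if 0
	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);
	ldcp->lane_in.lstate = 0;
	ldcp->lane_out.lstate = 0;
#endif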
8536 */ 8537 if (dp->handle != NULL) { 8538 (void) ldc_mem_dring_unbind(dp->handle); 8539 (void) ldc_mem_dring_destroy(dp->handle); 8540 } 8541 8542 if (dp->data_addr != NULL) { 8543 kmem_free(dp->data_addr, dp->data_sz); 8544 } 8545 8546 mutex_exit(&dp->dlock); 8547 mutex_destroy(&dp->dlock); 8548 mutex_destroy(&dp->restart_lock); 8549 kmem_free(dp, sizeof (dring_info_t)); 8550 8551 dp = dpp; 8552 } 8553 return (0); 8554 } 8555 8556 /* 8557 * Debugging routines 8558 */ 8559 static void 8560 display_state(void) 8561 { 8562 vsw_t *vswp; 8563 vsw_port_list_t *plist; 8564 vsw_port_t *port; 8565 vsw_ldc_list_t *ldcl; 8566 vsw_ldc_t *ldcp; 8567 8568 cmn_err(CE_NOTE, "***** system state *****"); 8569 8570 for (vswp = vsw_head; vswp; vswp = vswp->next) { 8571 plist = &vswp->plist; 8572 READ_ENTER(&plist->lockrw); 8573 cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n", 8574 vswp->instance, plist->num_ports); 8575 8576 for (port = plist->head; port != NULL; port = port->p_next) { 8577 ldcl = &port->p_ldclist; 8578 cmn_err(CE_CONT, "port %d : %d ldcs attached\n", 8579 port->p_instance, ldcl->num_ldcs); 8580 READ_ENTER(&ldcl->lockrw); 8581 ldcp = ldcl->head; 8582 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 8583 cmn_err(CE_CONT, "chan %lu : dev %d : " 8584 "status %d : phase %u\n", 8585 ldcp->ldc_id, ldcp->dev_class, 8586 ldcp->ldc_status, ldcp->hphase); 8587 cmn_err(CE_CONT, "chan %lu : lsession %lu : " 8588 "psession %lu\n", 8589 ldcp->ldc_id, 8590 ldcp->local_session, 8591 ldcp->peer_session); 8592 8593 cmn_err(CE_CONT, "Inbound lane:\n"); 8594 display_lane(&ldcp->lane_in); 8595 cmn_err(CE_CONT, "Outbound lane:\n"); 8596 display_lane(&ldcp->lane_out); 8597 } 8598 RW_EXIT(&ldcl->lockrw); 8599 } 8600 RW_EXIT(&plist->lockrw); 8601 } 8602 cmn_err(CE_NOTE, "***** system state *****"); 8603 } 8604 8605 static void 8606 display_lane(lane_t *lp) 8607 { 8608 dring_info_t *drp; 8609 8610 cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n", 8611 lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu); 8612 cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n", 8613 lp->addr_type, lp->addr, lp->xfer_mode); 8614 cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp); 8615 8616 cmn_err(CE_CONT, "Dring info:\n"); 8617 for (drp = lp->dringp; drp != NULL; drp = drp->next) { 8618 cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n", 8619 drp->num_descriptors, drp->descriptor_size); 8620 cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle); 8621 cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n", 8622 (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr); 8623 cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n", 8624 drp->ident, drp->end_idx); 8625 display_ring(drp); 8626 } 8627 } 8628 8629 static void 8630 display_ring(dring_info_t *dringp) 8631 { 8632 uint64_t i; 8633 uint64_t priv_count = 0; 8634 uint64_t pub_count = 0; 8635 vnet_public_desc_t *pub_addr = NULL; 8636 vsw_private_desc_t *priv_addr = NULL; 8637 8638 for (i = 0; i < VSW_RING_NUM_EL; i++) { 8639 if (dringp->pub_addr != NULL) { 8640 pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i; 8641 8642 if (pub_addr->hdr.dstate == VIO_DESC_FREE) 8643 pub_count++; 8644 } 8645 8646 if (dringp->priv_addr != NULL) { 8647 priv_addr = 8648 (vsw_private_desc_t *)dringp->priv_addr + i; 8649 8650 if (priv_addr->dstate == VIO_DESC_FREE) 8651 priv_count++; 8652 } 8653 } 8654 cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n", 8655 i, priv_count, pub_count); 8656 } 8657 8658 static void 8659 dump_flags(uint64_t state) 8660 { 8661 int i; 8662 8663 

static void
dump_flags(uint64_t state)
{
	int	i;

	typedef struct flag_name {
		int	flag_val;
		char	*flag_name;
	} flag_name_t;

	flag_name_t	flags[] = {
		{ VSW_VER_INFO_SENT,	"VSW_VER_INFO_SENT" },
		{ VSW_VER_INFO_RECV,	"VSW_VER_INFO_RECV" },
		{ VSW_VER_ACK_RECV,	"VSW_VER_ACK_RECV" },
		{ VSW_VER_ACK_SENT,	"VSW_VER_ACK_SENT" },
		{ VSW_VER_NACK_RECV,	"VSW_VER_NACK_RECV" },
		{ VSW_VER_NACK_SENT,	"VSW_VER_NACK_SENT" },
		{ VSW_ATTR_INFO_SENT,	"VSW_ATTR_INFO_SENT" },
		{ VSW_ATTR_INFO_RECV,	"VSW_ATTR_INFO_RECV" },
		{ VSW_ATTR_ACK_SENT,	"VSW_ATTR_ACK_SENT" },
		{ VSW_ATTR_ACK_RECV,	"VSW_ATTR_ACK_RECV" },
		{ VSW_ATTR_NACK_SENT,	"VSW_ATTR_NACK_SENT" },
		{ VSW_ATTR_NACK_RECV,	"VSW_ATTR_NACK_RECV" },
		{ VSW_DRING_INFO_SENT,	"VSW_DRING_INFO_SENT" },
		{ VSW_DRING_INFO_RECV,	"VSW_DRING_INFO_RECV" },
		{ VSW_DRING_ACK_SENT,	"VSW_DRING_ACK_SENT" },
		{ VSW_DRING_ACK_RECV,	"VSW_DRING_ACK_RECV" },
		{ VSW_DRING_NACK_SENT,	"VSW_DRING_NACK_SENT" },
		{ VSW_DRING_NACK_RECV,	"VSW_DRING_NACK_RECV" },
		{ VSW_RDX_INFO_SENT,	"VSW_RDX_INFO_SENT" },
		{ VSW_RDX_INFO_RECV,	"VSW_RDX_INFO_RECV" },
		{ VSW_RDX_ACK_SENT,	"VSW_RDX_ACK_SENT" },
		{ VSW_RDX_ACK_RECV,	"VSW_RDX_ACK_RECV" },
		{ VSW_RDX_NACK_SENT,	"VSW_RDX_NACK_SENT" },
		{ VSW_RDX_NACK_RECV,	"VSW_RDX_NACK_RECV" },
		{ VSW_MCST_INFO_SENT,	"VSW_MCST_INFO_SENT" },
		{ VSW_MCST_INFO_RECV,	"VSW_MCST_INFO_RECV" },
		{ VSW_MCST_ACK_SENT,	"VSW_MCST_ACK_SENT" },
		{ VSW_MCST_ACK_RECV,	"VSW_MCST_ACK_RECV" },
		{ VSW_MCST_NACK_SENT,	"VSW_MCST_NACK_SENT" },
		{ VSW_MCST_NACK_RECV,	"VSW_MCST_NACK_RECV" },
		{ VSW_LANE_ACTIVE,	"VSW_LANE_ACTIVE" }
	};

	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
		if (state & flags[i].flag_val)
			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
	}
}