/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strlog.h>
#include <sys/strsubr.h>
#include <sys/cmn_err.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/kstat.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <net/if.h>
#include <sys/varargs.h>
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/mac.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
#include <sys/vsw.h>
#include <sys/vio_mailbox.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/vio_util.h>
#include <sys/sdt.h>
#include <sys/atomic.h>

/*
 * Function prototypes.
 */
static int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
static int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
static int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int vsw_get_md_physname(vsw_t *, md_t *, mde_cookie_t, char *);
static int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *, int *);
static void vsw_setup_switching_timeout(void *arg);
static void vsw_stop_switching_timeout(vsw_t *vswp);
static int vsw_setup_switching(vsw_t *);
static int vsw_setup_layer2(vsw_t *);
static int vsw_setup_layer3(vsw_t *);

/* MAC Ring table functions. */
static void vsw_mac_ring_tbl_init(vsw_t *vswp);
static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
static void vsw_queue_worker(vsw_mac_ring_t *rrp);
static void vsw_queue_stop(vsw_queue_t *vqp);
static vsw_queue_t *vsw_queue_create(void);
static void vsw_queue_destroy(vsw_queue_t *vqp);

/* MAC layer routines */
static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
    mac_resource_t *mrp);
static int vsw_get_hw_maddr(vsw_t *);
static int vsw_set_hw(vsw_t *, vsw_port_t *, int);
static int vsw_set_hw_addr(vsw_t *, mac_multi_addr_t *);
static int vsw_set_hw_promisc(vsw_t *, vsw_port_t *, int);
static int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
static int vsw_unset_hw_addr(vsw_t *, int);
static int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *, int);
static void vsw_reconfig_hw(vsw_t *);
static int vsw_prog_if(vsw_t *);
static int vsw_prog_ports(vsw_t *);
static int vsw_mac_attach(vsw_t *vswp);
static void vsw_mac_detach(vsw_t *vswp);
static int vsw_mac_open(vsw_t *vswp);
static void vsw_mac_close(vsw_t *vswp);
static void vsw_set_addrs(vsw_t *vswp);
static void vsw_unset_addrs(vsw_t *vswp);

static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
static int vsw_mac_register(vsw_t *);
static int vsw_mac_unregister(vsw_t *);
static int vsw_m_stat(void *, uint_t, uint64_t *);
static void vsw_m_stop(void *arg);
static int vsw_m_start(void *arg);
static int vsw_m_unicst(void *arg, const uint8_t *);
static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
static int vsw_m_promisc(void *arg, boolean_t);
static mblk_t *vsw_m_tx(void *arg, mblk_t *);

/* MDEG routines */
static int vsw_mdeg_register(vsw_t *vswp);
static void vsw_mdeg_unregister(vsw_t *vswp);
static int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
static int vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *);
static int vsw_get_initial_md_properties(vsw_t *vswp, md_t *, mde_cookie_t);
static void vsw_update_md_prop(vsw_t *, md_t *, mde_cookie_t);
static int vsw_read_mdprops(vsw_t *vswp);

/* Port add/deletion routines */
static int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
static int vsw_port_attach(vsw_t *vswp, int p_instance,
    uint64_t *ldcids, int nids, struct ether_addr *macaddr);
static int vsw_detach_ports(vsw_t *vswp);
static int vsw_port_detach(vsw_t *vswp, int p_instance);
static int vsw_port_delete(vsw_port_t *port);
static int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
static int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
static int vsw_init_ldcs(vsw_port_t *port);
static int vsw_uninit_ldcs(vsw_port_t *port);
static int vsw_ldc_init(vsw_ldc_t *ldcp);
static int vsw_ldc_uninit(vsw_ldc_t *ldcp);
static int vsw_drain_ldcs(vsw_port_t *port);
static int vsw_drain_port_taskq(vsw_port_t *port);
static void vsw_marker_task(void *);
static vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
static int vsw_plist_del_node(vsw_t *, vsw_port_t *port);

/* Interrupt routines */
static uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);

/* Handshake routines */
static void vsw_ldc_reinit(vsw_ldc_t *);
static void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
static void vsw_conn_task(void *);
static int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
static void vsw_next_milestone(vsw_ldc_t *);
static int vsw_supported_version(vio_ver_msg_t *);

/* Data processing routines */
static void vsw_process_pkt(void *);
static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_ctrl_pkt(void *);
static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);

/* Switching/data transmit routines */
static void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port, mac_resource_handle_t);
static void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port, mac_resource_handle_t);
static int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port);
static int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port);
static int vsw_portsend(vsw_port_t *, mblk_t *);
static int vsw_dringsend(vsw_ldc_t *, mblk_t *);
static int vsw_descrsend(vsw_ldc_t *, mblk_t *);

/* Packet creation routines */
static void vsw_send_ver(void *);
static void vsw_send_attr(vsw_ldc_t *);
static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
static void vsw_send_dring_info(vsw_ldc_t *);
static void vsw_send_rdx(vsw_ldc_t *);

static int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);

/* Forwarding database (FDB) routines */
static int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
static int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
static vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
static int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
static int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
static int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
static mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t);
static void vsw_del_mcst_port(vsw_port_t *);
static void vsw_del_mcst_vsw(vsw_t *);

/* Dring routines */
static dring_info_t *vsw_create_dring(vsw_ldc_t *);
static void vsw_create_privring(vsw_ldc_t *);
static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
    int *);
static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);

static void vsw_set_lane_attr(vsw_t *, lane_t *);
static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
static int vsw_check_dring_info(vio_dring_reg_msg_t *);

/* Misc support routines */
static caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
static int vsw_free_ring(dring_info_t *);
static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);

/* Debugging routines */
static void dump_flags(uint64_t);
static void display_state(void);
static void display_lane(lane_t *);
static void display_ring(dring_info_t *);

int vsw_num_handshakes = VNET_NUM_HANDSHAKES; /* # of handshake attempts */
int vsw_wretries = 100;			/* # of write attempts */
int vsw_chain_len = 150;		/* max # of mblks in msg chain */
int vsw_desc_delay = 0;			/* delay in us */
int vsw_read_attempts = 5;		/* # of reads of descriptor */
int vsw_mac_open_retries = 20;		/* max # of mac_open() retries */
int vsw_setup_switching_delay = 3;	/* setup sw timeout interval in sec */

uint32_t vsw_mblk_size = VSW_MBLK_SIZE;
uint32_t vsw_num_mblks = VSW_NUM_MBLKS;

static mac_callbacks_t vsw_m_callbacks = {
	0,
	vsw_m_stat,
	vsw_m_start,
	vsw_m_stop,
	vsw_m_promisc,
	vsw_m_multicst,
	vsw_m_unicst,
	vsw_m_tx,
	NULL,
	NULL,
	NULL
};

static struct cb_ops vsw_cb_ops = {
	nulldev,		/* cb_open */
	nulldev,		/* cb_close */
	nodev,			/* cb_strategy */
	nodev,			/* cb_print */
	nodev,			/* cb_dump */
	nodev,			/* cb_read */
	nodev,			/* cb_write */
	nodev,			/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	nochpoll,		/* cb_chpoll */
	ddi_prop_op,		/* cb_prop_op */
	NULL,			/* cb_stream */
	D_MP,			/* cb_flag */
	CB_REV,			/* rev */
	nodev,			/* int (*cb_aread)() */
	nodev			/* int (*cb_awrite)() */
};

static struct dev_ops vsw_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	vsw_getinfo,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	vsw_attach,		/* devo_attach */
	vsw_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&vsw_cb_ops,		/* devo_cb_ops */
	(struct bus_ops *)NULL,	/* devo_bus_ops */
	ddi_power		/* devo_power */
};

extern struct mod_ops mod_driverops;
static struct modldrv vswmodldrv = {
	&mod_driverops,
	"sun4v Virtual Switch",
	&vsw_ops,
};

#define	LDC_ENTER_LOCK(ldcp)	\
	mutex_enter(&((ldcp)->ldc_cblock));\
	mutex_enter(&((ldcp)->ldc_txlock));
#define	LDC_EXIT_LOCK(ldcp)	\
	mutex_exit(&((ldcp)->ldc_txlock));\
	mutex_exit(&((ldcp)->ldc_cblock));
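
/*
 * Note the lock ordering encoded in the macros above: the LDC callback
 * lock (ldc_cblock) is always taken before the transmit lock
 * (ldc_txlock), and the two are released in the reverse order.
 */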

/* Driver soft state ptr */
static void *vsw_state;

/*
 * Linked list of "vsw_t" structures - one per instance.
 */
vsw_t *vsw_head = NULL;
krwlock_t vsw_rw;

/*
 * Property names
 */
static char vdev_propname[] = "virtual-device";
static char vsw_propname[] = "virtual-network-switch";
static char physdev_propname[] = "vsw-phys-dev";
static char smode_propname[] = "vsw-switch-mode";
static char macaddr_propname[] = "local-mac-address";
static char remaddr_propname[] = "remote-mac-address";
static char ldcids_propname[] = "ldc-ids";
static char chan_propname[] = "channel-endpoint";
static char id_propname[] = "id";
static char reg_propname[] = "reg";

/* supported versions */
static ver_sup_t vsw_versions[] = { {1, 0} };

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t vport_prop_match[] = {
	{ MDET_PROP_VAL,	"id" },
	{ MDET_LIST_END,	NULL }
};

static mdeg_node_match_t vport_match = { "virtual-device-port",
	vport_prop_match };

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device' nodes (i.e. vsw nodes) identified
 * by their 'name' and 'cfg-handle' properties.
 */
static md_prop_match_t vdev_prop_match[] = {
	{ MDET_PROP_STR,	"name" },
	{ MDET_PROP_VAL,	"cfg-handle" },
	{ MDET_LIST_END,	NULL }
};

static mdeg_node_match_t vdev_match = { "virtual-device",
	vdev_prop_match };


/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vsw instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t vsw_prop_template[] = {
	{ MDET_PROP_STR,	"name",		vsw_propname },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL,		NULL }
};

#define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);

/*
 * From /etc/system enable/disable thread per ring. This is a mode
 * selection that is done at vsw driver attach time.
 */
boolean_t vsw_multi_ring_enable = B_FALSE;
int vsw_mac_rx_rings = VSW_MAC_RX_RINGS;

/*
 * Print debug messages - set to 0x1f to enable all msgs
 * or 0x0 to turn all off.
 */
int vswdbg = 0x0;

/*
 * debug levels:
 * 0x01:	Function entry/exit tracing
 * 0x02:	Internal function messages
 * 0x04:	Verbose internal messages
 * 0x08:	Warning messages
 * 0x10:	Error messages
 */

static void
vswdebug(vsw_t *vswp, const char *fmt, ...)
{
	char buf[512];
	va_list ap;

	va_start(ap, fmt);
	(void) vsprintf(buf, fmt, ap);
	va_end(ap);

	if (vswp == NULL)
		cmn_err(CE_CONT, "%s\n", buf);
	else
		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
}
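
/*
 * Knobs such as vswdbg and vsw_multi_ring_enable above are plain
 * kernel variables, so (for example) they can be set for the next
 * boot from /etc/system:
 *
 *	set vsw:vswdbg = 0x1f
 *	set vsw:vsw_multi_ring_enable = 1
 */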

/*
 * For the moment the state dump routines have their own
 * private flag.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

#define	DUMP_TAG(tag) \
{ \
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype); \
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env); \
}

#define	DUMP_TAG_PTR(tag) \
{ \
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype); \
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env); \
}

#define	DUMP_FLAGS(flags)	dump_flags(flags);
#define	DISPLAY_STATE()		display_state()

#else

#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */

#ifdef DEBUG

#define	D1 \
if (vswdbg & 0x01) \
	vswdebug

#define	D2 \
if (vswdbg & 0x02) \
	vswdebug

#define	D3 \
if (vswdbg & 0x04) \
	vswdebug

#define	DWARN \
if (vswdbg & 0x08) \
	vswdebug

#define	DERR \
if (vswdbg & 0x10) \
	vswdebug

#else

#define	DERR	if (0)	vswdebug
#define	DWARN	if (0)	vswdebug
#define	D1	if (0)	vswdebug
#define	D2	if (0)	vswdebug
#define	D3	if (0)	vswdebug

#endif	/* DEBUG */

static struct modlinkage modlinkage = {
	MODREV_1,
	&vswmodldrv,
	NULL
};

int
_init(void)
{
	int status;

	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);

	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
	if (status != 0) {
		return (status);
	}

	mac_init_ops(&vsw_ops, "vsw");
	status = mod_install(&modlinkage);
	if (status != 0) {
		ddi_soft_state_fini(&vsw_state);
	}
	return (status);
}

int
_fini(void)
{
	int status;

	status = mod_remove(&modlinkage);
	if (status != 0)
		return (status);
	mac_fini_ops(&vsw_ops);
	ddi_soft_state_fini(&vsw_state);

	rw_destroy(&vsw_rw);

	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
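
/*
 * vsw_attach() brings an instance up in stages: allocate and
 * initialize the soft state and its locks, read the vsw node
 * properties from the MD, create the unicast and multicast
 * forwarding databases and the control-message taskq, set up the
 * switching mode, and finally register with the MAC layer and the
 * MDEG. Each completed stage sets a bit in 'progress' so that a
 * failure part-way through unwinds only the work already done.
 */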
static int
vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	vsw_t		*vswp;
	int		instance;
	char		hashname[MAXNAMELEN];
	char		qname[TASKQ_NAMELEN];
	int		rv;
	enum		{ PROG_init = 0x00,
			PROG_locks = 0x01,
			PROG_readmd = 0x02,
			PROG_fdb = 0x04,
			PROG_mfdb = 0x08,
			PROG_taskq = 0x10,
			PROG_swmode = 0x20,
			PROG_macreg = 0x40,
			PROG_mdreg = 0x80 }
			progress;

	progress = PROG_init;

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	case DDI_PM_RESUME:
	default:
		return (DDI_FAILURE);
	}

	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
		return (DDI_FAILURE);
	}
	vswp = ddi_get_soft_state(vsw_state, instance);

	if (vswp == NULL) {
		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
		goto vsw_attach_fail;
	}

	vswp->dip = dip;
	vswp->instance = instance;
	ddi_set_driver_private(dip, (caddr_t)vswp);

	mutex_init(&vswp->hw_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&vswp->mac_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&vswp->swtmout_lock, NULL, MUTEX_DRIVER, NULL);
	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);

	progress |= PROG_locks;

	rv = vsw_read_mdprops(vswp);
	if (rv != 0)
		goto vsw_attach_fail;

	progress |= PROG_readmd;

	/* setup the unicast forwarding database */
	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
	    vswp->instance);
	D2(vswp, "creating unicast hash table (%s)...", hashname);
	vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
	    mod_hash_null_valdtor, sizeof (void *));

	progress |= PROG_fdb;

	/* setup the multicast forwarding database */
	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
	    vswp->instance);
	D2(vswp, "creating multicast hash table (%s)...", hashname);
	vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
	    mod_hash_null_valdtor, sizeof (void *));

	progress |= PROG_mfdb;

	/*
	 * Create the taskq which will process all the VIO
	 * control messages.
	 */
	(void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
	    TASKQ_DEFAULTPRI, 0)) == NULL) {
		cmn_err(CE_WARN, "!vsw%d: Unable to create task queue",
		    vswp->instance);
		goto vsw_attach_fail;
	}

	progress |= PROG_taskq;

	/* prevent auto-detaching */
	if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
	    DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "!Unable to set \"%s\" property for "
		    "instance %u", DDI_NO_AUTODETACH, instance);
	}

	/*
	 * Setup the required switching mode,
	 * based on the mdprops that we read earlier.
	 */
	rv = vsw_setup_switching(vswp);
	if (rv == EAGAIN) {
		/*
		 * Unable to setup switching mode;
		 * as the error is EAGAIN, schedule a timeout to retry.
		 */
		mutex_enter(&vswp->swtmout_lock);

		vswp->swtmout_enabled = B_TRUE;
		vswp->swtmout_id =
		    timeout(vsw_setup_switching_timeout, vswp,
		    (vsw_setup_switching_delay * drv_usectohz(MICROSEC)));

		mutex_exit(&vswp->swtmout_lock);
	} else if (rv != 0) {
		goto vsw_attach_fail;
	}

	progress |= PROG_swmode;

	/* Register with mac layer as a provider */
	rv = vsw_mac_register(vswp);
	if (rv != 0)
		goto vsw_attach_fail;

	progress |= PROG_macreg;

	/*
	 * Now we have everything setup, register an interest in
	 * specific MD nodes.
	 *
	 * The callback is invoked in two cases: first, if upon MDEG
	 * registration there are existing nodes which match our
	 * specified criteria; and second, if the MD is changed and
	 * again contains nodes which we are interested in (note that
	 * the callback will be invoked even if our specified nodes
	 * have not actually changed).
	 */
	rv = vsw_mdeg_register(vswp);
	if (rv != 0)
		goto vsw_attach_fail;

	progress |= PROG_mdreg;

	WRITE_ENTER(&vsw_rw);
	vswp->next = vsw_head;
	vsw_head = vswp;
	RW_EXIT(&vsw_rw);

	ddi_report_dev(vswp->dip);
	return (DDI_SUCCESS);

vsw_attach_fail:
	DERR(NULL, "vsw_attach: failed");

	if (progress & PROG_mdreg) {
		vsw_mdeg_unregister(vswp);
		(void) vsw_detach_ports(vswp);
	}

	if (progress & PROG_macreg)
		(void) vsw_mac_unregister(vswp);

	if (progress & PROG_swmode) {
		vsw_stop_switching_timeout(vswp);
		mutex_enter(&vswp->mac_lock);
		vsw_mac_detach(vswp);
		vsw_mac_close(vswp);
		mutex_exit(&vswp->mac_lock);
	}

	if (progress & PROG_taskq)
		ddi_taskq_destroy(vswp->taskq_p);

	if (progress & PROG_mfdb)
		mod_hash_destroy_hash(vswp->mfdb);

	if (progress & PROG_fdb)
		mod_hash_destroy_hash(vswp->fdb);

	if (progress & PROG_locks) {
		rw_destroy(&vswp->plist.lockrw);
		rw_destroy(&vswp->mfdbrw);
		rw_destroy(&vswp->if_lockrw);
		mutex_destroy(&vswp->swtmout_lock);
		mutex_destroy(&vswp->mca_lock);
		mutex_destroy(&vswp->mac_lock);
		mutex_destroy(&vswp->hw_lock);
	}

	ddi_soft_state_free(vsw_state, instance);
	return (DDI_FAILURE);
}

static int
vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	vio_mblk_pool_t	*poolp, *npoolp;
	vsw_t		**vswpp, *vswp;
	int		instance;

	instance = ddi_get_instance(dip);
	vswp = ddi_get_soft_state(vsw_state, instance);

	if (vswp == NULL) {
		return (DDI_FAILURE);
	}

	switch (cmd) {
	case DDI_DETACH:
		break;
	case DDI_SUSPEND:
	case DDI_PM_SUSPEND:
	default:
		return (DDI_FAILURE);
	}

	D2(vswp, "detaching instance %d", instance);

	/* Stop any pending timeout to setup switching mode. */
	vsw_stop_switching_timeout(vswp);

	if (vswp->if_state & VSW_IF_REG) {
		if (vsw_mac_unregister(vswp) != 0) {
			cmn_err(CE_WARN, "!vsw%d: Unable to detach from "
			    "MAC layer", vswp->instance);
			return (DDI_FAILURE);
		}
	}

	vsw_mdeg_unregister(vswp);

	/* remove mac layer callback */
	mutex_enter(&vswp->mac_lock);
	if ((vswp->mh != NULL) && (vswp->mrh != NULL)) {
		mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE);
		vswp->mrh = NULL;
	}
	mutex_exit(&vswp->mac_lock);

	if (vsw_detach_ports(vswp) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to detach ports",
		    vswp->instance);
		return (DDI_FAILURE);
	}

	rw_destroy(&vswp->if_lockrw);

	mutex_destroy(&vswp->hw_lock);

	/*
	 * Now that the ports have been deleted, stop and close
	 * the physical device.
	 */
	mutex_enter(&vswp->mac_lock);

	vsw_mac_detach(vswp);
	vsw_mac_close(vswp);

	mutex_exit(&vswp->mac_lock);

	mutex_destroy(&vswp->mac_lock);
	mutex_destroy(&vswp->swtmout_lock);

	/*
	 * Destroy any free pools that may still exist.
	 */
	poolp = vswp->rxh;
	while (poolp != NULL) {
		npoolp = vswp->rxh = poolp->nextp;
		if (vio_destroy_mblks(poolp) != 0) {
			vswp->rxh = poolp;
			return (DDI_FAILURE);
		}
		poolp = npoolp;
	}

	/*
	 * Remove this instance from any entries it may be on in
	 * the hash table by using the list of addresses maintained
	 * in the vsw_t structure.
	 */
	vsw_del_mcst_vsw(vswp);

	vswp->mcap = NULL;
	mutex_destroy(&vswp->mca_lock);

	/*
	 * By now any pending tasks have finished and the underlying
	 * ldc's have been destroyed, so it's safe to delete the control
	 * message taskq.
	 */
	if (vswp->taskq_p != NULL)
		ddi_taskq_destroy(vswp->taskq_p);

	/*
	 * At this stage all the data pointers in the hash table
	 * should be NULL, as all the ports have been removed and will
	 * have deleted themselves from the port lists which the data
	 * pointers point to. Hence we can destroy the table using the
	 * default destructors.
	 */
	D2(vswp, "vsw_detach: destroying hash tables..");
	mod_hash_destroy_hash(vswp->fdb);
	vswp->fdb = NULL;

	WRITE_ENTER(&vswp->mfdbrw);
	mod_hash_destroy_hash(vswp->mfdb);
	vswp->mfdb = NULL;
	RW_EXIT(&vswp->mfdbrw);
	rw_destroy(&vswp->mfdbrw);

	ddi_remove_minor_node(dip, NULL);

	rw_destroy(&vswp->plist.lockrw);
	WRITE_ENTER(&vsw_rw);
	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
		if (*vswpp == vswp) {
			*vswpp = vswp->next;
			break;
		}
	}
	RW_EXIT(&vsw_rw);
	ddi_soft_state_free(vsw_state, instance);

	return (DDI_SUCCESS);
}

static int
vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	_NOTE(ARGUNUSED(dip))

	vsw_t	*vswp = NULL;
	dev_t	dev = (dev_t)arg;
	int	instance;

	instance = getminor(dev);

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
			*result = NULL;
			return (DDI_FAILURE);
		}
		*result = vswp->dip;
		return (DDI_SUCCESS);

	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)(uintptr_t)instance;
		return (DDI_SUCCESS);

	default:
		*result = NULL;
		return (DDI_FAILURE);
	}
}

/*
 * Get the value of the "vsw-phys-dev" property in the specified
 * node. This property is the name of the physical device that
 * the virtual switch will use to talk to the outside world.
 *
 * Note it is valid for this property to be NULL (but the property
 * itself must exist). Callers of this routine should verify that
 * the value returned is what they expected (i.e. either NULL or
 * non-NULL).
 *
 * On success returns the value of the property in the region pointed
 * to by the 'name' argument, with a return value of 0. Otherwise
 * returns 1.
 */
static int
vsw_get_md_physname(vsw_t *vswp, md_t *mdp, mde_cookie_t node, char *name)
{
	int	len = 0;
	char	*physname = NULL;
	char	*dev;

	if (md_get_prop_data(mdp, node, physdev_propname,
	    (uint8_t **)(&physname), &len) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to get name(s) of physical "
		    "device(s) from MD", vswp->instance);
		return (1);
	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
		cmn_err(CE_WARN, "!vsw%d: %s is too long a device name",
		    vswp->instance, physname);
		return (1);
	} else {
		(void) strncpy(name, physname, strlen(physname) + 1);
		D2(vswp, "%s: using first device specified (%s)",
		    __func__, physname);
	}

#ifdef DEBUG
	/*
	 * As a temporary measure to aid testing we check to see if there
	 * is a vsw.conf file present. If there is we use the value of the
	 * vsw_physname property in the file as the name of the physical
	 * device, overriding the value from the MD.
	 *
	 * There may be multiple devices listed, but for the moment
	 * we just use the first one.
	 */
	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
	    "vsw_physname", &dev) == DDI_PROP_SUCCESS) {
		if ((strlen(dev) + 1) > LIFNAMSIZ) {
			cmn_err(CE_WARN, "vsw%d: %s is too long a device name",
			    vswp->instance, dev);
			ddi_prop_free(dev);
			return (1);
		} else {
			cmn_err(CE_NOTE, "vsw%d: Using device name (%s) from "
			    "config file", vswp->instance, dev);

			(void) strncpy(name, dev, strlen(dev) + 1);
		}

		ddi_prop_free(dev);
	}
#endif

	return (0);
}

/*
 * Read the 'vsw-switch-mode' property from the specified MD node.
 *
 * Returns 0 on success and the number of modes found in 'found',
 * otherwise returns 1.
 */
static int
vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
	uint8_t *modes, int *found)
{
	int	len = 0;
	int	smode_num = 0;
	char	*smode = NULL;
	char	*curr_mode = NULL;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Get the switch-mode property. The modes are listed in
	 * decreasing order of preference, i.e. the preferred mode is
	 * the first item in the list.
	 */
	len = 0;
	smode_num = 0;
	if (md_get_prop_data(mdp, node, smode_propname,
	    (uint8_t **)(&smode), &len) != 0) {
		/*
		 * Unable to get switch-mode property from MD, nothing
		 * more we can do.
		 */
		cmn_err(CE_WARN, "!vsw%d: Unable to get switch mode property"
		    " from the MD", vswp->instance);
		*found = 0;
		return (1);
	}

	curr_mode = smode;
	/*
	 * Modes of operation:
	 * 'switched'	 - layer 2 switching, underlying HW in
	 *		   programmed mode.
	 * 'promiscuous' - layer 2 switching, underlying HW in
	 *		   promiscuous mode.
	 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
	 *		   in non-promiscuous mode.
	 */
	while ((curr_mode < (smode + len)) && (smode_num < NUM_SMODES)) {
		D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
		if (strcmp(curr_mode, "switched") == 0) {
			modes[smode_num++] = VSW_LAYER2;
		} else if (strcmp(curr_mode, "promiscuous") == 0) {
			modes[smode_num++] = VSW_LAYER2_PROMISC;
		} else if (strcmp(curr_mode, "routed") == 0) {
			modes[smode_num++] = VSW_LAYER3;
		} else {
			cmn_err(CE_WARN, "!vsw%d: Unknown switch mode %s, "
			    "setting to default switched mode",
			    vswp->instance, curr_mode);
			modes[smode_num++] = VSW_LAYER2;
		}
		curr_mode += strlen(curr_mode) + 1;
	}
	*found = smode_num;

	D2(vswp, "%s: %d modes found", __func__, smode_num);

	D1(vswp, "%s: exit", __func__);

	return (0);
}
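
/*
 * For example (hypothetical MD contents), a 'vsw-switch-mode' list of
 * "switched", "promiscuous" yields modes[] = { VSW_LAYER2,
 * VSW_LAYER2_PROMISC } and *found == 2, i.e. programmed layer 2
 * switching is attempted first, with promiscuous layer 2 as the
 * fallback.
 */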

/*
 * Check to see if the card supports the setting of multiple unicast
 * addresses.
 *
 * Returns 0 if the card supports the programming of multiple unicast
 * addresses, otherwise returns 1.
 */
static int
vsw_get_hw_maddr(vsw_t *vswp)
{
	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->mac_lock));

	if (vswp->mh == NULL)
		return (1);

	if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
		cmn_err(CE_WARN, "!vsw%d: device (%s) does not support "
		    "setting multiple unicast addresses", vswp->instance,
		    vswp->physname);
		return (1);
	}

	D2(vswp, "%s: %d addrs : %d free", __func__,
	    vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Program unicast and multicast addresses of vsw interface and the ports
 * into the physical device.
 */
static void
vsw_set_addrs(vsw_t *vswp)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*port;
	mcst_addr_t	*mcap;
	int		rv;

	READ_ENTER(&vswp->if_lockrw);

	if (vswp->if_state & VSW_IF_UP) {

		/* program unicst addr of vsw interface in the physdev */
		if (vswp->addr_set == VSW_ADDR_UNSET) {
			mutex_enter(&vswp->hw_lock);
			rv = vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
			mutex_exit(&vswp->hw_lock);
			if (rv != 0) {
				cmn_err(CE_NOTE,
				    "!vsw%d: failed to program interface "
				    "unicast address\n", vswp->instance);
			}
			/*
			 * Notify the MAC layer of the changed address.
			 */
			mac_unicst_update(vswp->if_mh,
			    (uint8_t *)&vswp->if_addr);
		}

		/* program mcast addrs of vsw interface in the physdev */
		mutex_enter(&vswp->mca_lock);
		mutex_enter(&vswp->mac_lock);
		for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) {
			if (mcap->mac_added)
				continue;
			rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca);
			if (rv == 0) {
				mcap->mac_added = B_TRUE;
			} else {
				cmn_err(CE_WARN, "!vsw%d: unable to add "
				    "multicast address: %s\n", vswp->instance,
				    ether_sprintf((void *)&mcap->mca));
			}
		}
		mutex_exit(&vswp->mac_lock);
		mutex_exit(&vswp->mca_lock);

	}

	RW_EXIT(&vswp->if_lockrw);

	WRITE_ENTER(&plist->lockrw);

	/* program unicast address of ports in the physical device */
	mutex_enter(&vswp->hw_lock);
	for (port = plist->head; port != NULL; port = port->p_next) {
		if (port->addr_set != VSW_ADDR_UNSET) /* addr already set */
			continue;
		if (vsw_set_hw(vswp, port, VSW_VNETPORT)) {
			cmn_err(CE_NOTE,
			    "!vsw%d: port:%d failed to set unicast address\n",
			    vswp->instance, port->p_instance);
		}
	}
	mutex_exit(&vswp->hw_lock);

	/* program multicast addresses of ports in the physdev */
	for (port = plist->head; port != NULL; port = port->p_next) {
		mutex_enter(&port->mca_lock);
		mutex_enter(&vswp->mac_lock);
		for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) {
			if (mcap->mac_added)
				continue;
			rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca);
			if (rv == 0) {
				mcap->mac_added = B_TRUE;
			} else {
				cmn_err(CE_WARN, "!vsw%d: unable to add "
				    "multicast address: %s\n", vswp->instance,
				    ether_sprintf((void *)&mcap->mca));
			}
		}
		mutex_exit(&vswp->mac_lock);
		mutex_exit(&port->mca_lock);
	}

	RW_EXIT(&plist->lockrw);
}

/*
 * Remove unicast and multicast addresses of vsw interface and the ports
 * from the physical device.
 */
static void
vsw_unset_addrs(vsw_t *vswp)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*port;
	mcst_addr_t	*mcap;

	READ_ENTER(&vswp->if_lockrw);

	if (vswp->if_state & VSW_IF_UP) {

		/*
		 * Remove unicast addr of vsw interface
		 * from current physdev
		 */
		mutex_enter(&vswp->hw_lock);
		(void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
		mutex_exit(&vswp->hw_lock);

		/*
		 * Remove mcast addrs of vsw interface
		 * from current physdev
		 */
		mutex_enter(&vswp->mca_lock);
		mutex_enter(&vswp->mac_lock);
		for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) {
			if (!mcap->mac_added)
				continue;
			(void) mac_multicst_remove(vswp->mh,
			    (uchar_t *)&mcap->mca);
			mcap->mac_added = B_FALSE;
		}
		mutex_exit(&vswp->mac_lock);
		mutex_exit(&vswp->mca_lock);

	}

	RW_EXIT(&vswp->if_lockrw);

	WRITE_ENTER(&plist->lockrw);

	/*
	 * Remove unicast address of ports from the current physical device
	 */
	mutex_enter(&vswp->hw_lock);
	for (port = plist->head; port != NULL; port = port->p_next) {
		/* Remove address if it was programmed into HW. */
		if (port->addr_set == VSW_ADDR_UNSET)
			continue;
		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
	}
	mutex_exit(&vswp->hw_lock);

	/* Remove multicast addresses of ports from the current physdev */
	for (port = plist->head; port != NULL; port = port->p_next) {
		mutex_enter(&port->mca_lock);
		mutex_enter(&vswp->mac_lock);
		for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) {
			if (!mcap->mac_added)
				continue;
			(void) mac_multicst_remove(vswp->mh,
			    (uchar_t *)&mcap->mca);
			mcap->mac_added = B_FALSE;
		}
		mutex_exit(&vswp->mac_lock);
		mutex_exit(&port->mca_lock);
	}

	RW_EXIT(&plist->lockrw);
}

/*
 * Copy the mac address of the vsw into the soft state structure;
 * the address occupies the low-order six bytes of 'macaddr', with
 * octet[5] taken from the least significant byte.
 */
static void
vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr)
{
	int	i;

	WRITE_ENTER(&vswp->if_lockrw);
	for (i = ETHERADDRL - 1; i >= 0; i--) {
		vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
		macaddr >>= 8;
	}
	RW_EXIT(&vswp->if_lockrw);
}

/*
 * Timeout routine to setup switching mode:
 * vsw_setup_switching() is invoked from vsw_attach() or vsw_update_md_prop()
 * initially. If it fails and the error is EAGAIN, then this timeout handler
 * is started to retry vsw_setup_switching(), which is retried until it
 * either succeeds or returns an error other than EAGAIN.
 */
static void
vsw_setup_switching_timeout(void *arg)
{
	vsw_t	*vswp = (vsw_t *)arg;
	int	rv;

	if (vswp->swtmout_enabled == B_FALSE)
		return;

	rv = vsw_setup_switching(vswp);

	if (rv == 0) {
		/*
		 * Successfully setup switching mode.
		 * Program unicst, mcst addrs of vsw
		 * interface and ports in the physdev.
		 */
		vsw_set_addrs(vswp);
	}

	mutex_enter(&vswp->swtmout_lock);

	if (rv == EAGAIN && vswp->swtmout_enabled == B_TRUE) {
		/*
		 * Reschedule timeout() if the error is EAGAIN and the
		 * timeout is still enabled. For errors other than EAGAIN,
		 * we simply return without rescheduling timeout().
		 */
		vswp->swtmout_id =
		    timeout(vsw_setup_switching_timeout, vswp,
		    (vsw_setup_switching_delay * drv_usectohz(MICROSEC)));
		goto exit;
	}

	/* timeout handler completed */
	vswp->swtmout_enabled = B_FALSE;
	vswp->swtmout_id = 0;

exit:
	mutex_exit(&vswp->swtmout_lock);
}

/*
 * Cancel the timeout handler to setup switching mode.
 */
static void
vsw_stop_switching_timeout(vsw_t *vswp)
{
	timeout_id_t tid;

	mutex_enter(&vswp->swtmout_lock);

	tid = vswp->swtmout_id;

	if (tid != 0) {
		/* signal timeout handler to stop */
		vswp->swtmout_enabled = B_FALSE;
		vswp->swtmout_id = 0;
		mutex_exit(&vswp->swtmout_lock);

		(void) untimeout(tid);
	} else {
		mutex_exit(&vswp->swtmout_lock);
	}

	(void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE);

	mutex_enter(&vswp->mac_lock);
	vswp->mac_open_retries = 0;
	mutex_exit(&vswp->mac_lock);
}

/*
 * Setup the required switching mode.
 * This routine is invoked from vsw_attach() or vsw_update_md_prop()
 * initially. If it fails and the error is EAGAIN, then a timeout handler
 * is started to retry vsw_setup_switching(), until it successfully finishes
 * or the returned error is not EAGAIN.
 *
 * Returns:
 *	0 on success.
 *	EAGAIN if retry is needed.
 *	1 on all other failures.
 */
static int
vsw_setup_switching(vsw_t *vswp)
{
	int	i, rv = 1;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Select best switching mode.
	 * Note that we start from the saved smode_idx. This is done as
	 * this routine can be called from the timeout handler to retry
	 * setting up a specific mode. Currently only the function which
	 * sets up layer2/promisc mode returns EAGAIN if the underlying
	 * physical device is not available yet, causing retries.
	 */
	for (i = vswp->smode_idx; i < vswp->smode_num; i++) {
		vswp->smode_idx = i;
		switch (vswp->smode[i]) {
		case VSW_LAYER2:
		case VSW_LAYER2_PROMISC:
			rv = vsw_setup_layer2(vswp);
			break;

		case VSW_LAYER3:
			rv = vsw_setup_layer3(vswp);
			break;

		default:
			DERR(vswp, "unknown switch mode");
			break;
		}

		if ((rv == 0) || (rv == EAGAIN))
			break;

		/* all other errors (rv != 0): continue & select next mode */
		rv = 1;
	}

	if (rv && (rv != EAGAIN)) {
		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
		    "switching mode", vswp->instance);
	} else if (rv == 0) {
		(void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE);
	}

	D2(vswp, "%s: Operating in mode %d", __func__,
	    vswp->smode[vswp->smode_idx]);

	D1(vswp, "%s: exit", __func__);

	return (rv);
}
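
/*
 * For example, with smode[] = { VSW_LAYER2, VSW_LAYER2_PROMISC }:
 * if layer 2 setup fails outright the loop above moves on to
 * promiscuous layer 2, whereas a failure with EAGAIN (underlying
 * device not available yet) keeps the current mode and leaves the
 * retry to vsw_setup_switching_timeout().
 */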

/*
 * Setup for layer 2 switching.
 *
 * Returns:
 *	0 on success.
 *	EAGAIN if retry is needed.
 *	EIO on all other failures.
 */
static int
vsw_setup_layer2(vsw_t *vswp)
{
	int	rv;

	D1(vswp, "%s: enter", __func__);

	vswp->vsw_switch_frame = vsw_switch_l2_frame;

	rv = strlen(vswp->physname);
	if (rv == 0) {
		/*
		 * No physical device name was specified, but one
		 * is required for layer 2.
		 */
		cmn_err(CE_WARN, "!vsw%d: no physical device name specified",
		    vswp->instance);
		return (EIO);
	}

	mutex_enter(&vswp->mac_lock);

	rv = vsw_mac_open(vswp);
	if (rv != 0) {
		if (rv != EAGAIN) {
			cmn_err(CE_WARN, "!vsw%d: Unable to open physical "
			    "device: %s\n", vswp->instance, vswp->physname);
		}
		mutex_exit(&vswp->mac_lock);
		return (rv);
	}

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
		/*
		 * Verify that underlying device can support multiple
		 * unicast mac addresses.
		 */
		rv = vsw_get_hw_maddr(vswp);
		if (rv != 0) {
			cmn_err(CE_WARN, "!vsw%d: Unable to setup "
			    "layer2 switching", vswp->instance);
			goto exit_error;
		}
	}

	/*
	 * Attempt to link into the MAC layer so we can receive and
	 * send packets over the physical adapter.
	 */
	rv = vsw_mac_attach(vswp);
	if (rv != 0) {
		/*
		 * Registration with the MAC layer has failed, so
		 * return an error so that we can fall back to the
		 * next preferred switching method.
		 */
		cmn_err(CE_WARN, "!vsw%d: Unable to setup physical device: "
		    "%s\n", vswp->instance, vswp->physname);
		goto exit_error;
	}

	D1(vswp, "%s: exit", __func__);

	mutex_exit(&vswp->mac_lock);
	return (0);

exit_error:
	vsw_mac_close(vswp);
	mutex_exit(&vswp->mac_lock);
	return (EIO);
}

static int
vsw_setup_layer3(vsw_t *vswp)
{
	D1(vswp, "%s: enter", __func__);

	D2(vswp, "%s: operating in layer 3 mode", __func__);
	vswp->vsw_switch_frame = vsw_switch_l3_frame;

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Open the underlying physical device for access in layer2 mode.
 * Returns:
 *	0 on success
 *	EAGAIN if mac_open() fails because the device is not available yet.
 *	EIO on any other failures.
 */
static int
vsw_mac_open(vsw_t *vswp)
{
	char	drv[LIFNAMSIZ];
	uint_t	ddi_instance;
	int	rv;

	ASSERT(MUTEX_HELD(&vswp->mac_lock));

	if (vswp->mh != NULL) {
		/* already open */
		return (0);
	}

	if (vswp->mac_open_retries++ >= vsw_mac_open_retries) {
		/* exceeded max retries */
		return (EIO);
	}

	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "!vsw%d: invalid device name: %s",
		    vswp->instance, vswp->physname);
		return (EIO);
	}

	/*
	 * Aggregation devices are special in that the device instance
	 * must be set to zero when they are being mac_open()'ed.
	 *
	 * The only way to determine if we are being passed an aggregated
	 * device is to check the device name.
	 */
	if (strcmp(drv, "aggr") == 0) {
		ddi_instance = 0;
	}

	rv = mac_open(vswp->physname, ddi_instance, &vswp->mh);
	if (rv != 0) {
		/*
		 * If mac_open() failed and the error indicates that the
		 * device is not available yet, we return EAGAIN to
		 * indicate that it needs to be retried.
		 * For example, this may happen during boot up, as the
		 * required link aggregation groups (devices) have not
		 * been created yet.
		 */
		if (rv == ENOENT) {
			return (EAGAIN);
		} else {
			cmn_err(CE_WARN, "vsw%d: mac_open %s failed rv:%x",
			    vswp->instance, vswp->physname, rv);
			return (EIO);
		}
	}

	vswp->mac_open_retries = 0;

	return (0);
}

/*
 * Close the underlying physical device.
 */
static void
vsw_mac_close(vsw_t *vswp)
{
	ASSERT(MUTEX_HELD(&vswp->mac_lock));

	if (vswp->mh != NULL) {
		mac_close(vswp->mh);
		vswp->mh = NULL;
	}
}

/*
 * Link into the MAC layer to gain access to the services provided by
 * the underlying physical device driver (which should also have
 * registered with the MAC layer).
 *
 * Only when in layer 2 mode.
 */
static int
vsw_mac_attach(vsw_t *vswp)
{
	D1(vswp, "%s: enter", __func__);

	ASSERT(vswp->mrh == NULL);
	ASSERT(vswp->mstarted == B_FALSE);
	ASSERT(vswp->mresources == B_FALSE);

	ASSERT(MUTEX_HELD(&vswp->mac_lock));

	ASSERT(vswp->mh != NULL);

	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);

	if (vsw_multi_ring_enable) {
		/*
		 * Initialize the ring table.
		 */
		vsw_mac_ring_tbl_init(vswp);

		/*
		 * Register our rx callback function.
		 */
		vswp->mrh = mac_rx_add(vswp->mh,
		    vsw_rx_queue_cb, (void *)vswp);
		ASSERT(vswp->mrh != NULL);

		/*
		 * Register our mac resource callback.
		 */
		mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
		vswp->mresources = B_TRUE;

		/*
		 * Get the ring resources available to us from
		 * the mac below us.
		 */
		mac_resources(vswp->mh);
	} else {
		/*
		 * Just register our rx callback function
		 */
		vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
		ASSERT(vswp->mrh != NULL);
	}

	/* Get the MAC tx fn */
	vswp->txinfo = mac_tx_get(vswp->mh);

	/* start the interface */
	if (mac_start(vswp->mh) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Could not start mac interface",
		    vswp->instance);
		goto mac_fail_exit;
	}

	vswp->mstarted = B_TRUE;

	D1(vswp, "%s: exit", __func__);
	return (0);

mac_fail_exit:
	vsw_mac_detach(vswp);

	D1(vswp, "%s: exit", __func__);
	return (1);
}

static void
vsw_mac_detach(vsw_t *vswp)
{
	D1(vswp, "vsw_mac_detach: enter");

	ASSERT(vswp != NULL);
	ASSERT(MUTEX_HELD(&vswp->mac_lock));

	if (vsw_multi_ring_enable) {
		vsw_mac_ring_tbl_destroy(vswp);
	}

	if (vswp->mh != NULL) {
		if (vswp->mstarted)
			mac_stop(vswp->mh);
		if (vswp->mrh != NULL)
			mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE);
		if (vswp->mresources)
			mac_resource_set(vswp->mh, NULL, NULL);
	}

	vswp->mrh = NULL;
	vswp->txinfo = NULL;
	vswp->mstarted = B_FALSE;

	D1(vswp, "vsw_mac_detach: exit");
}

/*
 * Depending on the mode specified and on the capabilities and capacity
 * of the underlying device, set up the physical device.
 *
 * If in layer 3 mode, then do nothing.
 *
 * If in layer 2 programmed mode, attempt to program the unicast address
 * associated with the port into the physical device. If this is not
 * possible due to resource exhaustion, or simply because the device does
 * not support multiple unicast addresses, then if required fall back to
 * putting the card into promisc mode.
 *
 * If in promisc mode, then simply set the card into promisc mode.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type)
{
	mac_multi_addr_t	mac_addr;
	int			err;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));
	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
		return (0);

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
		return (vsw_set_hw_promisc(vswp, port, type));
	}

	/*
	 * Attempt to program the unicast address into the HW.
	 */
	mac_addr.mma_addrlen = ETHERADDRL;
	if (type == VSW_VNETPORT) {
		ASSERT(port != NULL);
		ether_copy(&port->p_macaddr, &mac_addr.mma_addr);
	} else {
		ether_copy(&vswp->if_addr, &mac_addr.mma_addr);
	}

	err = vsw_set_hw_addr(vswp, &mac_addr);
	if (err == ENOSPC) {
		/*
		 * Mark that an attempt should be made to re-config
		 * sometime in the future if a port is deleted.
		 */
		vswp->recfg_reqd = B_TRUE;

		/*
		 * Only 1 mode specified, nothing more to do.
		 */
		if (vswp->smode_num == 1)
			return (err);

		/*
		 * If promiscuous was the next mode specified, try to
		 * set the card into that mode.
		 */
		if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
		    (vswp->smode[vswp->smode_idx + 1] ==
		    VSW_LAYER2_PROMISC)) {
			vswp->smode_idx += 1;
			return (vsw_set_hw_promisc(vswp, port, type));
		}
		return (err);
	}

	if (err != 0)
		return (err);

	if (type == VSW_VNETPORT) {
		port->addr_slot = mac_addr.mma_slot;
		port->addr_set = VSW_ADDR_HW;
	} else {
		vswp->addr_slot = mac_addr.mma_slot;
		vswp->addr_set = VSW_ADDR_HW;
	}

	D2(vswp, "programmed addr %s into slot %d "
	    "of device %s", ether_sprintf((void *)mac_addr.mma_addr),
	    mac_addr.mma_slot, vswp->physname);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * If in layer 3 mode do nothing.
 *
 * If in layer 2 switched mode remove the address from the physical
 * device.
 *
 * If in layer 2 promiscuous mode disable promisc mode.
 *
 * Returns 0 on success.
 */
static int
vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type)
{
	mac_addr_slot_t	slot;
	int		rv = 0;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
		return (0);

	switch (type) {
	case VSW_VNETPORT:
		ASSERT(port != NULL);

		if (port->addr_set == VSW_ADDR_PROMISC) {
			return (vsw_unset_hw_promisc(vswp, port, type));

		} else if (port->addr_set == VSW_ADDR_HW) {
			slot = port->addr_slot;
			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
				port->addr_set = VSW_ADDR_UNSET;
		}

		break;

	case VSW_LOCALDEV:
		if (vswp->addr_set == VSW_ADDR_PROMISC) {
			return (vsw_unset_hw_promisc(vswp, NULL, type));

		} else if (vswp->addr_set == VSW_ADDR_HW) {
			slot = vswp->addr_slot;
			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
				vswp->addr_set = VSW_ADDR_UNSET;
		}

		break;

	default:
		/* should never happen */
		DERR(vswp, "%s: unknown type %d", __func__, type);
		ASSERT(0);
		return (1);
	}

	D1(vswp, "%s: exit", __func__);
	return (rv);
}

/*
 * Attempt to program a unicast address into HW.
 *
 * Returns 0 on success, otherwise an error value (ENOSPC when the
 * device has exhausted its address slots, EINVAL when multiple
 * address support is absent).
 */
static int
vsw_set_hw_addr(vsw_t *vswp, mac_multi_addr_t *mac)
{
	void	*mah;
	int	rv = EINVAL;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));

	if (vswp->maddr.maddr_handle == NULL)
		return (rv);

	mah = vswp->maddr.maddr_handle;

	rv = vswp->maddr.maddr_add(mah, mac);

	if (rv == 0)
		return (rv);

	/*
	 * It's okay for the add to fail because we have exhausted
	 * all the resources in the hardware device. Any other error
	 * we want to flag.
	 */
	if (rv != ENOSPC) {
		cmn_err(CE_WARN, "!vsw%d: error programming "
		    "address %s into HW err (%d)",
		    vswp->instance, ether_sprintf((void *)mac->mma_addr), rv);
	}
	D1(vswp, "%s: exit", __func__);
	return (rv);
}

/*
 * Remove a unicast mac address which has previously been programmed
 * into HW.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_unset_hw_addr(vsw_t *vswp, int slot)
{
	void	*mah;
	int	rv;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));
	ASSERT(slot >= 0);

	if (vswp->maddr.maddr_handle == NULL)
		return (1);

	mah = vswp->maddr.maddr_handle;

	rv = vswp->maddr.maddr_remove(mah, slot);
	if (rv != 0) {
		cmn_err(CE_WARN, "!vsw%d: unable to remove address "
		    "from slot %d in device %s (err %d)",
		    vswp->instance, slot, vswp->physname, rv);
		return (1);
	}

	D2(vswp, "removed addr from slot %d in device %s",
	    slot, vswp->physname);

	D1(vswp, "%s: exit", __func__);
	return (0);
}
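
/*
 * The two routines below reference-count promiscuous mode via
 * vswp->promisc_cnt: the device is switched into promiscuous mode
 * when the first port (or the local interface) requires it, and back
 * out only when the last such reference is dropped.
 */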

/*
 * Set network card into promisc mode.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
{
	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));
	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

	mutex_enter(&vswp->mac_lock);
	if (vswp->mh == NULL) {
		mutex_exit(&vswp->mac_lock);
		return (1);
	}

	if (vswp->promisc_cnt++ == 0) {
		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
			vswp->promisc_cnt--;
			mutex_exit(&vswp->mac_lock);
			return (1);
		}
		cmn_err(CE_NOTE, "!vsw%d: switching device %s into "
		    "promiscuous mode", vswp->instance, vswp->physname);
	}
	mutex_exit(&vswp->mac_lock);

	if (type == VSW_VNETPORT) {
		ASSERT(port != NULL);
		port->addr_set = VSW_ADDR_PROMISC;
	} else {
		vswp->addr_set = VSW_ADDR_PROMISC;
	}

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Turn off promiscuous mode on network card.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
{
	vsw_port_list_t	*plist = &vswp->plist;

	D2(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));
	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

	mutex_enter(&vswp->mac_lock);
	if (vswp->mh == NULL) {
		mutex_exit(&vswp->mac_lock);
		return (1);
	}

	if (--vswp->promisc_cnt == 0) {
		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
			vswp->promisc_cnt++;
			mutex_exit(&vswp->mac_lock);
			return (1);
		}

		/*
		 * We are exiting promisc mode either because we were
		 * only in promisc mode because we had failed over from
		 * switched mode due to HW resource issues, or the user
		 * wanted the card in promisc mode for all the ports and
		 * the last port is now being deleted. Tweak the message
		 * accordingly.
		 */
		if (plist->num_ports != 0) {
			cmn_err(CE_NOTE, "!vsw%d: switching device %s back to "
			    "programmed mode", vswp->instance, vswp->physname);
		} else {
			cmn_err(CE_NOTE, "!vsw%d: switching device %s out of "
			    "promiscuous mode", vswp->instance,
			    vswp->physname);
		}
	}
	mutex_exit(&vswp->mac_lock);

	if (type == VSW_VNETPORT) {
		ASSERT(port != NULL);
		ASSERT(port->addr_set == VSW_ADDR_PROMISC);
		port->addr_set = VSW_ADDR_UNSET;
	} else {
		ASSERT(vswp->addr_set == VSW_ADDR_PROMISC);
		vswp->addr_set = VSW_ADDR_UNSET;
	}

	D1(vswp, "%s: exit", __func__);
	return (0);
}
2032 * 2033 * This can happen in two cases - switched was specified as 2034 * the preferred mode of operation but we exhausted the HW 2035 * resources and so failed over to the next specified mode, 2036 * or switched was the only mode specified so after HW 2037 * resources were exhausted there was nothing more we 2038 * could do. 2039 */ 2040 if (vswp->smode_idx > 0) 2041 s_idx = vswp->smode_idx - 1; 2042 else 2043 s_idx = vswp->smode_idx; 2044 2045 if (vswp->smode[s_idx] != VSW_LAYER2) { 2046 return; 2047 } 2048 2049 D2(vswp, "%s: attempting reconfig..", __func__); 2050 2051 /* 2052 * First, attempt to set the vswitch mac address into HW, 2053 * if required. 2054 */ 2055 if (vsw_prog_if(vswp)) { 2056 return; 2057 } 2058 2059 /* 2060 * Next, attempt to set any ports which have not yet been 2061 * programmed into HW. 2062 */ 2063 if (vsw_prog_ports(vswp)) { 2064 return; 2065 } 2066 2067 /* 2068 * By now we know that we have programmed all desired ports etc. 2069 * into HW, so it is safe to mark reconfiguration as complete. 2070 */ 2071 vswp->recfg_reqd = B_FALSE; 2072 2073 vswp->smode_idx = s_idx; 2074 2075 D1(vswp, "%s: exit", __func__); 2076 } 2077 2078 /* 2079 * Check to see if vsw itself is plumbed, and if so whether or not 2080 * its mac address should be written into HW. 2081 * 2082 * Returns 0 if the address was set, or did not need to be set. 2083 * Returns 1 if failed to set address. 2084 */ 2085 static int 2086 vsw_prog_if(vsw_t *vswp) 2087 { 2088 mac_multi_addr_t addr; 2089 2090 D1(vswp, "%s: enter", __func__); 2091 2092 ASSERT(MUTEX_HELD(&vswp->hw_lock)); 2093 2094 READ_ENTER(&vswp->if_lockrw); 2095 if ((vswp->if_state & VSW_IF_UP) && 2096 (vswp->addr_set != VSW_ADDR_HW)) { 2097 2098 addr.mma_addrlen = ETHERADDRL; 2099 ether_copy(&vswp->if_addr, &addr.mma_addr); 2100 2101 if (vsw_set_hw_addr(vswp, &addr) != 0) { 2102 RW_EXIT(&vswp->if_lockrw); 2103 return (1); 2104 } 2105 2106 vswp->addr_slot = addr.mma_slot; 2107 2108 /* 2109 * If, when previously plumbed, the interface had to be 2110 * placed into promisc mode, reverse that now. 2111 * 2112 * Note that the interface will only actually be set into 2113 * non-promisc mode when the last port/interface has been 2114 * programmed into HW. 2115 */ 2116 if (vswp->addr_set == VSW_ADDR_PROMISC) 2117 (void) vsw_unset_hw_promisc(vswp, NULL, VSW_LOCALDEV); 2118 2119 vswp->addr_set = VSW_ADDR_HW; 2120 } 2121 RW_EXIT(&vswp->if_lockrw); 2122 2123 D1(vswp, "%s: exit", __func__); 2124 return (0); 2125 } 2126 2127 /* 2128 * Scan the port list for any ports which have not yet been set 2129 * into HW. For those found attempt to program their mac addresses 2130 * into the physical device. 2131 * 2132 * Returns 0 if able to program all required ports (can be 0) into HW. 2133 * Returns 1 if failed to set at least one mac address. 2134 */ 2135 static int 2136 vsw_prog_ports(vsw_t *vswp) 2137 { 2138 mac_multi_addr_t addr; 2139 vsw_port_list_t *plist = &vswp->plist; 2140 vsw_port_t *tp; 2141 int rv = 0; 2142 2143 D1(vswp, "%s: enter", __func__); 2144 2145 ASSERT(MUTEX_HELD(&vswp->hw_lock)); 2146 2147 READ_ENTER(&plist->lockrw); 2148 for (tp = plist->head; tp != NULL; tp = tp->p_next) { 2149 if (tp->addr_set != VSW_ADDR_HW) { 2150 addr.mma_addrlen = ETHERADDRL; 2151 ether_copy(&tp->p_macaddr, &addr.mma_addr); 2152 2153 if (vsw_set_hw_addr(vswp, &addr) != 0) { 2154 rv = 1; 2155 break; 2156 } 2157 2158 tp->addr_slot = addr.mma_slot; 2159 2160 /* 2161 * If, when this port first attached, we had to 2162 * place the interface into promisc mode, then 2163 * reverse that now.
2164 * 2165 * Note that the interface will not actually 2166 * change to non-promisc mode until all ports 2167 * have been programmed. 2168 */ 2169 if (tp->addr_set == VSW_ADDR_PROMISC) 2170 (void) vsw_unset_hw_promisc(vswp, 2171 tp, VSW_VNETPORT); 2172 2173 tp->addr_set = VSW_ADDR_HW; 2174 } 2175 } 2176 RW_EXIT(&plist->lockrw); 2177 2178 D1(vswp, "%s: exit", __func__); 2179 return (rv); 2180 } 2181 2182 static void 2183 vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp) 2184 { 2185 ringp->ring_state = VSW_MAC_RING_FREE; 2186 ringp->ring_arg = NULL; 2187 ringp->ring_blank = NULL; 2188 ringp->ring_vqp = NULL; 2189 ringp->ring_vswp = vswp; 2190 } 2191 2192 static void 2193 vsw_mac_ring_tbl_init(vsw_t *vswp) 2194 { 2195 int i; 2196 2197 mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL); 2198 2199 vswp->mac_ring_tbl_sz = vsw_mac_rx_rings; 2200 vswp->mac_ring_tbl = 2201 kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t), KM_SLEEP); 2202 2203 for (i = 0; i < vswp->mac_ring_tbl_sz; i++) 2204 vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]); 2205 } 2206 2207 static void 2208 vsw_mac_ring_tbl_destroy(vsw_t *vswp) 2209 { 2210 int i; 2211 vsw_mac_ring_t *ringp; 2212 2213 mutex_enter(&vswp->mac_ring_lock); 2214 for (i = 0; i < vswp->mac_ring_tbl_sz; i++) { 2215 ringp = &vswp->mac_ring_tbl[i]; 2216 2217 if (ringp->ring_state != VSW_MAC_RING_FREE) { 2218 /* 2219 * Destroy the queue. 2220 */ 2221 vsw_queue_stop(ringp->ring_vqp); 2222 vsw_queue_destroy(ringp->ring_vqp); 2223 2224 /* 2225 * Re-initialize the structure. 2226 */ 2227 vsw_mac_ring_tbl_entry_init(vswp, ringp); 2228 } 2229 } 2230 mutex_exit(&vswp->mac_ring_lock); 2231 2232 mutex_destroy(&vswp->mac_ring_lock); 2233 kmem_free(vswp->mac_ring_tbl, 2234 vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t)); 2235 vswp->mac_ring_tbl_sz = 0; 2236 } 2237 2238 /* 2239 * Handle resource add callbacks from the driver below. 2240 */ 2241 static mac_resource_handle_t 2242 vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp) 2243 { 2244 vsw_t *vswp = (vsw_t *)arg; 2245 mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; 2246 vsw_mac_ring_t *ringp; 2247 vsw_queue_t *vqp; 2248 int i; 2249 2250 ASSERT(vswp != NULL); 2251 ASSERT(mrp != NULL); 2252 ASSERT(vswp->mac_ring_tbl != NULL); 2253 2254 D1(vswp, "%s: enter", __func__); 2255 2256 /* 2257 * Check to make sure we have the correct resource type. 2258 */ 2259 if (mrp->mr_type != MAC_RX_FIFO) 2260 return (NULL); 2261 2262 /* 2263 * Find an open entry in the ring table. 2264 */ 2265 mutex_enter(&vswp->mac_ring_lock); 2266 for (i = 0; i < vswp->mac_ring_tbl_sz; i++) { 2267 ringp = &vswp->mac_ring_tbl[i]; 2268 2269 /* 2270 * Check for an empty slot; if found, set up the queue 2271 * and thread. 2272 */ 2273 if (ringp->ring_state == VSW_MAC_RING_FREE) { 2274 /* 2275 * Create the queue for this ring. 2276 */ 2277 vqp = vsw_queue_create(); 2278 2279 /* 2280 * Initialize the ring data structure. 2281 */ 2282 ringp->ring_vqp = vqp; 2283 ringp->ring_arg = mrfp->mrf_arg; 2284 ringp->ring_blank = mrfp->mrf_blank; 2285 ringp->ring_state = VSW_MAC_RING_INUSE; 2286 2287 /* 2288 * Create the worker thread. 2289 */ 2290 vqp->vq_worker = thread_create(NULL, 0, 2291 vsw_queue_worker, ringp, 0, &p0, 2292 TS_RUN, minclsyspri); 2293 if (vqp->vq_worker == NULL) { 2294 vsw_queue_destroy(vqp); 2295 vsw_mac_ring_tbl_entry_init(vswp, ringp); 2296 ringp = NULL; 2297 } 2298 2299 if (ringp != NULL) { 2300 /* 2301 * Make sure the thread gets to running state 2302 * for this ring.
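 *
 * The worker (vsw_queue_worker()) signals vq_cv once it has
 * moved the queue from STOPPED to RUNNING, or to DRAINED if it
 * is bailing out, so the cv_wait loop below is guaranteed to
 * see one of those two states.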
2303 */ 2304 mutex_enter(&vqp->vq_lock); 2305 while ((vqp->vq_state != VSW_QUEUE_RUNNING) && 2306 (vqp->vq_state != VSW_QUEUE_DRAINED)) { 2307 cv_wait(&vqp->vq_cv, &vqp->vq_lock); 2308 } 2309 2310 /* 2311 * If the thread is not running, cleanup. 2312 */ 2313 if (vqp->vq_state == VSW_QUEUE_DRAINED) { 2314 vsw_queue_destroy(vqp); 2315 vsw_mac_ring_tbl_entry_init(vswp, 2316 ringp); 2317 ringp = NULL; 2318 } 2319 mutex_exit(&vqp->vq_lock); 2320 } 2321 2322 mutex_exit(&vswp->mac_ring_lock); 2323 D1(vswp, "%s: exit", __func__); 2324 return ((mac_resource_handle_t)ringp); 2325 } 2326 } 2327 mutex_exit(&vswp->mac_ring_lock); 2328 2329 /* 2330 * No slots in the ring table available. 2331 */ 2332 D1(vswp, "%s: exit", __func__); 2333 return (NULL); 2334 } 2335 2336 static void 2337 vsw_queue_stop(vsw_queue_t *vqp) 2338 { 2339 mutex_enter(&vqp->vq_lock); 2340 2341 if (vqp->vq_state == VSW_QUEUE_RUNNING) { 2342 vqp->vq_state = VSW_QUEUE_STOP; 2343 cv_signal(&vqp->vq_cv); 2344 2345 while (vqp->vq_state != VSW_QUEUE_DRAINED) 2346 cv_wait(&vqp->vq_cv, &vqp->vq_lock); 2347 } 2348 2349 vqp->vq_state = VSW_QUEUE_STOPPED; 2350 2351 mutex_exit(&vqp->vq_lock); 2352 } 2353 2354 static vsw_queue_t * 2355 vsw_queue_create() 2356 { 2357 vsw_queue_t *vqp; 2358 2359 vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP); 2360 2361 mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL); 2362 cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL); 2363 vqp->vq_first = NULL; 2364 vqp->vq_last = NULL; 2365 vqp->vq_state = VSW_QUEUE_STOPPED; 2366 2367 return (vqp); 2368 } 2369 2370 static void 2371 vsw_queue_destroy(vsw_queue_t *vqp) 2372 { 2373 cv_destroy(&vqp->vq_cv); 2374 mutex_destroy(&vqp->vq_lock); 2375 kmem_free(vqp, sizeof (vsw_queue_t)); 2376 } 2377 2378 static void 2379 vsw_queue_worker(vsw_mac_ring_t *rrp) 2380 { 2381 mblk_t *mp; 2382 vsw_queue_t *vqp = rrp->ring_vqp; 2383 vsw_t *vswp = rrp->ring_vswp; 2384 2385 mutex_enter(&vqp->vq_lock); 2386 2387 ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED); 2388 2389 /* 2390 * Set the state to running, since the thread is now active. 2391 */ 2392 vqp->vq_state = VSW_QUEUE_RUNNING; 2393 cv_signal(&vqp->vq_cv); 2394 2395 while (vqp->vq_state == VSW_QUEUE_RUNNING) { 2396 /* 2397 * Wait for work to do, or for the state to change 2398 * from running. 2399 */ 2400 while ((vqp->vq_state == VSW_QUEUE_RUNNING) && 2401 (vqp->vq_first == NULL)) { 2402 cv_wait(&vqp->vq_cv, &vqp->vq_lock); 2403 } 2404 2405 /* 2406 * Process packets that we received from the interface. 2407 */ 2408 if (vqp->vq_first != NULL) { 2409 mp = vqp->vq_first; 2410 2411 vqp->vq_first = NULL; 2412 vqp->vq_last = NULL; 2413 2414 mutex_exit(&vqp->vq_lock); 2415 2416 /* switch the chain of packets received */ 2417 vswp->vsw_switch_frame(vswp, mp, 2418 VSW_PHYSDEV, NULL, NULL); 2419 2420 mutex_enter(&vqp->vq_lock); 2421 } 2422 } 2423 2424 /* 2425 * We are drained; signal that we are done. 2426 */ 2427 vqp->vq_state = VSW_QUEUE_DRAINED; 2428 cv_signal(&vqp->vq_cv); 2429 2430 /* 2431 * Release the queue lock. 2432 */ 2433 mutex_exit(&vqp->vq_lock); 2434 2435 /* 2436 * Exit the thread 2437 */ 2438 thread_exit(); 2439 } 2440 2441 /* 2442 * static void 2443 * vsw_rx_queue_cb() - Receive callback routine when 2444 * vsw_multi_ring_enable is non-zero. Queue the packets 2445 * to a packet queue for a worker thread to process.
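 *
 * This is the producer half of a simple producer/consumer pair:
 * the callback appends the whole mblk chain to vq_first/vq_last
 * under vq_lock and signals vq_cv; vsw_queue_worker() detaches
 * the chain and switches it. A sketch of the enqueue step as
 * implemented below:
 *
 *	mutex_enter(&vqp->vq_lock);
 *	if (vqp->vq_first == NULL)
 *		vqp->vq_first = mp;
 *	else
 *		vqp->vq_last->b_next = mp;
 *	vqp->vq_last = last;
 *	cv_signal(&vqp->vq_cv);
 *	mutex_exit(&vqp->vq_lock);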
2446 */ 2447 static void 2448 vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) 2449 { 2450 vsw_mac_ring_t *ringp = (vsw_mac_ring_t *)mrh; 2451 vsw_t *vswp = (vsw_t *)arg; 2452 vsw_queue_t *vqp; 2453 mblk_t *bp, *last; 2454 2455 ASSERT(mrh != NULL); 2456 ASSERT(vswp != NULL); 2457 ASSERT(mp != NULL); 2458 2459 D1(vswp, "%s: enter", __func__); 2460 2461 /* 2462 * Find the last element in the mblk chain. 2463 */ 2464 bp = mp; 2465 do { 2466 last = bp; 2467 bp = bp->b_next; 2468 } while (bp != NULL); 2469 2470 /* Get the queue for the packets */ 2471 vqp = ringp->ring_vqp; 2472 2473 /* 2474 * Grab the lock so that we can queue the packets. 2475 */ 2476 mutex_enter(&vqp->vq_lock); 2477 2478 if (vqp->vq_state != VSW_QUEUE_RUNNING) { 2479 freemsg(mp); 2480 mutex_exit(&vqp->vq_lock); 2481 goto vsw_rx_queue_cb_exit; 2482 } 2483 2484 /* 2485 * Add the mblk chain to the queue. If there 2486 * are already mblks in the queue, add the new 2487 * chain to the end. 2488 */ 2489 if (vqp->vq_first == NULL) 2490 vqp->vq_first = mp; 2491 else 2492 vqp->vq_last->b_next = mp; 2493 2494 vqp->vq_last = last; 2495 2496 /* 2497 * Signal the worker thread that there is work to 2498 * do. 2499 */ 2500 cv_signal(&vqp->vq_cv); 2501 2502 /* 2503 * Let go of the lock and exit. 2504 */ 2505 mutex_exit(&vqp->vq_lock); 2506 2507 vsw_rx_queue_cb_exit: 2508 D1(vswp, "%s: exit", __func__); 2509 } 2510 2511 /* 2512 * Receive callback routine. Invoked by the MAC layer when there 2513 * are pkts being passed up from the physical device. 2514 * 2515 * PERF: It may be more efficient when the card is in promisc 2516 * mode to check the dest address of the pkts here (against 2517 * the FDB) rather than checking later. Needs to be investigated. 2518 */ 2519 static void 2520 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) 2521 { 2522 _NOTE(ARGUNUSED(mrh)) 2523 2524 vsw_t *vswp = (vsw_t *)arg; 2525 2526 ASSERT(vswp != NULL); 2527 2528 D1(vswp, "vsw_rx_cb: enter"); 2529 2530 /* switch the chain of packets received */ 2531 vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL); 2532 2533 D1(vswp, "vsw_rx_cb: exit"); 2534 } 2535 2536 /* 2537 * Send a message out over the physical device via the MAC layer. 2538 * 2539 * Returns any mblks that it was unable to transmit. 2540 */ 2541 static mblk_t * 2542 vsw_tx_msg(vsw_t *vswp, mblk_t *mp) 2543 { 2544 const mac_txinfo_t *mtp; 2545 mblk_t *nextp; 2546 2547 mutex_enter(&vswp->mac_lock); 2548 if ((vswp->mh == NULL) || (vswp->mstarted == B_FALSE)) { 2549 2550 DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail"); 2551 mutex_exit(&vswp->mac_lock); 2552 return (mp); 2553 } else { 2554 for (;;) { 2555 nextp = mp->b_next; 2556 mp->b_next = NULL; 2557 2558 mtp = vswp->txinfo; 2559 2560 if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) { 2561 mp->b_next = nextp; 2562 break; 2563 } 2564 2565 if ((mp = nextp) == NULL) 2566 break; 2567 } 2568 } 2569 mutex_exit(&vswp->mac_lock); 2570 2571 return (mp); 2572 } 2573 2574 /* 2575 * Register with the MAC layer as a network device, so we 2576 * can be plumbed if necessary.
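 *
 * Registration follows the usual MAC provider pattern: allocate
 * a mac_register_t with mac_alloc(), fill in the driver handle,
 * source address, callbacks and SDU limits, call mac_register(),
 * and free the template with mac_free() whatever the outcome.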
2577 */ 2578 static int 2579 vsw_mac_register(vsw_t *vswp) 2580 { 2581 mac_register_t *macp; 2582 int rv; 2583 2584 D1(vswp, "%s: enter", __func__); 2585 2586 if ((macp = mac_alloc(MAC_VERSION)) == NULL) 2587 return (EINVAL); 2588 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 2589 macp->m_driver = vswp; 2590 macp->m_dip = vswp->dip; 2591 macp->m_src_addr = (uint8_t *)&vswp->if_addr; 2592 macp->m_callbacks = &vsw_m_callbacks; 2593 macp->m_min_sdu = 0; 2594 macp->m_max_sdu = ETHERMTU; 2595 rv = mac_register(macp, &vswp->if_mh); 2596 mac_free(macp); 2597 if (rv != 0) { 2598 /* 2599 * Treat this as a non-fatal error as we may be 2600 * able to operate in some other mode. 2601 */ 2602 cmn_err(CE_NOTE, "!vsw%d: Unable to register as " 2603 "a provider with MAC layer", vswp->instance); 2604 return (rv); 2605 } 2606 2607 vswp->if_state |= VSW_IF_REG; 2608 2609 D1(vswp, "%s: exit", __func__); 2610 2611 return (rv); 2612 } 2613 2614 static int 2615 vsw_mac_unregister(vsw_t *vswp) 2616 { 2617 int rv = 0; 2618 2619 D1(vswp, "%s: enter", __func__); 2620 2621 WRITE_ENTER(&vswp->if_lockrw); 2622 2623 if (vswp->if_state & VSW_IF_REG) { 2624 rv = mac_unregister(vswp->if_mh); 2625 if (rv != 0) { 2626 DWARN(vswp, "%s: unable to unregister from MAC " 2627 "framework", __func__); 2628 2629 RW_EXIT(&vswp->if_lockrw); 2630 D1(vswp, "%s: fail exit", __func__); 2631 return (rv); 2632 } 2633 2634 /* mark i/f as down and unregistered */ 2635 vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG); 2636 } 2637 RW_EXIT(&vswp->if_lockrw); 2638 2639 D1(vswp, "%s: exit", __func__); 2640 2641 return (rv); 2642 } 2643 2644 static int 2645 vsw_m_stat(void *arg, uint_t stat, uint64_t *val) 2646 { 2647 vsw_t *vswp = (vsw_t *)arg; 2648 2649 D1(vswp, "%s: enter", __func__); 2650 2651 mutex_enter(&vswp->mac_lock); 2652 if (vswp->mh == NULL) { 2653 mutex_exit(&vswp->mac_lock); 2654 return (EINVAL); 2655 } 2656 2657 /* return stats from underlying device */ 2658 *val = mac_stat_get(vswp->mh, stat); 2659 2660 mutex_exit(&vswp->mac_lock); 2661 2662 return (0); 2663 } 2664 2665 static void 2666 vsw_m_stop(void *arg) 2667 { 2668 vsw_t *vswp = (vsw_t *)arg; 2669 2670 D1(vswp, "%s: enter", __func__); 2671 2672 WRITE_ENTER(&vswp->if_lockrw); 2673 vswp->if_state &= ~VSW_IF_UP; 2674 RW_EXIT(&vswp->if_lockrw); 2675 2676 mutex_enter(&vswp->hw_lock); 2677 2678 (void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV); 2679 2680 if (vswp->recfg_reqd) 2681 vsw_reconfig_hw(vswp); 2682 2683 mutex_exit(&vswp->hw_lock); 2684 2685 D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state); 2686 } 2687 2688 static int 2689 vsw_m_start(void *arg) 2690 { 2691 vsw_t *vswp = (vsw_t *)arg; 2692 2693 D1(vswp, "%s: enter", __func__); 2694 2695 WRITE_ENTER(&vswp->if_lockrw); 2696 2697 vswp->if_state |= VSW_IF_UP; 2698 2699 if (vswp->switching_setup_done == B_FALSE) { 2700 /* 2701 * If the switching mode has not been setup yet, just 2702 * return. The unicast address will be programmed 2703 * after the physical device is successfully setup by the 2704 * timeout handler. 2705 */ 2706 RW_EXIT(&vswp->if_lockrw); 2707 return (0); 2708 } 2709 2710 /* if in layer2 mode, program unicast address. */ 2711 if (vswp->mh != NULL) { 2712 mutex_enter(&vswp->hw_lock); 2713 (void) vsw_set_hw(vswp, NULL, VSW_LOCALDEV); 2714 mutex_exit(&vswp->hw_lock); 2715 } 2716 2717 RW_EXIT(&vswp->if_lockrw); 2718 2719 D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state); 2720 return (0); 2721 } 2722 2723 /* 2724 * Change the local interface address. 2725 * 2726 * Note: we don't support this entry point. 
The local 2727 * mac address of the switch can only be changed via its 2728 * MD node properties. 2729 */ 2730 static int 2731 vsw_m_unicst(void *arg, const uint8_t *macaddr) 2732 { 2733 _NOTE(ARGUNUSED(arg, macaddr)) 2734 2735 return (DDI_FAILURE); 2736 } 2737 2738 static int 2739 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca) 2740 { 2741 vsw_t *vswp = (vsw_t *)arg; 2742 mcst_addr_t *mcst_p = NULL; 2743 uint64_t addr = 0x0; 2744 int i, ret = 0; 2745 2746 D1(vswp, "%s: enter", __func__); 2747 2748 /* 2749 * Convert address into form that can be used 2750 * as hash table key. 2751 */ 2752 for (i = 0; i < ETHERADDRL; i++) { 2753 addr = (addr << 8) | mca[i]; 2754 } 2755 2756 D2(vswp, "%s: addr = 0x%llx", __func__, addr); 2757 2758 if (add) { 2759 D2(vswp, "%s: adding multicast", __func__); 2760 if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) { 2761 /* 2762 * Update the list of multicast addresses 2763 * contained within the vsw_t structure to 2764 * include this new one. 2765 */ 2766 mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP); 2767 if (mcst_p == NULL) { 2768 DERR(vswp, "%s unable to alloc mem", __func__); 2769 (void) vsw_del_mcst(vswp, 2770 VSW_LOCALDEV, addr, NULL); 2771 return (1); 2772 } 2773 mcst_p->addr = addr; 2774 ether_copy(mca, &mcst_p->mca); 2775 2776 /* 2777 * Call into the underlying driver to program the 2778 * address into HW. 2779 */ 2780 mutex_enter(&vswp->mac_lock); 2781 if (vswp->mh != NULL) { 2782 ret = mac_multicst_add(vswp->mh, mca); 2783 if (ret != 0) { 2784 cmn_err(CE_WARN, "!vsw%d: unable to " 2785 "add multicast address", 2786 vswp->instance); 2787 mutex_exit(&vswp->mac_lock); 2788 (void) vsw_del_mcst(vswp, 2789 VSW_LOCALDEV, addr, NULL); 2790 kmem_free(mcst_p, sizeof (*mcst_p)); 2791 return (ret); 2792 } 2793 mcst_p->mac_added = B_TRUE; 2794 } 2795 mutex_exit(&vswp->mac_lock); 2796 2797 mutex_enter(&vswp->mca_lock); 2798 mcst_p->nextp = vswp->mcap; 2799 vswp->mcap = mcst_p; 2800 mutex_exit(&vswp->mca_lock); 2801 } else { 2802 cmn_err(CE_WARN, "!vsw%d: unable to add multicast " 2803 "address", vswp->instance); 2804 } 2805 return (ret); 2806 } 2807 2808 D2(vswp, "%s: removing multicast", __func__); 2809 /* 2810 * Remove the address from the hash table.. 2811 */ 2812 if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) { 2813 2814 /* 2815 * ..and then from the list maintained in the 2816 * vsw_t structure. 
2817 */ 2818 mcst_p = vsw_del_addr(VSW_LOCALDEV, vswp, addr); 2819 ASSERT(mcst_p != NULL); 2820 2821 mutex_enter(&vswp->mac_lock); 2822 if (vswp->mh != NULL && mcst_p->mac_added) { 2823 (void) mac_multicst_remove(vswp->mh, mca); 2824 mcst_p->mac_added = B_FALSE; 2825 } 2826 mutex_exit(&vswp->mac_lock); 2827 kmem_free(mcst_p, sizeof (*mcst_p)); 2828 } 2829 2830 D1(vswp, "%s: exit", __func__); 2831 2832 return (0); 2833 } 2834 2835 static int 2836 vsw_m_promisc(void *arg, boolean_t on) 2837 { 2838 vsw_t *vswp = (vsw_t *)arg; 2839 2840 D1(vswp, "%s: enter", __func__); 2841 2842 WRITE_ENTER(&vswp->if_lockrw); 2843 if (on) 2844 vswp->if_state |= VSW_IF_PROMISC; 2845 else 2846 vswp->if_state &= ~VSW_IF_PROMISC; 2847 RW_EXIT(&vswp->if_lockrw); 2848 2849 D1(vswp, "%s: exit", __func__); 2850 2851 return (0); 2852 } 2853 2854 static mblk_t * 2855 vsw_m_tx(void *arg, mblk_t *mp) 2856 { 2857 vsw_t *vswp = (vsw_t *)arg; 2858 2859 D1(vswp, "%s: enter", __func__); 2860 2861 vswp->vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL); 2862 2863 D1(vswp, "%s: exit", __func__); 2864 2865 return (NULL); 2866 } 2867 2868 /* 2869 * Register for machine description (MD) updates. 2870 * 2871 * Returns 0 on success, 1 on failure. 2872 */ 2873 static int 2874 vsw_mdeg_register(vsw_t *vswp) 2875 { 2876 mdeg_prop_spec_t *pspecp; 2877 mdeg_node_spec_t *inst_specp; 2878 mdeg_handle_t mdeg_hdl, mdeg_port_hdl; 2879 size_t templatesz; 2880 int rv; 2881 2882 D1(vswp, "%s: enter", __func__); 2883 2884 /* 2885 * Allocate and initialize a per-instance copy 2886 * of the global property spec array that will 2887 * uniquely identify this vsw instance. 2888 */ 2889 templatesz = sizeof (vsw_prop_template); 2890 pspecp = kmem_zalloc(templatesz, KM_SLEEP); 2891 2892 bcopy(vsw_prop_template, pspecp, templatesz); 2893 2894 VSW_SET_MDEG_PROP_INST(pspecp, vswp->regprop); 2895 2896 /* initialize the complete prop spec structure */ 2897 inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP); 2898 inst_specp->namep = "virtual-device"; 2899 inst_specp->specp = pspecp; 2900 2901 D2(vswp, "%s: instance %d registering with mdeg", __func__, 2902 vswp->regprop); 2903 /* 2904 * Register an interest in 'virtual-device' nodes with a 2905 * 'name' property of 'virtual-network-switch' 2906 */ 2907 rv = mdeg_register(inst_specp, &vdev_match, vsw_mdeg_cb, 2908 (void *)vswp, &mdeg_hdl); 2909 if (rv != MDEG_SUCCESS) { 2910 DERR(vswp, "%s: mdeg_register failed (%d) for vsw node", 2911 __func__, rv); 2912 goto mdeg_reg_fail; 2913 } 2914 2915 /* 2916 * Register an interest in 'vsw-port' nodes. 
2917 */ 2918 rv = mdeg_register(inst_specp, &vport_match, vsw_port_mdeg_cb, 2919 (void *)vswp, &mdeg_port_hdl); 2920 if (rv != MDEG_SUCCESS) { 2921 DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv); 2922 (void) mdeg_unregister(mdeg_hdl); 2923 goto mdeg_reg_fail; 2924 } 2925 2926 /* save off data that will be needed later */ 2927 vswp->inst_spec = inst_specp; 2928 vswp->mdeg_hdl = mdeg_hdl; 2929 vswp->mdeg_port_hdl = mdeg_port_hdl; 2930 2931 D1(vswp, "%s: exit", __func__); 2932 return (0); 2933 2934 mdeg_reg_fail: 2935 cmn_err(CE_WARN, "!vsw%d: Unable to register MDEG callbacks", 2936 vswp->instance); 2937 kmem_free(pspecp, templatesz); 2938 kmem_free(inst_specp, sizeof (mdeg_node_spec_t)); 2939 2940 vswp->mdeg_hdl = NULL; 2941 vswp->mdeg_port_hdl = NULL; 2942 2943 return (1); 2944 } 2945 2946 static void 2947 vsw_mdeg_unregister(vsw_t *vswp) 2948 { 2949 D1(vswp, "vsw_mdeg_unregister: enter"); 2950 2951 if (vswp->mdeg_hdl != NULL) 2952 (void) mdeg_unregister(vswp->mdeg_hdl); 2953 2954 if (vswp->mdeg_port_hdl != NULL) 2955 (void) mdeg_unregister(vswp->mdeg_port_hdl); 2956 2957 if (vswp->inst_spec != NULL) { 2958 if (vswp->inst_spec->specp != NULL) { 2959 (void) kmem_free(vswp->inst_spec->specp, 2960 sizeof (vsw_prop_template)); 2961 vswp->inst_spec->specp = NULL; 2962 } 2963 2964 (void) kmem_free(vswp->inst_spec, sizeof (mdeg_node_spec_t)); 2965 vswp->inst_spec = NULL; 2966 } 2967 2968 D1(vswp, "vsw_mdeg_unregister: exit"); 2969 } 2970 2971 /* 2972 * Mdeg callback invoked for the vsw node itself. 2973 */ 2974 static int 2975 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp) 2976 { 2977 vsw_t *vswp; 2978 md_t *mdp; 2979 mde_cookie_t node; 2980 uint64_t inst; 2981 char *node_name = NULL; 2982 2983 if (resp == NULL) 2984 return (MDEG_FAILURE); 2985 2986 vswp = (vsw_t *)cb_argp; 2987 2988 D1(vswp, "%s: added %d : removed %d : curr matched %d" 2989 " : prev matched %d", __func__, resp->added.nelem, 2990 resp->removed.nelem, resp->match_curr.nelem, 2991 resp->match_prev.nelem); 2992 2993 /* 2994 * We get an initial callback for this node as 'added' 2995 * after registering with mdeg. Note that we would have 2996 * already gathered information about this vsw node by 2997 * walking MD earlier during attach (in vsw_read_mdprops()). 2998 * So, there is a window where the properties of this 2999 * node might have changed when we get this initial 'added' 3000 * callback. We handle this as if an update occurred 3001 * and invoke the same function which handles updates to 3002 * the properties of this vsw-node if any. 3003 * 3004 * A non-zero 'match' value indicates that the MD has been 3005 * updated and that a virtual-network-switch node is 3006 * present which may or may not have been updated. It is 3007 * up to the clients to examine their own nodes and 3008 * determine if they have changed.
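 *
 * For reference, the mdeg_result_t fields consumed below (a
 * sketch, not the full structure):
 *
 *	resp->added.mdp, resp->added.mdep[], resp->added.nelem
 *		- node(s) newly added to the MD
 *	resp->match_curr.*	- matching node(s) in the current MD
 *	resp->removed.*, resp->match_prev.*
 *		- removals/previous matches (logged only here)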
3009 */ 3010 if (resp->added.nelem != 0) { 3011 3012 if (resp->added.nelem != 1) { 3013 cmn_err(CE_NOTE, "!vsw%d: number of nodes added " 3014 "invalid: %d\n", vswp->instance, resp->added.nelem); 3015 return (MDEG_FAILURE); 3016 } 3017 3018 mdp = resp->added.mdp; 3019 node = resp->added.mdep[0]; 3020 3021 } else if (resp->match_curr.nelem != 0) { 3022 3023 if (resp->match_curr.nelem != 1) { 3024 cmn_err(CE_NOTE, "!vsw%d: number of nodes updated " 3025 "invalid: %d\n", vswp->instance, 3026 resp->match_curr.nelem); 3027 return (MDEG_FAILURE); 3028 } 3029 3030 mdp = resp->match_curr.mdp; 3031 node = resp->match_curr.mdep[0]; 3032 3033 } else { 3034 return (MDEG_FAILURE); 3035 } 3036 3037 /* Validate name and instance */ 3038 if (md_get_prop_str(mdp, node, "name", &node_name) != 0) { 3039 DERR(vswp, "%s: unable to get node name\n", __func__); 3040 return (MDEG_FAILURE); 3041 } 3042 3043 /* is this a virtual-network-switch? */ 3044 if (strcmp(node_name, vsw_propname) != 0) { 3045 DERR(vswp, "%s: Invalid node name: %s\n", 3046 __func__, node_name); 3047 return (MDEG_FAILURE); 3048 } 3049 3050 if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) { 3051 DERR(vswp, "%s: prop(cfg-handle) not found\n", 3052 __func__); 3053 return (MDEG_FAILURE); 3054 } 3055 3056 /* is this the right instance of vsw? */ 3057 if (inst != vswp->regprop) { 3058 DERR(vswp, "%s: Invalid cfg-handle: %lx\n", 3059 __func__, inst); 3060 return (MDEG_FAILURE); 3061 } 3062 3063 vsw_update_md_prop(vswp, mdp, node); 3064 3065 return (MDEG_SUCCESS); 3066 } 3067 3068 /* 3069 * Mdeg callback invoked for changes to the vsw-port nodes 3070 * under the vsw node. 3071 */ 3072 static int 3073 vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *resp) 3074 { 3075 vsw_t *vswp; 3076 int idx; 3077 md_t *mdp; 3078 mde_cookie_t node; 3079 uint64_t inst; 3080 3081 if ((resp == NULL) || (cb_argp == NULL)) 3082 return (MDEG_FAILURE); 3083 3084 vswp = (vsw_t *)cb_argp; 3085 3086 D2(vswp, "%s: added %d : removed %d : curr matched %d" 3087 " : prev matched %d", __func__, resp->added.nelem, 3088 resp->removed.nelem, resp->match_curr.nelem, 3089 resp->match_prev.nelem); 3090 3091 /* process added ports */ 3092 for (idx = 0; idx < resp->added.nelem; idx++) { 3093 mdp = resp->added.mdp; 3094 node = resp->added.mdep[idx]; 3095 3096 D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node); 3097 3098 if (vsw_port_add(vswp, mdp, &node) != 0) { 3099 cmn_err(CE_WARN, "!vsw%d: Unable to add new port " 3100 "(0x%lx)", vswp->instance, node); 3101 } 3102 } 3103 3104 /* process removed ports */ 3105 for (idx = 0; idx < resp->removed.nelem; idx++) { 3106 mdp = resp->removed.mdp; 3107 node = resp->removed.mdep[idx]; 3108 3109 if (md_get_prop_val(mdp, node, id_propname, &inst)) { 3110 DERR(vswp, "%s: prop(%s) not found in port(%d)", 3111 __func__, id_propname, idx); 3112 continue; 3113 } 3114 3115 D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node); 3116 3117 if (vsw_port_detach(vswp, inst) != 0) { 3118 cmn_err(CE_WARN, "!vsw%d: Unable to remove port %ld", 3119 vswp->instance, inst); 3120 } 3121 } 3122 3123 /* 3124 * Currently no support for updating already active ports. 3125 * So, ignore the match_curr and match_prev arrays for now. 3126 */ 3127 3128 D1(vswp, "%s: exit", __func__); 3129 3130 return (MDEG_SUCCESS); 3131 } 3132 3133 /* 3134 * Scan the machine description for this instance of vsw 3135 * and read its properties. Called only from vsw_attach(). 3136 * Returns: 0 on success, 1 on failure.
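 *
 * In outline (a sketch of the flow implemented below):
 *
 *	inst = "reg" property of our node in the OBP device tree;
 *	for each "virtual-device" node in the MD
 *		if (name == "virtual-network-switch" &&
 *		    cfg-handle == inst)
 *			read this instance's properties;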
3137 */ 3138 static int 3139 vsw_read_mdprops(vsw_t *vswp) 3140 { 3141 md_t *mdp = NULL; 3142 mde_cookie_t rootnode; 3143 mde_cookie_t *listp = NULL; 3144 uint64_t inst; 3145 uint64_t cfgh; 3146 char *name; 3147 int rv = 1; 3148 int num_nodes = 0; 3149 int num_devs = 0; 3150 int listsz = 0; 3151 int i; 3152 3153 /* 3154 * In each 'virtual-device' node in the MD there is a 3155 * 'cfg-handle' property which is the MD's concept of 3156 * an instance number (this may be completely different from 3157 * the device driver's instance #). OBP reads that value and 3158 * stores it in the 'reg' property of the appropriate node in 3159 * the device tree. We first read this reg property and use this 3160 * to compare against the 'cfg-handle' property of vsw nodes 3161 * in MD to get to this specific vsw instance and then read 3162 * other properties that we are interested in. 3163 * We also cache the value of the 'reg' property and use it later 3164 * to register callbacks with mdeg (see vsw_mdeg_register()) 3165 */ 3166 inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip, 3167 DDI_PROP_DONTPASS, reg_propname, -1); 3168 if (inst == -1) { 3169 cmn_err(CE_NOTE, "!vsw%d: Unable to read %s property from " 3170 "OBP device tree", vswp->instance, reg_propname); 3171 return (rv); 3172 } 3173 3174 vswp->regprop = inst; 3175 3176 if ((mdp = md_get_handle()) == NULL) { 3177 DWARN(vswp, "%s: cannot init MD\n", __func__); 3178 return (rv); 3179 } 3180 3181 num_nodes = md_node_count(mdp); 3182 ASSERT(num_nodes > 0); 3183 3184 listsz = num_nodes * sizeof (mde_cookie_t); 3185 listp = (mde_cookie_t *)kmem_zalloc(listsz, KM_SLEEP); 3186 3187 rootnode = md_root_node(mdp); 3188 3189 /* search for all "virtual-device" nodes */ 3190 num_devs = md_scan_dag(mdp, rootnode, 3191 md_find_name(mdp, vdev_propname), 3192 md_find_name(mdp, "fwd"), listp); 3193 if (num_devs <= 0) { 3194 DWARN(vswp, "%s: invalid num_devs:%d\n", __func__, num_devs); 3195 goto vsw_readmd_exit; 3196 } 3197 3198 /* 3199 * Now loop through the list of virtual-devices looking for 3200 * devices with name "virtual-network-switch" and for each 3201 * such device compare its instance with what we have from 3202 * the 'reg' property to find the right node in MD and then 3203 * read all its properties. 3204 */ 3205 for (i = 0; i < num_devs; i++) { 3206 3207 if (md_get_prop_str(mdp, listp[i], "name", &name) != 0) { 3208 DWARN(vswp, "%s: name property not found\n", 3209 __func__); 3210 goto vsw_readmd_exit; 3211 } 3212 3213 /* is this a virtual-network-switch? */ 3214 if (strcmp(name, vsw_propname) != 0) 3215 continue; 3216 3217 if (md_get_prop_val(mdp, listp[i], "cfg-handle", &cfgh) != 0) { 3218 DWARN(vswp, "%s: cfg-handle property not found\n", 3219 __func__); 3220 goto vsw_readmd_exit; 3221 } 3222 3223 /* is this the required instance of vsw? */ 3224 if (inst != cfgh) 3225 continue; 3226 3227 /* now read all properties of this vsw instance */ 3228 rv = vsw_get_initial_md_properties(vswp, mdp, listp[i]); 3229 break; 3230 } 3231 3232 vsw_readmd_exit: 3233 3234 kmem_free(listp, listsz); 3235 (void) md_fini_handle(mdp); 3236 return (rv); 3237 } 3238 3239 /* 3240 * Read the initial start-of-day values from the specified MD node.
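 *
 * The values read are the physical device name, the local MAC
 * address and the list of switching modes; if the mode list is
 * absent or unreadable we fall back to programmed (layer 2) mode.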
3241 */ 3242 static int 3243 vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node) 3244 { 3245 int i; 3246 uint64_t macaddr = 0; 3247 3248 D1(vswp, "%s: enter", __func__); 3249 3250 if (vsw_get_md_physname(vswp, mdp, node, vswp->physname) != 0) { 3251 return (1); 3252 } 3253 3254 /* mac address for vswitch device itself */ 3255 if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) { 3256 cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD", 3257 vswp->instance); 3258 return (1); 3259 } 3260 3261 vsw_save_lmacaddr(vswp, macaddr); 3262 3263 if (vsw_get_md_smodes(vswp, mdp, node, vswp->smode, &vswp->smode_num)) { 3264 cmn_err(CE_WARN, "vsw%d: Unable to read %s property from " 3265 "MD, defaulting to programmed mode", vswp->instance, 3266 smode_propname); 3267 3268 for (i = 0; i < NUM_SMODES; i++) 3269 vswp->smode[i] = VSW_LAYER2; 3270 3271 vswp->smode_num = NUM_SMODES; 3272 } else { 3273 ASSERT(vswp->smode_num != 0); 3274 } 3275 3276 D1(vswp, "%s: exit", __func__); 3277 return (0); 3278 } 3279 3280 /* 3281 * Check to see if the relevant properties in the specified node have 3282 * changed, and if so take the appropriate action. 3283 * 3284 * If any of the properties are missing or invalid we don't take 3285 * any action, as this function should only be invoked when modifications 3286 * have been made to what we assume is a working configuration, which 3287 * we leave active. 3288 * 3289 * Note it is legal for this routine to be invoked even if none of the 3290 * properties in the port node within the MD have actually changed. 3291 */ 3292 static void 3293 vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) 3294 { 3295 char physname[LIFNAMSIZ]; 3296 char drv[LIFNAMSIZ]; 3297 uint_t ddi_instance; 3298 uint8_t new_smode[NUM_SMODES]; 3299 int i, smode_num = 0; 3300 uint64_t macaddr = 0; 3301 enum {MD_init = 0x1, 3302 MD_physname = 0x2, 3303 MD_macaddr = 0x4, 3304 MD_smode = 0x8} updated; 3305 int rv; 3306 3307 updated = MD_init; 3308 3309 D1(vswp, "%s: enter", __func__); 3310 3311 /* 3312 * Check if name of physical device in MD has changed. 3313 */ 3314 if (vsw_get_md_physname(vswp, mdp, node, (char *)&physname) == 0) { 3315 /* 3316 * Do a basic sanity check on the new device name/instance, 3317 * if it is non-NULL. It is valid for the device name to 3318 * have changed from a non-NULL to a NULL value, i.e. 3319 * the vsw is being changed to 'routed' mode. 3320 */ 3321 if ((strlen(physname) != 0) && 3322 (ddi_parse(physname, drv, 3323 &ddi_instance) != DDI_SUCCESS)) { 3324 cmn_err(CE_WARN, "!vsw%d: new device name %s is not" 3325 " a valid device name/instance", 3326 vswp->instance, physname); 3327 goto fail_reconf; 3328 } 3329 3330 if (strcmp(physname, vswp->physname)) { 3331 D2(vswp, "%s: device name changed from %s to %s", 3332 __func__, vswp->physname, physname); 3333 3334 updated |= MD_physname; 3335 } else { 3336 D2(vswp, "%s: device name unchanged at %s", 3337 __func__, vswp->physname); 3338 } 3339 } else { 3340 cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical " 3341 "device from updated MD.", vswp->instance); 3342 goto fail_reconf; 3343 } 3344 3345 /* 3346 * Check if MAC address has changed.
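 *
 * (The MD carries the address as a uint64_t, so it is compared
 * octet by octet against if_addr below, starting from the least
 * significant byte.)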
3347 */ 3348 if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) { 3349 cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD", 3350 vswp->instance); 3351 goto fail_reconf; 3352 } else { 3353 uint64_t maddr = macaddr; 3354 READ_ENTER(&vswp->if_lockrw); 3355 for (i = ETHERADDRL - 1; i >= 0; i--) { 3356 if (vswp->if_addr.ether_addr_octet[i] 3357 != (macaddr & 0xFF)) { 3358 D2(vswp, "%s: octet[%d] 0x%x != 0x%x", 3359 __func__, i, 3360 vswp->if_addr.ether_addr_octet[i], 3361 (macaddr & 0xFF)); 3362 updated |= MD_macaddr; 3363 macaddr = maddr; 3364 break; 3365 } 3366 macaddr >>= 8; 3367 } 3368 RW_EXIT(&vswp->if_lockrw); 3369 if (updated & MD_macaddr) { 3370 vsw_save_lmacaddr(vswp, macaddr); 3371 } 3372 } 3373 3374 /* 3375 * Check if switching modes have changed. 3376 */ 3377 if (vsw_get_md_smodes(vswp, mdp, node, 3378 new_smode, &smode_num)) { 3379 cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from MD", 3380 vswp->instance, smode_propname); 3381 goto fail_reconf; 3382 } else { 3383 ASSERT(smode_num != 0); 3384 if (smode_num != vswp->smode_num) { 3385 D2(vswp, "%s: number of modes changed from %d to %d", 3386 __func__, vswp->smode_num, smode_num); 3387 } 3388 3389 for (i = 0; i < smode_num; i++) { 3390 if (new_smode[i] != vswp->smode[i]) { 3391 D2(vswp, "%s: mode changed from %d to %d", 3392 __func__, vswp->smode[i], new_smode[i]); 3393 updated |= MD_smode; 3394 break; 3395 } 3396 } 3397 } 3398 3399 /* 3400 * Now make any changes which are needed... 3401 */ 3402 3403 if (updated & (MD_physname | MD_smode)) { 3404 3405 /* 3406 * Stop any pending timeout to setup switching mode. 3407 */ 3408 vsw_stop_switching_timeout(vswp); 3409 3410 /* 3411 * Remove unicst, mcst addrs of vsw interface 3412 * and ports from the physdev. 3413 */ 3414 vsw_unset_addrs(vswp); 3415 3416 /* 3417 * Stop, detach and close the old device.. 3418 */ 3419 mutex_enter(&vswp->mac_lock); 3420 3421 vsw_mac_detach(vswp); 3422 vsw_mac_close(vswp); 3423 3424 mutex_exit(&vswp->mac_lock); 3425 3426 /* 3427 * Update phys name. 3428 */ 3429 if (updated & MD_physname) { 3430 cmn_err(CE_NOTE, "!vsw%d: changing from %s to %s", 3431 vswp->instance, vswp->physname, physname); 3432 (void) strncpy(vswp->physname, 3433 physname, strlen(physname) + 1); 3434 } 3435 3436 /* 3437 * Update array with the new switch mode values. 3438 */ 3439 if (updated & MD_smode) { 3440 for (i = 0; i < smode_num; i++) 3441 vswp->smode[i] = new_smode[i]; 3442 3443 vswp->smode_num = smode_num; 3444 vswp->smode_idx = 0; 3445 } 3446 3447 /* 3448 * ..and attach, start the new device. 3449 */ 3450 rv = vsw_setup_switching(vswp); 3451 if (rv == EAGAIN) { 3452 /* 3453 * Unable to setup switching mode. 3454 * As the error is EAGAIN, schedule a timeout to retry 3455 * and return. Programming addresses of ports and 3456 * vsw interface will be done when the timeout handler 3457 * completes successfully. 3458 */ 3459 mutex_enter(&vswp->swtmout_lock); 3460 3461 vswp->swtmout_enabled = B_TRUE; 3462 vswp->swtmout_id = 3463 timeout(vsw_setup_switching_timeout, vswp, 3464 (vsw_setup_switching_delay * 3465 drv_usectohz(MICROSEC))); 3466 3467 mutex_exit(&vswp->swtmout_lock); 3468 3469 return; 3470 3471 } else if (rv) { 3472 goto fail_update; 3473 } 3474 3475 /* 3476 * program unicst, mcst addrs of vsw interface 3477 * and ports in the physdev. 3478 */ 3479 vsw_set_addrs(vswp); 3480 3481 } else if (updated & MD_macaddr) { 3482 /* 3483 * We enter here only if MD_macaddr alone was updated.
3484 * If MD_physname and/or MD_smode are also updated, then 3485 * as part of that, we would have implicitly processed 3486 * MD_macaddr update (above). 3487 */ 3488 cmn_err(CE_NOTE, "!vsw%d: changing mac address to 0x%lx", 3489 vswp->instance, macaddr); 3490 3491 READ_ENTER(&vswp->if_lockrw); 3492 if (vswp->if_state & VSW_IF_UP) { 3493 3494 mutex_enter(&vswp->hw_lock); 3495 /* 3496 * Remove old mac address of vsw interface 3497 * from the physdev 3498 */ 3499 (void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV); 3500 /* 3501 * Program new mac address of vsw interface 3502 * in the physdev 3503 */ 3504 rv = vsw_set_hw(vswp, NULL, VSW_LOCALDEV); 3505 mutex_exit(&vswp->hw_lock); 3506 if (rv != 0) { 3507 cmn_err(CE_NOTE, 3508 "!vsw%d: failed to program interface " 3509 "unicast address\n", vswp->instance); 3510 } 3511 /* 3512 * Notify the MAC layer of the changed address. 3513 */ 3514 mac_unicst_update(vswp->if_mh, 3515 (uint8_t *)&vswp->if_addr); 3516 3517 } 3518 RW_EXIT(&vswp->if_lockrw); 3519 3520 } 3521 3522 return; 3523 3524 fail_reconf: 3525 cmn_err(CE_WARN, "!vsw%d: configuration unchanged", vswp->instance); 3526 return; 3527 3528 fail_update: 3529 cmn_err(CE_WARN, "!vsw%d: update of configuration failed", 3530 vswp->instance); 3531 } 3532 3533 /* 3534 * Add a new port to the system. 3535 * 3536 * Returns 0 on success, 1 on failure. 3537 */ 3538 int 3539 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node) 3540 { 3541 uint64_t ldc_id; 3542 uint8_t *addrp; 3543 int i, addrsz; 3544 int num_nodes = 0, nchan = 0; 3545 int listsz = 0; 3546 mde_cookie_t *listp = NULL; 3547 struct ether_addr ea; 3548 uint64_t macaddr; 3549 uint64_t inst = 0; 3550 vsw_port_t *port; 3551 3552 if (md_get_prop_val(mdp, *node, id_propname, &inst)) { 3553 DWARN(vswp, "%s: prop(%s) not found", __func__, 3554 id_propname); 3555 return (1); 3556 } 3557 3558 /* 3559 * Find the channel endpoint node(s) (which should be under this 3560 * port node) which contain the channel id(s). 
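 *
 * The expected layout is roughly (an illustrative sketch):
 *
 *	vsw-port node { id, remote-mac-address }
 *		--(fwd)--> channel-endpoint node { id = ldc_id }
 *
 * Only the first channel-endpoint node found is used.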
3561 */ 3562 if ((num_nodes = md_node_count(mdp)) <= 0) { 3563 DERR(vswp, "%s: invalid number of nodes found (%d)", 3564 __func__, num_nodes); 3565 return (1); 3566 } 3567 3568 D2(vswp, "%s: %d nodes found", __func__, num_nodes); 3569 3570 /* allocate enough space for node list */ 3571 listsz = num_nodes * sizeof (mde_cookie_t); 3572 listp = kmem_zalloc(listsz, KM_SLEEP); 3573 3574 nchan = md_scan_dag(mdp, *node, md_find_name(mdp, chan_propname), 3575 md_find_name(mdp, "fwd"), listp); 3576 3577 if (nchan <= 0) { 3578 DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname); 3579 kmem_free(listp, listsz); 3580 return (1); 3581 } 3582 3583 D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname); 3584 3585 /* use property from first node found */ 3586 if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) { 3587 DWARN(vswp, "%s: prop(%s) not found\n", __func__, 3588 id_propname); 3589 kmem_free(listp, listsz); 3590 return (1); 3591 } 3592 3593 /* don't need list any more */ 3594 kmem_free(listp, listsz); 3595 3596 D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id); 3597 3598 /* read mac-address property */ 3599 if (md_get_prop_data(mdp, *node, remaddr_propname, 3600 &addrp, &addrsz)) { 3601 DWARN(vswp, "%s: prop(%s) not found", 3602 __func__, remaddr_propname); 3603 return (1); 3604 } 3605 3606 if (addrsz < ETHERADDRL) { 3607 DWARN(vswp, "%s: invalid address size", __func__); 3608 return (1); 3609 } 3610 3611 macaddr = *((uint64_t *)addrp); 3612 D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr); 3613 3614 for (i = ETHERADDRL - 1; i >= 0; i--) { 3615 ea.ether_addr_octet[i] = macaddr & 0xFF; 3616 macaddr >>= 8; 3617 } 3618 3619 if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) { 3620 DERR(vswp, "%s: failed to attach port", __func__); 3621 return (1); 3622 } 3623 3624 port = vsw_lookup_port(vswp, (int)inst); 3625 3626 /* we just successfully created the port, so it should exist */ 3627 ASSERT(port != NULL); 3628 3629 return (0); 3630 } 3631 3632 /* 3633 * Attach the specified port. 3634 * 3635 * Returns 0 on success, 1 on failure. 3636 */ 3637 static int 3638 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids, 3639 struct ether_addr *macaddr) 3640 { 3641 vsw_port_list_t *plist = &vswp->plist; 3642 vsw_port_t *port, **prev_port; 3643 int i; 3644 3645 D1(vswp, "%s: enter : port %d", __func__, p_instance); 3646 3647 /* port already exists? 
*/ 3648 READ_ENTER(&plist->lockrw); 3649 for (port = plist->head; port != NULL; port = port->p_next) { 3650 if (port->p_instance == p_instance) { 3651 DWARN(vswp, "%s: port instance %d already attached", 3652 __func__, p_instance); 3653 RW_EXIT(&plist->lockrw); 3654 return (1); 3655 } 3656 } 3657 RW_EXIT(&plist->lockrw); 3658 3659 port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP); 3660 port->p_vswp = vswp; 3661 port->p_instance = p_instance; 3662 port->p_ldclist.num_ldcs = 0; 3663 port->p_ldclist.head = NULL; 3664 port->addr_set = VSW_ADDR_UNSET; 3665 3666 rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL); 3667 3668 mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL); 3669 mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL); 3670 3671 mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL); 3672 cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL); 3673 3674 mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL); 3675 cv_init(&port->state_cv, NULL, CV_DRIVER, NULL); 3676 port->state = VSW_PORT_INIT; 3677 3678 if (nids > VSW_PORT_MAX_LDCS) { 3679 D2(vswp, "%s: using first of %d ldc ids", 3680 __func__, nids); 3681 nids = VSW_PORT_MAX_LDCS; 3682 } 3683 3684 D2(vswp, "%s: %d nids", __func__, nids); 3685 for (i = 0; i < nids; i++) { 3686 D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]); 3687 if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) { 3688 DERR(vswp, "%s: ldc_attach failed", __func__); 3689 3690 rw_destroy(&port->p_ldclist.lockrw); 3691 3692 cv_destroy(&port->ref_cv); 3693 mutex_destroy(&port->ref_lock); 3694 3695 cv_destroy(&port->state_cv); 3696 mutex_destroy(&port->state_lock); 3697 3698 mutex_destroy(&port->tx_lock); 3699 mutex_destroy(&port->mca_lock); 3700 kmem_free(port, sizeof (vsw_port_t)); 3701 return (1); 3702 } 3703 } 3704 3705 ether_copy(macaddr, &port->p_macaddr); 3706 3707 if (vswp->switching_setup_done == B_TRUE) { 3708 /* 3709 * If the underlying physical device has been setup, 3710 * program the mac address of this port in it. 3711 * Otherwise, port macaddr will be set after the physical 3712 * device is successfully setup by the timeout handler. 3713 */ 3714 mutex_enter(&vswp->hw_lock); 3715 (void) vsw_set_hw(vswp, port, VSW_VNETPORT); 3716 mutex_exit(&vswp->hw_lock); 3717 } 3718 3719 WRITE_ENTER(&plist->lockrw); 3720 3721 /* create the fdb entry for this port/mac address */ 3722 (void) vsw_add_fdb(vswp, port); 3723 3724 /* link it into the list of ports for this vsw instance */ 3725 prev_port = (vsw_port_t **)(&plist->head); 3726 port->p_next = *prev_port; 3727 *prev_port = port; 3728 plist->num_ports++; 3729 3730 RW_EXIT(&plist->lockrw); 3731 3732 /* 3733 * Initialise the port and any ldc's under it. 3734 */ 3735 (void) vsw_init_ldcs(port); 3736 3737 D1(vswp, "%s: exit", __func__); 3738 return (0); 3739 } 3740 3741 /* 3742 * Detach the specified port. 3743 * 3744 * Returns 0 on success, 1 on failure. 3745 */ 3746 static int 3747 vsw_port_detach(vsw_t *vswp, int p_instance) 3748 { 3749 vsw_port_t *port = NULL; 3750 vsw_port_list_t *plist = &vswp->plist; 3751 3752 D1(vswp, "%s: enter: port id %d", __func__, p_instance); 3753 3754 WRITE_ENTER(&plist->lockrw); 3755 3756 if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) { 3757 RW_EXIT(&plist->lockrw); 3758 return (1); 3759 } 3760 3761 if (vsw_plist_del_node(vswp, port)) { 3762 RW_EXIT(&plist->lockrw); 3763 return (1); 3764 } 3765 3766 /* Remove the fdb entry for this port/mac address */ 3767 (void) vsw_del_fdb(vswp, port); 3768 3769 /* Remove any multicast addresses.. 
*/ 3770 vsw_del_mcst_port(port); 3771 3772 /* 3773 * No longer need to hold writer lock on port list now 3774 * that we have unlinked the target port from the list. 3775 */ 3776 RW_EXIT(&plist->lockrw); 3777 3778 /* Remove the address if it was programmed into HW. */ 3779 mutex_enter(&vswp->hw_lock); 3780 3781 /* 3782 * The port's address may not have been set in hardware. This could 3783 * happen if the underlying physical device is not yet available and 3784 * vsw_setup_switching_timeout() is still in progress. 3785 * We remove its addr from hardware only if it has been set before. 3786 */ 3787 if (port->addr_set != VSW_ADDR_UNSET) 3788 (void) vsw_unset_hw(vswp, port, VSW_VNETPORT); 3789 3790 if (vswp->recfg_reqd) 3791 vsw_reconfig_hw(vswp); 3792 3793 mutex_exit(&vswp->hw_lock); 3794 3795 if (vsw_port_delete(port)) { 3796 return (1); 3797 } 3798 3799 D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance); 3800 return (0); 3801 } 3802 3803 /* 3804 * Detach all active ports. 3805 * 3806 * Returns 0 on success, 1 on failure. 3807 */ 3808 static int 3809 vsw_detach_ports(vsw_t *vswp) 3810 { 3811 vsw_port_list_t *plist = &vswp->plist; 3812 vsw_port_t *port = NULL; 3813 3814 D1(vswp, "%s: enter", __func__); 3815 3816 WRITE_ENTER(&plist->lockrw); 3817 3818 while ((port = plist->head) != NULL) { 3819 if (vsw_plist_del_node(vswp, port)) { 3820 DERR(vswp, "%s: Error deleting port %d" 3821 " from port list", __func__, port->p_instance); 3822 RW_EXIT(&plist->lockrw); 3823 return (1); 3824 } 3825 3826 /* Remove the address if it was programmed into HW. */ 3827 mutex_enter(&vswp->hw_lock); 3828 (void) vsw_unset_hw(vswp, port, VSW_VNETPORT); 3829 mutex_exit(&vswp->hw_lock); 3830 3831 /* Remove the fdb entry for this port/mac address */ 3832 (void) vsw_del_fdb(vswp, port); 3833 3834 /* Remove any multicast addresses.. */ 3835 vsw_del_mcst_port(port); 3836 3837 /* 3838 * No longer need to hold the lock on the port list 3839 * now that we have unlinked the target port from the 3840 * list. 3841 */ 3842 RW_EXIT(&plist->lockrw); 3843 if (vsw_port_delete(port)) { 3844 DERR(vswp, "%s: Error deleting port %d", 3845 __func__, port->p_instance); 3846 return (1); 3847 } 3848 WRITE_ENTER(&plist->lockrw); 3849 } 3850 RW_EXIT(&plist->lockrw); 3851 3852 D1(vswp, "%s: exit", __func__); 3853 3854 return (0); 3855 } 3856 3857 /* 3858 * Delete the specified port. 3859 * 3860 * Returns 0 on success, 1 on failure. 3861 */ 3862 static int 3863 vsw_port_delete(vsw_port_t *port) 3864 { 3865 vsw_ldc_list_t *ldcl; 3866 vsw_t *vswp = port->p_vswp; 3867 3868 D1(vswp, "%s: enter : port id %d", __func__, port->p_instance); 3869 3870 (void) vsw_uninit_ldcs(port); 3871 3872 /* 3873 * Wait for any pending ctrl msg tasks which reference this 3874 * port to finish. 3875 */ 3876 if (vsw_drain_port_taskq(port)) 3877 return (1); 3878 3879 /* 3880 * Wait for the port reference count to hit zero.
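 *
 * (The count is held by outstanding work that still references
 * the port; each holder drops the count and signals ref_cv as it
 * completes.)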
3881 */ 3882 mutex_enter(&port->ref_lock); 3883 while (port->ref_cnt != 0) 3884 cv_wait(&port->ref_cv, &port->ref_lock); 3885 mutex_exit(&port->ref_lock); 3886 3887 /* 3888 * Wait for any active callbacks to finish 3889 */ 3890 if (vsw_drain_ldcs(port)) 3891 return (1); 3892 3893 ldcl = &port->p_ldclist; 3894 WRITE_ENTER(&ldcl->lockrw); 3895 while (ldcl->num_ldcs > 0) { 3896 if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) { 3897 cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld", 3898 vswp->instance, ldcl->head->ldc_id); 3899 RW_EXIT(&ldcl->lockrw); 3900 return (1); 3901 } 3902 } 3903 RW_EXIT(&ldcl->lockrw); 3904 3905 rw_destroy(&port->p_ldclist.lockrw); 3906 3907 mutex_destroy(&port->mca_lock); 3908 mutex_destroy(&port->tx_lock); 3909 cv_destroy(&port->ref_cv); 3910 mutex_destroy(&port->ref_lock); 3911 3912 cv_destroy(&port->state_cv); 3913 mutex_destroy(&port->state_lock); 3914 3915 kmem_free(port, sizeof (vsw_port_t)); 3916 3917 D1(vswp, "%s: exit", __func__); 3918 3919 return (0); 3920 } 3921 3922 /* 3923 * Attach a logical domain channel (ldc) under a specified port. 3924 * 3925 * Returns 0 on success, 1 on failure. 3926 */ 3927 static int 3928 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id) 3929 { 3930 vsw_t *vswp = port->p_vswp; 3931 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3932 vsw_ldc_t *ldcp = NULL; 3933 ldc_attr_t attr; 3934 ldc_status_t istatus; 3935 int status = DDI_FAILURE; 3936 int rv; 3937 enum { PROG_init = 0x0, PROG_mblks = 0x1, 3938 PROG_callback = 0x2} 3939 progress; 3940 3941 progress = PROG_init; 3942 3943 D1(vswp, "%s: enter", __func__); 3944 3945 ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP); 3946 if (ldcp == NULL) { 3947 DERR(vswp, "%s: kmem_zalloc failed", __func__); 3948 return (1); 3949 } 3950 ldcp->ldc_id = ldc_id; 3951 3952 /* allocate pool of receive mblks */ 3953 rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh)); 3954 if (rv) { 3955 DWARN(vswp, "%s: unable to create free mblk pool for" 3956 " channel %ld (rv %d)", __func__, ldc_id, rv); 3957 kmem_free(ldcp, sizeof (vsw_ldc_t)); 3958 return (1); 3959 } 3960 3961 progress |= PROG_mblks; 3962 3963 mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL); 3964 mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL); 3965 mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL); 3966 cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL); 3967 rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL); 3968 rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL); 3969 3970 /* required for handshake with peer */ 3971 ldcp->local_session = (uint64_t)ddi_get_lbolt(); 3972 ldcp->peer_session = 0; 3973 ldcp->session_status = 0; 3974 3975 mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL); 3976 ldcp->hss_id = 1; /* Initial handshake session id */ 3977 3978 /* only set for outbound lane, inbound set by peer */ 3979 mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL); 3980 mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL); 3981 vsw_set_lane_attr(vswp, &ldcp->lane_out); 3982 3983 attr.devclass = LDC_DEV_NT_SVC; 3984 attr.instance = ddi_get_instance(vswp->dip); 3985 attr.mode = LDC_MODE_UNRELIABLE; 3986 attr.mtu = VSW_LDC_MTU; 3987 status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle); 3988 if (status != 0) { 3989 DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)", 3990 __func__, ldc_id, status); 3991 goto ldc_attach_fail; 3992 } 3993 3994 status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp); 3995 if (status != 0) { 3996 DERR(vswp, "%s(%lld): ldc_reg_callback failed, 
rv (%d)", 3997 __func__, ldc_id, status); 3998 (void) ldc_fini(ldcp->ldc_handle); 3999 goto ldc_attach_fail; 4000 } 4001 4002 progress |= PROG_callback; 4003 4004 mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL); 4005 4006 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 4007 DERR(vswp, "%s: ldc_status failed", __func__); 4008 mutex_destroy(&ldcp->status_lock); 4009 goto ldc_attach_fail; 4010 } 4011 4012 ldcp->ldc_status = istatus; 4013 ldcp->ldc_port = port; 4014 ldcp->ldc_vswp = vswp; 4015 4016 /* link it into the list of channels for this port */ 4017 WRITE_ENTER(&ldcl->lockrw); 4018 ldcp->ldc_next = ldcl->head; 4019 ldcl->head = ldcp; 4020 ldcl->num_ldcs++; 4021 RW_EXIT(&ldcl->lockrw); 4022 4023 D1(vswp, "%s: exit", __func__); 4024 return (0); 4025 4026 ldc_attach_fail: 4027 mutex_destroy(&ldcp->ldc_txlock); 4028 mutex_destroy(&ldcp->ldc_cblock); 4029 4030 cv_destroy(&ldcp->drain_cv); 4031 4032 rw_destroy(&ldcp->lane_in.dlistrw); 4033 rw_destroy(&ldcp->lane_out.dlistrw); 4034 4035 if (progress & PROG_callback) { 4036 (void) ldc_unreg_callback(ldcp->ldc_handle); 4037 } 4038 4039 if ((progress & PROG_mblks) && (ldcp->rxh != NULL)) { 4040 if (vio_destroy_mblks(ldcp->rxh) != 0) { 4041 /* 4042 * Something odd has happened, as the destroy 4043 * will only fail if some mblks have been allocated 4044 * from the pool already (which shouldn't happen) 4045 * and have not been returned. 4046 * 4047 * Add the pool pointer to a list maintained in 4048 * the device instance. Another attempt will be made 4049 * to free the pool when the device itself detaches. 4050 */ 4051 cmn_err(CE_WARN, "!vsw%d: Creation of ldc channel %ld " 4052 "failed and cannot destroy associated mblk " 4053 "pool", vswp->instance, ldc_id); 4054 ldcp->rxh->nextp = vswp->rxh; 4055 vswp->rxh = ldcp->rxh; 4056 } 4057 } 4058 mutex_destroy(&ldcp->drain_cv_lock); 4059 mutex_destroy(&ldcp->hss_lock); 4060 4061 mutex_destroy(&ldcp->lane_in.seq_lock); 4062 mutex_destroy(&ldcp->lane_out.seq_lock); 4063 kmem_free(ldcp, sizeof (vsw_ldc_t)); 4064 4065 return (1); 4066 } 4067 4068 /* 4069 * Detach a logical domain channel (ldc) belonging to a 4070 * particular port. 4071 * 4072 * Returns 0 on success, 1 on failure. 4073 */ 4074 static int 4075 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id) 4076 { 4077 vsw_t *vswp = port->p_vswp; 4078 vsw_ldc_t *ldcp, *prev_ldcp; 4079 vsw_ldc_list_t *ldcl = &port->p_ldclist; 4080 int rv; 4081 4082 prev_ldcp = ldcl->head; 4083 for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) { 4084 if (ldcp->ldc_id == ldc_id) { 4085 break; 4086 } 4087 } 4088 4089 /* specified ldc id not found */ 4090 if (ldcp == NULL) { 4091 DERR(vswp, "%s: ldcp = NULL", __func__); 4092 return (1); 4093 } 4094 4095 D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id); 4096 4097 /* 4098 * Before we can close the channel we must release any mapped 4099 * resources (e.g. drings). 4100 */ 4101 vsw_free_lane_resources(ldcp, INBOUND); 4102 vsw_free_lane_resources(ldcp, OUTBOUND); 4103 4104 /* 4105 * If the close fails we are in serious trouble, as won't 4106 * be able to delete the parent port. 
4107 */ 4108 if ((rv = ldc_close(ldcp->ldc_handle)) != 0) { 4109 DERR(vswp, "%s: error %d closing channel %lld", 4110 __func__, rv, ldcp->ldc_id); 4111 return (1); 4112 } 4113 4114 (void) ldc_fini(ldcp->ldc_handle); 4115 4116 ldcp->ldc_status = LDC_INIT; 4117 ldcp->ldc_handle = NULL; 4118 ldcp->ldc_vswp = NULL; 4119 4120 if (ldcp->rxh != NULL) { 4121 if (vio_destroy_mblks(ldcp->rxh)) { 4122 /* 4123 * Most likely some mblks are still in use and 4124 * have not been returned to the pool. Add the pool 4125 * to the list maintained in the device instance. 4126 * Another attempt will be made to destroy the pool 4127 * when the device detaches. 4128 */ 4129 ldcp->rxh->nextp = vswp->rxh; 4130 vswp->rxh = ldcp->rxh; 4131 } 4132 } 4133 4134 /* unlink it from the list */ 4135 *prev_ldcp = ldcp->ldc_next; 4136 ldcl->num_ldcs--; 4137 4138 mutex_destroy(&ldcp->ldc_txlock); 4139 mutex_destroy(&ldcp->ldc_cblock); 4140 cv_destroy(&ldcp->drain_cv); 4141 mutex_destroy(&ldcp->drain_cv_lock); 4142 mutex_destroy(&ldcp->hss_lock); 4143 mutex_destroy(&ldcp->lane_in.seq_lock); 4144 mutex_destroy(&ldcp->lane_out.seq_lock); 4145 mutex_destroy(&ldcp->status_lock); 4146 rw_destroy(&ldcp->lane_in.dlistrw); 4147 rw_destroy(&ldcp->lane_out.dlistrw); 4148 4149 kmem_free(ldcp, sizeof (vsw_ldc_t)); 4150 4151 return (0); 4152 } 4153 4154 /* 4155 * Open and attempt to bring up the channel. Note that the channel 4156 * can only be brought up if the peer has also opened the channel. 4157 * 4158 * Returns 0 if the channel can be opened and brought up, otherwise 4159 * returns 1. 4160 */ 4161 static int 4162 vsw_ldc_init(vsw_ldc_t *ldcp) 4163 { 4164 vsw_t *vswp = ldcp->ldc_vswp; 4165 ldc_status_t istatus = 0; 4166 int rv; 4167 4168 D1(vswp, "%s: enter", __func__); 4169 4170 LDC_ENTER_LOCK(ldcp); 4171 4172 /* don't start at 0 in case clients don't like that */ 4173 ldcp->next_ident = 1; 4174 4175 rv = ldc_open(ldcp->ldc_handle); 4176 if (rv != 0) { 4177 DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)", 4178 __func__, ldcp->ldc_id, rv); 4179 LDC_EXIT_LOCK(ldcp); 4180 return (1); 4181 } 4182 4183 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 4184 DERR(vswp, "%s: unable to get status", __func__); 4185 LDC_EXIT_LOCK(ldcp); 4186 return (1); 4187 4188 } else if (istatus != LDC_OPEN && istatus != LDC_READY) { 4189 DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY", 4190 __func__, ldcp->ldc_id, istatus); 4191 LDC_EXIT_LOCK(ldcp); 4192 return (1); 4193 } 4194 4195 mutex_enter(&ldcp->status_lock); 4196 ldcp->ldc_status = istatus; 4197 mutex_exit(&ldcp->status_lock); 4198 4199 rv = ldc_up(ldcp->ldc_handle); 4200 if (rv != 0) { 4201 /* 4202 * Not a fatal error for ldc_up() to fail, as peer 4203 * end point may simply not be ready yet. 4204 */ 4205 D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__, 4206 ldcp->ldc_id, rv); 4207 LDC_EXIT_LOCK(ldcp); 4208 return (1); 4209 } 4210 4211 /* 4212 * The ldc_up() call is non-blocking so we need to explicitly 4213 * check the channel status to see if in fact the channel 4214 * is UP.
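 *
 * In other words a zero return from ldc_up() only means the
 * request was accepted; the channel is treated as up, and a
 * VSW_CONN_UP event generated, only once ldc_status() actually
 * reports LDC_UP.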
/*
 * Open and attempt to bring up the channel. Note that the channel
 * can only be brought up if the peer has also opened the channel.
 *
 * Returns 0 if the channel could be opened and brought up (or is
 * simply awaiting the peer), otherwise returns 1.
 */
static int
vsw_ldc_init(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	ldc_status_t	istatus = 0;
	int		rv;

	D1(vswp, "%s: enter", __func__);

	LDC_ENTER_LOCK(ldcp);

	/* don't start at 0 in case clients don't like that */
	ldcp->next_ident = 1;

	rv = ldc_open(ldcp->ldc_handle);
	if (rv != 0) {
		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
		    __func__, ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
		DERR(vswp, "%s: unable to get status", __func__);
		LDC_EXIT_LOCK(ldcp);
		return (1);

	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
		    __func__, ldcp->ldc_id, istatus);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	mutex_enter(&ldcp->status_lock);
	ldcp->ldc_status = istatus;
	mutex_exit(&ldcp->status_lock);

	rv = ldc_up(ldcp->ldc_handle);
	if (rv != 0) {
		/*
		 * Not a fatal error for ldc_up() to fail, as the peer
		 * end point may simply not be ready yet.
		 */
		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
		    ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	/*
	 * The ldc_up() call is non-blocking so we need to explicitly
	 * check the channel status to see if in fact the channel
	 * is UP.
	 */
	mutex_enter(&ldcp->status_lock);
	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
		DERR(vswp, "%s: unable to get status", __func__);
		mutex_exit(&ldcp->status_lock);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	if (ldcp->ldc_status == LDC_UP) {
		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
		    ldcp->ldc_id, istatus);
		mutex_exit(&ldcp->status_lock);
		LDC_EXIT_LOCK(ldcp);

		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
		return (0);
	}

	mutex_exit(&ldcp->status_lock);
	LDC_EXIT_LOCK(ldcp);

	D1(vswp, "%s: exit", __func__);
	return (0);
}
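/*
 * In outline, the bring-up sequence in vsw_ldc_init() is the usual LDC
 * pattern (a sketch only; the locking and checks above are the
 * authoritative version):
 *
 *	ldc_open(handle);		initialise our end of the channel
 *	ldc_up(handle);			asynchronous; may complete later
 *	ldc_status(handle, &s);		s == LDC_UP only once the peer
 *					has also done an ldc_up()
 *
 * If the channel is not yet UP we simply return and rely on an
 * LDC_EVT_UP event arriving via vsw_ldc_cb() when the peer comes up.
 */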
/* disable callbacks on the channel */
static int
vsw_ldc_uninit(vsw_ldc_t *ldcp)
{
	vsw_t	*vswp = ldcp->ldc_vswp;
	int	rv;

	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);

	LDC_ENTER_LOCK(ldcp);

	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
	if (rv != 0) {
		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	mutex_enter(&ldcp->status_lock);
	ldcp->ldc_status = LDC_INIT;
	mutex_exit(&ldcp->status_lock);

	LDC_EXIT_LOCK(ldcp);

	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);

	return (0);
}

static int
vsw_init_ldcs(vsw_port_t *port)
{
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	vsw_ldc_t	*ldcp;

	READ_ENTER(&ldcl->lockrw);
	for (ldcp = ldcl->head; ldcp != NULL; ldcp = ldcp->ldc_next) {
		(void) vsw_ldc_init(ldcp);
	}
	RW_EXIT(&ldcl->lockrw);

	return (0);
}

static int
vsw_uninit_ldcs(vsw_port_t *port)
{
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	vsw_ldc_t	*ldcp;

	D1(NULL, "vsw_uninit_ldcs: enter\n");

	READ_ENTER(&ldcl->lockrw);
	for (ldcp = ldcl->head; ldcp != NULL; ldcp = ldcp->ldc_next) {
		(void) vsw_ldc_uninit(ldcp);
	}
	RW_EXIT(&ldcl->lockrw);

	D1(NULL, "vsw_uninit_ldcs: exit\n");

	return (0);
}

/*
 * Wait until the callback(s) associated with the ldcs under the specified
 * port have completed.
 *
 * Prior to this function being invoked each channel under this port
 * should have been quiesced via ldc_set_cb_mode(DISABLE).
 *
 * A short explanation of what we are doing below..
 *
 * The simplest approach would be to have a reference counter in
 * the ldc structure which is incremented/decremented by the callbacks as
 * they use the channel. The drain function could then simply disable any
 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
 * there is a tiny window here - before the callback is able to get the lock
 * on the channel it is interrupted and this function gets to execute. It
 * sees that the ref count is zero and believes it is free to delete the
 * associated data structures.
 *
 * We get around this by taking advantage of the fact that before the ldc
 * framework invokes a callback it sets a flag to indicate that there is a
 * callback active (or about to become active). If we attempt to
 * unregister a callback while this active flag is set then the unregister
 * will fail with EWOULDBLOCK.
 *
 * If the unregister fails we do a cv_timedwait. We will either be signaled
 * by the callback as it is exiting (note we have to wait a short period to
 * allow the callback to return fully to the ldc framework and it to clear
 * the active flag), or by the timer expiring. In either case we again attempt
 * the unregister. We repeat this until we can successfully unregister the
 * callback.
 *
 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
 * the case where the callback has finished but the ldc framework has not yet
 * cleared the active flag. In this case we would never get a cv_signal.
 */
static int
vsw_drain_ldcs(vsw_port_t *port)
{
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	vsw_ldc_t	*ldcp;
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	READ_ENTER(&ldcl->lockrw);

	for (ldcp = ldcl->head; ldcp != NULL; ldcp = ldcp->ldc_next) {
		/*
		 * If we can unregister the channel callback then we
		 * know that there is no callback either running or
		 * scheduled to run for this channel so move on to next
		 * channel in the list.
		 */
		mutex_enter(&ldcp->drain_cv_lock);

		/* prompt active callbacks to quit */
		ldcp->drain_state = VSW_LDC_DRAINING;

		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
			D2(vswp, "%s: unreg callback for chan %ld", __func__,
			    ldcp->ldc_id);
			mutex_exit(&ldcp->drain_cv_lock);
			continue;
		} else {
			/*
			 * If we end up here we know that either 1) a callback
			 * is currently executing, 2) it is about to start
			 * (i.e. the ldc framework has set the active flag but
			 * has not actually invoked the callback yet), or 3)
			 * it has finished and has returned to the ldc
			 * framework but the ldc framework has not yet cleared
			 * the active bit.
			 *
			 * Wait for it to finish.
			 */
			while (ldc_unreg_callback(ldcp->ldc_handle)
			    == EWOULDBLOCK)
				(void) cv_timedwait(&ldcp->drain_cv,
				    &ldcp->drain_cv_lock, lbolt + hz);

			mutex_exit(&ldcp->drain_cv_lock);
			D2(vswp, "%s: unreg callback for chan %ld after "
			    "timeout", __func__, ldcp->ldc_id);
		}
	}
	RW_EXIT(&ldcl->lockrw);

	D1(vswp, "%s: exit", __func__);
	return (0);
}
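/*
 * The drain handshake above has a counterpart in vsw_ldc_cb() below: on
 * its way out the callback checks ldcp->drain_state and, if it is set to
 * VSW_LDC_DRAINING, does a cv_signal(&ldcp->drain_cv) so that the
 * cv_timedwait() above wakes promptly rather than waiting out the full
 * one second timeout.
 */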
/*
 * Wait until all tasks which reference this port have completed.
 *
 * Prior to this function being invoked each channel under this port
 * should have been quiesced via ldc_set_cb_mode(DISABLE).
 */
static int
vsw_drain_port_taskq(vsw_port_t *port)
{
	vsw_t	*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Mark the port as in the process of being detached, and
	 * dispatch a marker task to the queue so we know when all
	 * relevant tasks have completed.
	 */
	mutex_enter(&port->state_lock);
	port->state = VSW_PORT_DETACHING;

	if ((vswp->taskq_p == NULL) ||
	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
		DERR(vswp, "%s: unable to dispatch marker task",
		    __func__);
		mutex_exit(&port->state_lock);
		return (1);
	}

	/*
	 * Wait for the marker task to finish.
	 */
	while (port->state != VSW_PORT_DETACHABLE)
		cv_wait(&port->state_cv, &port->state_lock);

	mutex_exit(&port->state_lock);

	D1(vswp, "%s: exit", __func__);

	return (0);
}
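/*
 * The marker task below completes the drain protocol used by
 * vsw_drain_port_taskq(). The pattern relies on the taskq executing
 * requests in dispatch order (which holds when the taskq is created
 * with a single thread), so in outline:
 *
 *	port->state = VSW_PORT_DETACHING;
 *	ddi_taskq_dispatch(taskq, vsw_marker_task, port, ..);
 *	while (port->state != VSW_PORT_DETACHABLE)
 *		cv_wait(&port->state_cv, &port->state_lock);
 *
 * Any task dispatched before the marker must have completed by the time
 * the marker itself runs and sets VSW_PORT_DETACHABLE.
 */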
static void
vsw_marker_task(void *arg)
{
	vsw_port_t	*port = arg;
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	mutex_enter(&port->state_lock);

	/*
	 * No further tasks should be dispatched which reference
	 * this port, so it is now safe to mark it as detachable.
	 */
	port->state = VSW_PORT_DETACHABLE;

	cv_signal(&port->state_cv);

	mutex_exit(&port->state_lock);

	D1(vswp, "%s: exit", __func__);
}

static vsw_port_t *
vsw_lookup_port(vsw_t *vswp, int p_instance)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*port;

	for (port = plist->head; port != NULL; port = port->p_next) {
		if (port->p_instance == p_instance) {
			D2(vswp, "vsw_lookup_port: found p_instance\n");
			return (port);
		}
	}

	return (NULL);
}

/*
 * Search for and remove the specified port from the port
 * list. Returns 0 if able to locate and remove port, otherwise
 * returns 1.
 */
static int
vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*curr_p, *prev_p;

	if (plist->head == NULL)
		return (1);

	curr_p = prev_p = plist->head;

	while (curr_p != NULL) {
		if (curr_p == port) {
			if (prev_p == curr_p) {
				plist->head = curr_p->p_next;
			} else {
				prev_p->p_next = curr_p->p_next;
			}
			plist->num_ports--;
			return (0);
		} else {
			prev_p = curr_p;
			curr_p = curr_p->p_next;
		}
	}

	/* specified port not found on the list */
	return (1);
}

/*
 * Interrupt handler for ldc messages.
 */
static uint_t
vsw_ldc_cb(uint64_t event, caddr_t arg)
{
	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
	vsw_t		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);

	mutex_enter(&ldcp->ldc_cblock);

	mutex_enter(&ldcp->status_lock);
	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
		mutex_exit(&ldcp->status_lock);
		mutex_exit(&ldcp->ldc_cblock);
		return (LDC_SUCCESS);
	}
	mutex_exit(&ldcp->status_lock);

	if (event & LDC_EVT_UP) {
		/*
		 * Channel has come up.
		 */
		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);

		vsw_process_conn_evt(ldcp, VSW_CONN_UP);

		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
	}

	if (event & LDC_EVT_READ) {
		/*
		 * Data available for reading.
		 */
		D2(vswp, "%s: id(%ld) event(%llx) data READ",
		    __func__, ldcp->ldc_id, event);

		vsw_process_pkt(ldcp);

		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);

		goto vsw_cb_exit;
	}

	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);

		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
	}

	/*
	 * Catch either LDC_EVT_WRITE, which we don't support, or any
	 * unknown event.
	 */
	if (event &
	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
	}

vsw_cb_exit:
	mutex_exit(&ldcp->ldc_cblock);

	/*
	 * Let the drain function know we are finishing if it
	 * is waiting.
	 */
	mutex_enter(&ldcp->drain_cv_lock);
	if (ldcp->drain_state == VSW_LDC_DRAINING)
		cv_signal(&ldcp->drain_cv);
	mutex_exit(&ldcp->drain_cv_lock);

	return (LDC_SUCCESS);
}
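/*
 * Note that the 'event' argument to vsw_ldc_cb() is a bitmask, so a
 * single invocation may report several conditions at once. The handler
 * above deals with LDC_EVT_UP first, short-circuits on LDC_EVT_READ
 * (data is never expected alongside a RESET/DOWN in the same event,
 * hence the ASSERTs), then handles LDC_EVT_DOWN/LDC_EVT_RESET, and
 * finally logs anything unexpected such as LDC_EVT_WRITE.
 */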
/*
 * Reinitialise data structures associated with the channel.
 */
static void
vsw_ldc_reinit(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_port_t	*port;
	vsw_ldc_list_t	*ldcl;

	D1(vswp, "%s: enter", __func__);

	port = ldcp->ldc_port;
	ldcl = &port->p_ldclist;

	READ_ENTER(&ldcl->lockrw);

	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);

	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);
	RW_EXIT(&ldcl->lockrw);

	ldcp->lane_in.lstate = 0;
	ldcp->lane_out.lstate = 0;

	/*
	 * Remove the parent port from any multicast groups
	 * it may have registered with. The client must resend
	 * its multicast add command after the handshake completes.
	 */
	(void) vsw_del_fdb(vswp, port);

	vsw_del_mcst_port(port);

	ldcp->peer_session = 0;
	ldcp->session_status = 0;
	ldcp->hcnt = 0;
	ldcp->hphase = VSW_MILESTONE0;

	D1(vswp, "%s: exit", __func__);
}

/*
 * Process a connection event.
 *
 * Note - care must be taken to ensure that this function is
 * not called with the dlistrw lock held.
 */
static void
vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_conn_evt_t	*conn = NULL;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Check if either a reset or restart event is pending
	 * or in progress. If so just return.
	 *
	 * A VSW_CONN_RESET event originates either with an LDC_RESET_EVT
	 * being received by the callback handler, or an ECONNRESET error
	 * code being returned from a ldc_read() or ldc_write() call.
	 *
	 * A VSW_CONN_RESTART event occurs when some error checking code
	 * decides that there is a problem with data from the channel,
	 * and that the handshake should be restarted.
	 */
	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
	    (ldstub((uint8_t *)&ldcp->reset_active)))
		return;

	/*
	 * If it is an LDC_UP event we first check the recorded
	 * state of the channel. If this is UP then we know that
	 * the channel moving to the UP state has already been dealt
	 * with and we don't need to dispatch a new task.
	 *
	 * The reason for this check is that when we do a ldc_up(),
	 * depending on the state of the peer, we may or may not get
	 * an LDC_UP event. As we can't depend on getting an LDC_UP evt
	 * every time we do ldc_up() we explicitly check the channel
	 * status to see whether it has come up (ldc_up() is asynch and
	 * will complete at some undefined time), and take the
	 * appropriate action.
	 *
	 * The flip side of this is that we may get an LDC_UP event
	 * when we have already seen that the channel is up and have
	 * dealt with that.
	 */
	mutex_enter(&ldcp->status_lock);
	if (evt == VSW_CONN_UP) {
		if ((ldcp->ldc_status == LDC_UP) ||
		    (ldcp->reset_active != 0)) {
			mutex_exit(&ldcp->status_lock);
			return;
		}
	}
	mutex_exit(&ldcp->status_lock);

	/*
	 * The transaction group id allows us to identify and discard
	 * any tasks which are still pending on the taskq and refer
	 * to the handshake session we are about to restart or reset.
	 * These stale messages no longer have any real meaning.
	 */
	mutex_enter(&ldcp->hss_lock);
	ldcp->hss_id++;
	mutex_exit(&ldcp->hss_lock);

	ASSERT(vswp->taskq_p != NULL);

	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
		    " connection event", vswp->instance);
		goto err_exit;
	}

	conn->evt = evt;
	conn->ldcp = ldcp;

	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
	    DDI_NOSLEEP) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
		    vswp->instance);

		kmem_free(conn, sizeof (vsw_conn_evt_t));
		goto err_exit;
	}

	D1(vswp, "%s: exit", __func__);
	return;

err_exit:
	/*
	 * We have most likely failed due to a memory shortage. Clear the
	 * flag so that future requests will at least be attempted and
	 * will hopefully succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;
}
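/*
 * The reset_active guard above relies on ldstub(), an atomic
 * test-and-set which returns the previous value of the byte. The idiom
 *
 *	if (ldstub((uint8_t *)&ldcp->reset_active))
 *		return;
 *
 * lets exactly one caller proceed with a RESET/RESTART while any
 * concurrent attempt returns immediately; the winner (or vsw_conn_task()
 * on its behalf) clears the flag again when processing is complete.
 */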
/*
 * Deal with events relating to a connection. Invoked from a taskq.
 */
static void
vsw_conn_task(void *arg)
{
	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
	vsw_ldc_t	*ldcp = NULL;
	vsw_t		*vswp = NULL;
	uint16_t	evt;
	ldc_status_t	curr_status;

	ldcp = conn->ldcp;
	evt = conn->evt;
	vswp = ldcp->ldc_vswp;

	D1(vswp, "%s: enter", __func__);

	/* now that we have copied the data out we can safely free it */
	kmem_free(conn, sizeof (vsw_conn_evt_t));

	mutex_enter(&ldcp->status_lock);
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	/*
	 * If we wish to restart the handshake on this channel, then if
	 * the channel is UP we bring it DOWN to flush the underlying
	 * ldc queue.
	 */
	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
		(void) ldc_down(ldcp->ldc_handle);

	/*
	 * re-init all the associated data structures.
	 */
	vsw_ldc_reinit(ldcp);

	/*
	 * Bring the channel back up (note it does no harm to
	 * do this even if the channel is already UP, it just
	 * becomes effectively a no-op).
	 */
	(void) ldc_up(ldcp->ldc_handle);

	/*
	 * Check if the channel is now UP. This will only happen if
	 * the peer has also done a ldc_up().
	 */
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	ldcp->ldc_status = curr_status;

	/* channel UP so restart handshake by sending version info */
	if (curr_status == LDC_UP) {
		if (ldcp->hcnt++ > vsw_num_handshakes) {
			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
			    " handshake attempts (%d) on channel %ld",
			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
			mutex_exit(&ldcp->status_lock);
			return;
		}

		if (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
		    DDI_NOSLEEP) != DDI_SUCCESS) {
			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
			    vswp->instance);

			/*
			 * Don't count this as a valid restart attempt if we
			 * couldn't send the version msg.
			 */
			if (ldcp->hcnt > 0)
				ldcp->hcnt--;
		}
	}

	/*
	 * Mark that the process is complete by clearing the flag.
	 *
	 * Note it is possible that the taskq dispatch above may have failed,
	 * most likely due to a memory shortage. We still clear the flag so
	 * that future attempts will at least be made and will hopefully
	 * succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;

	mutex_exit(&ldcp->status_lock);

	D1(vswp, "%s: exit", __func__);
}
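/*
 * Handshake overview, as enforced by vsw_check_flag() and driven by
 * vsw_next_milestone() below. Each channel moves through a series of
 * milestones, roughly:
 *
 *	VSW_MILESTONE0	version negotiation (VER INFO/ACK/NACK)
 *	VSW_MILESTONE1	attribute exchange (ATTR INFO/ACK/NACK)
 *	VSW_MILESTONE2	dring registration, if the peer is using
 *			descriptor rings (DRING INFO/ACK/NACK)
 *	VSW_MILESTONE3	RDX exchanged in both directions
 *	VSW_MILESTONE4	handshake complete, data may flow
 *
 * vsw_check_flag() rejects any message which is illegal in the current
 * phase and restarts the handshake when it sees one.
 */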
/*
 * Returns 0 if it was legal for the event signified by flag to have
 * occurred at the time it did. Otherwise returns 1.
 */
int
vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	uint64_t	state;
	uint64_t	phase;

	if (dir == INBOUND)
		state = ldcp->lane_in.lstate;
	else
		state = ldcp->lane_out.lstate;

	phase = ldcp->hphase;

	switch (flag) {
	case VSW_VER_INFO_RECV:
		if (phase > VSW_MILESTONE0) {
			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_VER_ACK_RECV:
	case VSW_VER_NACK_RECV:
		if (!(state & VSW_VER_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
			    "VER_NACK when in state %d\n", ldcp->ldc_id,
			    phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_VER_INFO_SENT;
		break;

	case VSW_ATTR_INFO_RECV:
		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_ATTR_ACK_RECV:
	case VSW_ATTR_NACK_RECV:
		if (!(state & VSW_ATTR_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
			    " or ATTR_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_ATTR_INFO_SENT;
		break;

	case VSW_DRING_INFO_RECV:
		if (phase < VSW_MILESTONE1) {
			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_DRING_ACK_RECV:
	case VSW_DRING_NACK_RECV:
		if (!(state & VSW_DRING_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
			    " or DRING_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_DRING_INFO_SENT;
		break;

	case VSW_RDX_INFO_RECV:
		if (phase < VSW_MILESTONE3) {
			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_RDX_ACK_RECV:
	case VSW_RDX_NACK_RECV:
		if (!(state & VSW_RDX_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
			    "RDX_NACK when in state %d\n", ldcp->ldc_id,
			    phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_RDX_INFO_SENT;
		break;

	case VSW_MCST_INFO_RECV:
		if (phase < VSW_MILESTONE3) {
			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	default:
		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
		    ldcp->ldc_id, flag);
		return (1);
	}

	if (dir == INBOUND)
		ldcp->lane_in.lstate = state;
	else
		ldcp->lane_out.lstate = state;

	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);

	return (0);
}
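/*
 * Note the side effect above: for the ACK/NACK flags vsw_check_flag()
 * also clears the corresponding *_INFO_SENT bit from the lane state, so
 * each ACK/NACK can be consumed exactly once; a duplicate is treated as
 * spurious and triggers a handshake restart.
 */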
void
vsw_next_milestone(vsw_ldc_t *ldcp)
{
	vsw_t	*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
	    ldcp->ldc_id, ldcp->hphase);

	DUMP_FLAGS(ldcp->lane_in.lstate);
	DUMP_FLAGS(ldcp->lane_out.lstate);

	switch (ldcp->hphase) {

	case VSW_MILESTONE0:
		/*
		 * If we haven't started to handshake with our peer,
		 * start to do so now.
		 */
		if (ldcp->lane_out.lstate == 0) {
			D2(vswp, "%s: (chan %lld) starting handshake "
			    "with peer", __func__, ldcp->ldc_id);
			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
		}

		/*
		 * The only way to pass this milestone is to have
		 * successfully negotiated version info.
		 */
		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 0",
			    __func__, ldcp->ldc_id);

			/*
			 * The next milestone is passed when attribute
			 * information has been successfully exchanged.
			 */
			ldcp->hphase = VSW_MILESTONE1;
			vsw_send_attr(ldcp);
		}
		break;

	case VSW_MILESTONE1:
		/*
		 * The only way to pass this milestone is to have
		 * successfully negotiated attribute information.
		 */
		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {

			ldcp->hphase = VSW_MILESTONE2;

			/*
			 * If the peer device has said it wishes to
			 * use descriptor rings then we send it our ring
			 * info, otherwise we just set up a private ring
			 * which uses an internal buffer.
			 */
			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
				vsw_send_dring_info(ldcp);
		}
		break;

	case VSW_MILESTONE2:
		/*
		 * If the peer has indicated in its attribute message that
		 * it wishes to use descriptor rings then the only way
		 * to pass this milestone is for us to have received
		 * valid dring info.
		 *
		 * If the peer is not using descriptor rings then just fall
		 * through.
		 */
		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
		    (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
			break;

		D2(vswp, "%s: (chan %lld) leaving milestone 2",
		    __func__, ldcp->ldc_id);

		ldcp->hphase = VSW_MILESTONE3;
		vsw_send_rdx(ldcp);
		break;

	case VSW_MILESTONE3:
		/*
		 * Pass this milestone when all parameters have been
		 * successfully exchanged and RDX sent in both directions.
		 *
		 * Mark the outbound lane as available to transmit data.
		 */
		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 3",
			    __func__, ldcp->ldc_id);
			D2(vswp, "%s: ** handshake complete (0x%llx : "
			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
			    ldcp->lane_out.lstate);
			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
			ldcp->hphase = VSW_MILESTONE4;
			ldcp->hcnt = 0;
			DISPLAY_STATE();
		} else {
			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
			    __func__, ldcp->lane_in.lstate,
			    ldcp->lane_out.lstate);
		}
		break;

	case VSW_MILESTONE4:
		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
		    ldcp->ldc_id);
		break;

	default:
		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
		    ldcp->ldc_id, ldcp->hphase);
	}

	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
	    ldcp->hphase);
}

/*
 * Check if the major version is supported.
 *
 * Returns 0 if it finds a supported major number, and if necessary
 * adjusts the minor field.
 *
 * Returns 1 if it can't match the major number exactly. Sets major/minor
 * to the next lowest supported values, or to zero if no other values are
 * possible.
 */
static int
vsw_supported_version(vio_ver_msg_t *vp)
{
	int	i;

	D1(NULL, "vsw_supported_version: enter");

	for (i = 0; i < VSW_NUM_VER; i++) {
		if (vsw_versions[i].ver_major == vp->ver_major) {
			/*
			 * Matching major version found. Update the
			 * minor number if necessary.
			 */
			if (vp->ver_minor > vsw_versions[i].ver_minor) {
				D2(NULL, "%s: adjusting minor value from %d "
				    "to %d", __func__, vp->ver_minor,
				    vsw_versions[i].ver_minor);
				vp->ver_minor = vsw_versions[i].ver_minor;
			}

			return (0);
		}

		/*
		 * The peer proposed a major version we don't support;
		 * suggest the next lowest major/minor pairing we do.
		 */
		if (vsw_versions[i].ver_major < vp->ver_major) {
			D2(NULL, "%s: adjusting version from %d:%d to %d:%d",
			    __func__, vp->ver_major, vp->ver_minor,
			    vsw_versions[i].ver_major,
			    vsw_versions[i].ver_minor);
			vp->ver_major = vsw_versions[i].ver_major;
			vp->ver_minor = vsw_versions[i].ver_minor;
			return (1);
		}
	}

	/* No match was possible, zero out fields */
	vp->ver_major = 0;
	vp->ver_minor = 0;

	D1(NULL, "vsw_supported_version: exit");

	return (1);
}
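/*
 * For illustration, assuming (hypothetically) a single supported entry
 * of { 1, 0 } in vsw_versions[]: a peer proposing 1.3 has its minor
 * lowered to 1.0 and we return 0 (accept); a peer proposing 2.0 is
 * given 1.0 and we return 1 (the caller resends it as a suggestion);
 * a peer proposing 0.5 gets 0.0 back, which ends the negotiation.
 */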
/*
 * Main routine for processing messages received over LDC.
 */
static void
vsw_process_pkt(void *arg)
{
	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
	vsw_t		*vswp = ldcp->ldc_vswp;
	size_t		msglen;
	vio_msg_tag_t	tag;
	def_msg_t	dmsg;
	int		rv = 0;

	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);

	/*
	 * If the channel is up, read messages until the channel is empty.
	 */
	do {
		msglen = sizeof (dmsg);
		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);

		if (rv != 0) {
			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) "
			    "len(%d)\n", __func__, ldcp->ldc_id, rv, msglen);
		}

		/* channel has been reset */
		if (rv == ECONNRESET) {
			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
			break;
		}

		if (msglen == 0) {
			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
			    ldcp->ldc_id);
			break;
		}

		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
		    ldcp->ldc_id, msglen);

		/*
		 * Figure out what sort of packet we have gotten by
		 * examining the msg tag, and then switch on it
		 * appropriately.
		 */
		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));

		switch (tag.vio_msgtype) {
		case VIO_TYPE_CTRL:
			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
			break;
		case VIO_TYPE_DATA:
			vsw_process_data_pkt(ldcp, &dmsg, tag);
			break;
		case VIO_TYPE_ERR:
			vsw_process_err_pkt(ldcp, &dmsg, tag);
			break;
		default:
			DERR(vswp, "%s: Unknown tag(%lx) id(%lx)\n",
			    __func__, tag.vio_msgtype, ldcp->ldc_id);
			break;
		}
	} while (msglen);

	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
}
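/*
 * A note on the read loop above: an ECONNRESET return from ldc_read()
 * means the channel itself has gone away, so we kick off reset
 * processing and stop. Any other non-zero return is simply logged and
 * we then inspect msglen: zero bytes means the receive queue is empty
 * and we are done, otherwise we process the message and loop until the
 * queue drains.
 */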
/*
 * Dispatch a task to process a VIO control message.
 */
static void
vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
{
	vsw_ctrl_task_t	*ctaskp = NULL;
	vsw_port_t	*port = ldcp->ldc_port;
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	/*
	 * We need to handle RDX ACK messages in-band as once they
	 * are exchanged it is possible that we will get an
	 * immediate (legitimate) data packet.
	 */
	if ((tag.vio_subtype_env == VIO_RDX) &&
	    (tag.vio_subtype == VIO_SUBTYPE_ACK)) {

		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
			return;

		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
		D2(vswp, "%s (%ld) handling RDX_ACK in place "
		    "(ostate 0x%llx : hphase %d)", __func__,
		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
		vsw_next_milestone(ldcp);
		return;
	}

	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);

	if (ctaskp == NULL) {
		DERR(vswp, "%s: unable to alloc space for ctrl msg",
		    __func__);
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		return;
	}

	ctaskp->ldcp = ldcp;
	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
	mutex_enter(&ldcp->hss_lock);
	ctaskp->hss_id = ldcp->hss_id;
	mutex_exit(&ldcp->hss_lock);

	/*
	 * Dispatch the task to the processing taskq if the port is not in
	 * the process of being detached.
	 */
	mutex_enter(&port->state_lock);
	if (port->state == VSW_PORT_INIT) {
		if ((vswp->taskq_p == NULL) ||
		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
			DERR(vswp, "%s: unable to dispatch task to taskq",
			    __func__);
			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
			mutex_exit(&port->state_lock);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return;
		}
	} else {
		DWARN(vswp, "%s: port %d detaching, not dispatching "
		    "task", __func__, port->p_instance);
	}

	mutex_exit(&port->state_lock);

	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
	    ldcp->ldc_id);
	D1(vswp, "%s: exit", __func__);
}

/*
 * Process a VIO ctrl message. Invoked from taskq.
 */
static void
vsw_process_ctrl_pkt(void *arg)
{
	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
	vsw_ldc_t	*ldcp = ctaskp->ldcp;
	vsw_t		*vswp = ldcp->ldc_vswp;
	vio_msg_tag_t	tag;
	uint16_t	env;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
	env = tag.vio_subtype_env;

	/* stale pkt check */
	mutex_enter(&ldcp->hss_lock);
	if (ctaskp->hss_id < ldcp->hss_id) {
		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
		    " (%ld) handshake session", __func__, ctaskp->hss_id);
		mutex_exit(&ldcp->hss_lock);
		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
		return;
	}
	mutex_exit(&ldcp->hss_lock);

	/* session id check */
	if (ldcp->session_status & VSW_PEER_SESSION) {
		if (ldcp->peer_session != tag.vio_sid) {
			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
			    __func__, ldcp->ldc_id, tag.vio_sid);
			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return;
		}
	}

	/*
	 * Switch on the vio_subtype envelope, then let the lower routines
	 * decide if it is an INFO, ACK or NACK packet.
	 */
	switch (env) {
	case VIO_VER_INFO:
		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
		break;
	case VIO_DRING_REG:
		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
		break;
	case VIO_DRING_UNREG:
		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
		break;
	case VIO_ATTR_INFO:
		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
		break;
	case VNET_MCAST_INFO:
		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
		break;
	case VIO_RDX:
		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
		break;
	default:
		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__,
		    env);
	}

	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
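/*
 * Staleness handling: each control task carries the hss_id that was
 * current when it was dispatched, and vsw_process_conn_evt() bumps
 * ldcp->hss_id whenever the handshake is reset or restarted. Comparing
 * the two above lets a task discover that it belongs to a dead
 * handshake session and discard itself, without the dispatcher having
 * to flush the taskq.
 */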
/*
 * Version negotiation. We can end up here either because our peer
 * has responded to a handshake message we have sent it, or because our
 * peer has initiated a handshake with us. If it is the former then it
 * can only be an ACK or NACK; if it is the latter it can only be an INFO.
 *
 * If it is an ACK we move to the next stage of the handshake, namely
 * attribute exchange. If it is a NACK we see if we can specify another
 * version; if we can't we stop.
 *
 * If it is an INFO we reset all params associated with communication
 * in that direction over this channel (remember the connection is
 * essentially 2 independent simplex channels).
 */
void
vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vio_ver_msg_t	*ver_pkt;
	vsw_t		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a ctrl/version packet so
	 * cast it into the correct structure.
	 */
	ver_pkt = (vio_ver_msg_t *)pkt;

	switch (ver_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");

		/*
		 * Record the session id, which we will use from now
		 * until we see another VER_INFO msg. Even then the
		 * session id in most cases will be unchanged, except
		 * if the channel was reset.
		 */
		if ((ldcp->session_status & VSW_PEER_SESSION) &&
		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
			DERR(vswp, "%s: updating session id for chan %lld "
			    "from %llx to %llx", __func__, ldcp->ldc_id,
			    ldcp->peer_session, ver_pkt->tag.vio_sid);
		}

		ldcp->peer_session = ver_pkt->tag.vio_sid;
		ldcp->session_status |= VSW_PEER_SESSION;

		/* Legal message at this time ? */
		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
			return;

		/*
		 * First check the device class. Currently we only expect
		 * to be talking to a network device. In the future we may
		 * also talk to another switch.
		 */
		if (ver_pkt->dev_class != VDEV_NETWORK) {
			DERR(vswp, "%s: illegal device class %d", __func__,
			    ver_pkt->dev_class);

			ver_pkt->tag.vio_sid = ldcp->local_session;
			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);

			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
			    sizeof (vio_ver_msg_t), B_TRUE);

			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
			vsw_next_milestone(ldcp);
			return;
		} else {
			ldcp->dev_class = ver_pkt->dev_class;
		}

		/*
		 * Now check the version.
		 */
		if (vsw_supported_version(ver_pkt) == 0) {
			/*
			 * We support this major version and possibly an
			 * adjusted minor version.
			 */

			D2(vswp, "%s: accepted ver %d:%d", __func__,
			    ver_pkt->ver_major, ver_pkt->ver_minor);

			/* Store accepted values */
			ldcp->lane_in.ver_major = ver_pkt->ver_major;
			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;

			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;

			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
		} else {
			/*
			 * NACK back with the next lower major/minor
			 * pairing we support (if we don't support any more
			 * versions then they will be set to zero).
			 */

			D2(vswp, "%s: replying with ver %d:%d", __func__,
			    ver_pkt->ver_major, ver_pkt->ver_minor);

			/* Store updated values */
			ldcp->lane_in.ver_major = ver_pkt->ver_major;
			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;

			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
		}

		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
		ver_pkt->tag.vio_sid = ldcp->local_session;
		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
		    sizeof (vio_ver_msg_t), B_TRUE);

		vsw_next_milestone(ldcp);
		break;
	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
			return;

		/* Store updated values */
		ldcp->lane_in.ver_major = ver_pkt->ver_major;
		ldcp->lane_in.ver_minor = ver_pkt->ver_minor;

		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
		vsw_next_milestone(ldcp);

		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
			return;

		/*
		 * If our peer sent us a NACK with the ver fields set to
		 * zero then there is nothing more we can do. Otherwise see
		 * if we support either the version suggested, or a lesser
		 * one.
		 */
		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
			DERR(vswp, "%s: peer unable to negotiate any "
			    "further.", __func__);
			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
			vsw_next_milestone(ldcp);
			return;
		}

		/*
		 * Check to see if we support this major version or
		 * a lower one. If we don't then maj/min will be set
		 * to zero.
		 */
		(void) vsw_supported_version(ver_pkt);
		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
			/* Nothing more we can do */
			DERR(vswp, "%s: version negotiation failed.\n",
			    __func__);
			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
			vsw_next_milestone(ldcp);
		} else {
			/* found a supported major version */
			ldcp->lane_out.ver_major = ver_pkt->ver_major;
			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;

			D2(vswp, "%s: resending with updated values (%x, %x)",
			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);

			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
			ver_pkt->tag.vio_sid = ldcp->local_session;
			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;

			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);

			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
			    sizeof (vio_ver_msg_t), B_TRUE);

			vsw_next_milestone(ldcp);
		}
		break;

	default:
		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
		    ver_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
}
/*
 * Process an attribute packet. We can end up here either because our peer
 * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or because
 * our peer has sent us an attribute INFO message.
 *
 * If it is an ACK we then move to the next stage of the handshake which
 * is to send our descriptor ring info to our peer. If it is a NACK then
 * there is nothing more we can (currently) do.
 *
 * If we get a valid/acceptable INFO packet (and we have already negotiated
 * a version) we ACK back and set the channel state to ATTR_RECV, otherwise
 * we NACK back and reset the channel state to INACTIV.
 *
 * FUTURE: in time we will probably negotiate over attributes, but for
 * the moment unacceptable attributes are regarded as a fatal error.
 *
 */
void
vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vnet_attr_msg_t	*attr_pkt;
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_port_t	*port = ldcp->ldc_port;
	uint64_t	macaddr = 0;
	int		i;

	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a ctrl/attr packet so
	 * cast it into the correct structure.
	 */
	attr_pkt = (vnet_attr_msg_t *)pkt;

	switch (attr_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
			return;

		/*
		 * If the attributes are unacceptable then we NACK back.
		 */
		if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) {

			DERR(vswp, "%s (chan %d): invalid attributes",
			    __func__, ldcp->ldc_id);

			vsw_free_lane_resources(ldcp, INBOUND);

			attr_pkt->tag.vio_sid = ldcp->local_session;
			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
			    sizeof (vnet_attr_msg_t), B_TRUE);

			vsw_next_milestone(ldcp);
			return;
		}

		/*
		 * Otherwise store the attributes for this lane and update
		 * the lane state.
		 */
		ldcp->lane_in.mtu = attr_pkt->mtu;
		ldcp->lane_in.addr = attr_pkt->addr;
		ldcp->lane_in.addr_type = attr_pkt->addr_type;
		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;

		macaddr = ldcp->lane_in.addr;
		for (i = ETHERADDRL - 1; i >= 0; i--) {
			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
			macaddr >>= 8;
		}

		/* create the fdb entry for this port/mac address */
		(void) vsw_add_fdb(vswp, port);

		/* setup device specific xmit routines */
		mutex_enter(&port->tx_lock);
		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
			port->transmit = vsw_dringsend;
		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
			vsw_create_privring(ldcp);
			port->transmit = vsw_descrsend;
		}
		mutex_exit(&port->tx_lock);

		attr_pkt->tag.vio_sid = ldcp->local_session;
		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;

		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);

		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;

		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
		    sizeof (vnet_attr_msg_t), B_TRUE);

		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
			return;

		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
			return;

		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	default:
		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
		    attr_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}
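/*
 * The unpacking loop above converts the 64-bit 'addr' field from the
 * attribute message into a 6 byte ethernet address, least significant
 * byte last. For example (illustrative value only), an addr of
 * 0x0003ba112233 yields 00:03:ba:11:22:33 in
 * port->p_macaddr.ether_addr_octet[0..5].
 */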
/*
 * Process a dring info packet. We can end up here either because our peer
 * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or because
 * our peer has sent us a dring INFO message.
 *
 * If we get a valid/acceptable INFO packet (and we have already negotiated
 * a version) we ACK back and update the lane state, otherwise we NACK back.
 *
 * FUTURE: nothing stops the client from sending us info on multiple drings,
 * but for the moment we will just use the first one we are given.
 *
 */
void
vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vio_dring_reg_msg_t	*dring_pkt;
	vsw_t			*vswp = ldcp->ldc_vswp;
	ldc_mem_info_t		minfo;
	dring_info_t		*dp, *dbp;
	int			dring_found = 0;

	/*
	 * We know this is a ctrl/dring packet so
	 * cast it into the correct structure.
	 */
	dring_pkt = (vio_dring_reg_msg_t *)pkt;

	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

	switch (dring_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
			return;

		/*
		 * If the dring params are unacceptable then we NACK back.
		 */
		if (vsw_check_dring_info(dring_pkt)) {

			DERR(vswp, "%s (%lld): invalid dring info",
			    __func__, ldcp->ldc_id);

			vsw_free_lane_resources(ldcp, INBOUND);

			dring_pkt->tag.vio_sid = ldcp->local_session;
			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);

			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;

			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
			    sizeof (vio_dring_reg_msg_t), B_TRUE);

			vsw_next_milestone(ldcp);
			return;
		}

		/*
		 * Otherwise, attempt to map in the dring using the
		 * cookie. If that succeeds we send back a unique dring
		 * identifier that the sending side will use in future
		 * to refer to this descriptor ring.
		 */
		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);

		dp->num_descriptors = dring_pkt->num_descriptors;
		dp->descriptor_size = dring_pkt->descriptor_size;
		dp->options = dring_pkt->options;
		dp->ncookies = dring_pkt->ncookies;

		/*
		 * Note: we should only get one cookie. This is enforced in
		 * the ldc layer.
		 */
		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
		    sizeof (ldc_mem_cookie_t));

		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
		    dp->num_descriptors, dp->descriptor_size);
		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
		    dp->options, dp->ncookies);

		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
		    LDC_SHADOW_MAP, &(dp->handle))) != 0) {

			DERR(vswp, "%s: dring_map failed\n", __func__);

			kmem_free(dp, sizeof (dring_info_t));
			vsw_free_lane_resources(ldcp, INBOUND);

			dring_pkt->tag.vio_sid = ldcp->local_session;
			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);

			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
			    sizeof (vio_dring_reg_msg_t), B_TRUE);

			vsw_next_milestone(ldcp);
			return;
		}

		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {

			DERR(vswp, "%s: dring_info failed\n", __func__);

			kmem_free(dp, sizeof (dring_info_t));
			vsw_free_lane_resources(ldcp, INBOUND);

			dring_pkt->tag.vio_sid = ldcp->local_session;
			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);

			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
			    sizeof (vio_dring_reg_msg_t), B_TRUE);

			vsw_next_milestone(ldcp);
			return;
		} else {
			/* store the address of the pub part of the ring */
			dp->pub_addr = minfo.vaddr;
		}

		/* no private section as we are importing */
		dp->priv_addr = NULL;

		/*
		 * Using a simple monotonically increasing int for the
		 * ident at the moment.
		 */
		dp->ident = ldcp->next_ident;
		ldcp->next_ident++;

		dp->end_idx = 0;
		dp->next = NULL;

		/*
		 * Link it onto the end of the list of drings
		 * for this lane.
		 */
		if (ldcp->lane_in.dringp == NULL) {
			D2(vswp, "%s: adding first INBOUND dring", __func__);
			ldcp->lane_in.dringp = dp;
		} else {
			dbp = ldcp->lane_in.dringp;

			while (dbp->next != NULL)
				dbp = dbp->next;

			dbp->next = dp;
		}

		/* acknowledge it */
		dring_pkt->tag.vio_sid = ldcp->local_session;
		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
		dring_pkt->dring_ident = dp->ident;

		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
		    sizeof (vio_dring_reg_msg_t), B_TRUE);

		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
		vsw_next_milestone(ldcp);
		break;
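	/*
	 * To summarise the INFO path above: the peer's exported ring is
	 * mapped in via ldc_mem_dring_map() using the (single) cookie
	 * from the message, the address of its public section is found
	 * via ldc_mem_dring_info(), and the ring is assigned a local
	 * ident which the ACK hands back to the peer for use in
	 * subsequent dring DATA messages.
	 */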
	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
			return;

		/*
		 * Peer is acknowledging our dring info and will have
		 * sent us a dring identifier which we will use to
		 * refer to this ring w.r.t. our peer.
		 */
		dp = ldcp->lane_out.dringp;
		if (dp != NULL) {
			/*
			 * Find the ring this ident should be associated
			 * with.
			 */
			while (dp != NULL) {
				if (vsw_dring_match(dp, dring_pkt)) {
					dring_found = 1;
					break;
				}
				dp = dp->next;
			}

			if (dring_found == 0) {
				DERR(NULL, "%s: unrecognised ring cookie",
				    __func__);
				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
				return;
			}

		} else {
			DERR(vswp, "%s: DRING ACK received but no drings "
			    "allocated", __func__);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return;
		}

		/* store the ident */
		dp->ident = dring_pkt->dring_ident;
		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
			return;

		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	default:
		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
		    dring_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}

/*
 * Process a request from the peer to unregister a dring.
 *
 * For the moment we just restart the handshake if our
 * peer endpoint attempts to unregister a dring.
 */
void
vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vsw_t			*vswp = ldcp->ldc_vswp;
	vio_dring_unreg_msg_t	*dring_pkt;

	/*
	 * We know this is a ctrl/dring packet so
	 * cast it into the correct structure.
	 */
	dring_pkt = (vio_dring_unreg_msg_t *)pkt;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	switch (dring_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		DWARN(vswp, "%s: restarting handshake..", __func__);
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		DWARN(vswp, "%s: restarting handshake..", __func__);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		DWARN(vswp, "%s: restarting handshake..", __func__);
		break;

	default:
		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
		    dring_pkt->tag.vio_subtype);
	}

	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		pkt->tag.vio_sid = ldcp->local_session; \
		(void) vsw_send_msg(ldcp, (void *)pkt, \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	} while (0)
/*
 * Process a multicast request from a vnet.
 *
 * Vnets specify a multicast address that they are interested in. This
 * address is used as a key into the hash table which forms the multicast
 * forwarding database (mFDB).
 *
 * The table keys are the multicast addresses, while the table entries
 * are pointers to lists of ports which wish to receive packets for the
 * specified multicast address.
 *
 * When a multicast packet is being switched we use the address as a key
 * into the hash table, and then walk the appropriate port list forwarding
 * the pkt to each port in turn.
 *
 * If a vnet is no longer interested in a particular multicast grouping
 * we simply find the correct location in the hash table and then delete
 * the relevant port from the port list.
 *
 * To deal with the case whereby a port is being deleted without first
 * removing itself from the lists in the hash table, we maintain a list
 * of multicast addresses the port has registered an interest in, within
 * the port structure itself. We then simply walk that list of addresses
 * using them as keys into the hash table and remove the port from the
 * appropriate lists.
 */
static void
vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vnet_mcast_msg_t	*mcst_pkt;
	vsw_port_t		*port = ldcp->ldc_port;
	vsw_t			*vswp = ldcp->ldc_vswp;
	int			i;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a ctrl/mcast packet so
	 * cast it into the correct structure.
	 */
	mcst_pkt = (vnet_mcast_msg_t *)pkt;

	switch (mcst_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		/*
		 * Check if we are in the correct state to receive a
		 * multicast message (i.e. handshake complete). If not,
		 * reset the handshake.
		 */
		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
			return;

		/*
		 * Before attempting to add or remove the addresses, check
		 * that they are valid multicast addresses.
		 * If not, then NACK back.
		 */
		for (i = 0; i < mcst_pkt->count; i++) {
			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
				DERR(vswp, "%s: invalid multicast address",
				    __func__);
				SND_MCST_NACK(ldcp, mcst_pkt);
				return;
			}
		}

		/*
		 * Now add/remove the addresses. If this fails we
		 * NACK back.
		 */
		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
			SND_MCST_NACK(ldcp, mcst_pkt);
			return;
		}

		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
		mcst_pkt->tag.vio_sid = ldcp->local_session;

		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);

		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
		    sizeof (vnet_mcast_msg_t), B_TRUE);
		break;

	case VIO_SUBTYPE_ACK:
		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		/*
		 * We shouldn't ever get a multicast ACK message as
		 * at the moment we never request multicast addresses
		 * to be set on some other device. This may change in
		 * the future if we have cascading switches.
		 */
		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
			return;

		/* Do nothing */
		break;

	case VIO_SUBTYPE_NACK:
		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		/*
		 * We shouldn't get a multicast NACK packet for the
		 * same reasons as we shouldn't get an ACK packet.
		 */
		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
			return;

		/* Do nothing */
		break;

	default:
		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
		    mcst_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
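/*
 * For example, when a vnet sends a VIO_SUBTYPE_INFO message adding the
 * (illustrative) address 01:00:5e:00:00:01, vsw_add_rem_mcst() both
 * enters the port on the mFDB list hashed under that address and
 * records the address against the port itself, so that a later port
 * deletion can walk the port's own list and remove it from every mFDB
 * entry on which it appears.
 */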
static void
vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vio_rdx_msg_t	*rdx_pkt;
	vsw_t		*vswp = ldcp->ldc_vswp;

	/*
	 * We know this is a ctrl/rdx packet so
	 * cast it into the correct structure.
	 */
	rdx_pkt = (vio_rdx_msg_t *)pkt;

	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

	switch (rdx_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
			return;

		rdx_pkt->tag.vio_sid = ldcp->local_session;
		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;

		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);

		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;

		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
		    sizeof (vio_rdx_msg_t), B_TRUE);

		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_ACK:
		/*
		 * Should be handled in-band by callback handler.
		 */
		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
			return;

		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	default:
		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
		    rdx_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
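/*
 * Data packets are dispatched on their vio_subtype_env: VIO_DRING_DATA
 * (descriptor ring mode), VIO_PKT_DATA (raw mode) and VIO_DESC_DATA
 * (in-band descriptors). vsw_process_data_pkt() below additionally
 * enforces that no data is accepted before the handshake has reached
 * VSW_MILESTONE4.
 */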
*/ 6199 if (env == VIO_DRING_DATA) { 6200 vsw_process_data_dring_pkt(ldcp, dpkt); 6201 } else if (env == VIO_PKT_DATA) { 6202 vsw_process_data_raw_pkt(ldcp, dpkt); 6203 } else if (env == VIO_DESC_DATA) { 6204 vsw_process_data_ibnd_pkt(ldcp, dpkt); 6205 } else { 6206 DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env); 6207 } 6208 6209 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 6210 } 6211 6212 #define SND_DRING_NACK(ldcp, pkt) do { \ 6213 pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ 6214 pkt->tag.vio_sid = ldcp->local_session; \ 6215 (void) vsw_send_msg(ldcp, (void *)pkt, \ 6216 sizeof (vio_dring_msg_t), B_TRUE); \ } while (0) 6217 6218 static void 6219 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt) 6220 { 6221 vio_dring_msg_t *dring_pkt; 6222 vnet_public_desc_t *pub_addr = NULL; 6223 vsw_private_desc_t *priv_addr = NULL; 6224 dring_info_t *dp = NULL; 6225 vsw_t *vswp = ldcp->ldc_vswp; 6226 mblk_t *mp = NULL; 6227 mblk_t *bp = NULL; 6228 mblk_t *bpt = NULL; 6229 size_t nbytes = 0; 6230 size_t off = 0; 6231 uint64_t ncookies = 0; 6232 uint64_t chain = 0; 6233 uint64_t j, len; 6234 uint32_t pos, start, datalen; 6235 uint32_t range_start, range_end; 6236 int32_t end, num, cnt = 0; 6237 int i, rv, msg_rv = 0; 6238 boolean_t ack_needed = B_FALSE; 6239 boolean_t prev_desc_ack = B_FALSE; 6240 int read_attempts = 0; 6241 6242 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 6243 6244 /* 6245 * We know this is a data/dring packet so 6246 * cast it into the correct structure. 6247 */ 6248 dring_pkt = (vio_dring_msg_t *)dpkt; 6249 6250 /* 6251 * Switch on the vio_subtype. If it is INFO then we need to 6252 * process the data. If it is an ACK we need to make sure 6253 * it makes sense (i.e. did we send an earlier data/info), 6254 * and if it is a NACK then we may attempt a retry. 6255 */ 6256 switch (dring_pkt->tag.vio_subtype) { 6257 case VIO_SUBTYPE_INFO: 6258 D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id); 6259 6260 READ_ENTER(&ldcp->lane_in.dlistrw); 6261 if ((dp = vsw_ident2dring(&ldcp->lane_in, 6262 dring_pkt->dring_ident)) == NULL) { 6263 RW_EXIT(&ldcp->lane_in.dlistrw); 6264 6265 DERR(vswp, "%s(%lld): unable to find dring from " 6266 "ident 0x%llx", __func__, ldcp->ldc_id, 6267 dring_pkt->dring_ident); 6268 6269 SND_DRING_NACK(ldcp, dring_pkt); 6270 return; 6271 } 6272 6273 start = pos = dring_pkt->start_idx; 6274 end = dring_pkt->end_idx; 6275 len = dp->num_descriptors; 6276 6277 range_start = range_end = pos; 6278 6279 D2(vswp, "%s(%lld): start index %ld : end %ld\n", 6280 __func__, ldcp->ldc_id, start, end); 6281 6282 if (end == -1) { 6283 num = -1; 6284 } else if (end >= 0) { 6285 num = end >= pos ?
end - pos + 1: (len - pos + 1) + end; 6286 6287 /* basic sanity check */ 6288 if (end > len) { 6289 RW_EXIT(&ldcp->lane_in.dlistrw); 6290 DERR(vswp, "%s(%lld): endpoint %lld outside " 6291 "ring length %lld", __func__, 6292 ldcp->ldc_id, end, len); 6293 6294 SND_DRING_NACK(ldcp, dring_pkt); 6295 return; 6296 } 6297 } else { 6298 RW_EXIT(&ldcp->lane_in.dlistrw); 6299 DERR(vswp, "%s(%lld): invalid endpoint %lld", 6300 __func__, ldcp->ldc_id, end); 6301 SND_DRING_NACK(ldcp, dring_pkt); 6302 return; 6303 } 6304 6305 while (cnt != num) { 6306 vsw_recheck_desc: 6307 if ((rv = ldc_mem_dring_acquire(dp->handle, 6308 pos, pos)) != 0) { 6309 RW_EXIT(&ldcp->lane_in.dlistrw); 6310 DERR(vswp, "%s(%lld): unable to acquire " 6311 "descriptor at pos %d: err %d", 6312 __func__, ldcp->ldc_id, pos, rv); 6313 SND_DRING_NACK(ldcp, dring_pkt); 6314 return; 6315 } 6316 6317 pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos; 6318 6319 /* 6320 * When given a bounded range of descriptors 6321 * to process, it is an error to hit a descriptor 6322 * which is not ready. In the non-bounded case 6323 * (end_idx == -1) this simply indicates we have 6324 * reached the end of the current active range. 6325 */ 6326 if (pub_addr->hdr.dstate != VIO_DESC_READY) { 6327 /* unbound - no error */ 6328 if (end == -1) { 6329 if (read_attempts == vsw_read_attempts) 6330 break; 6331 6332 delay(drv_usectohz(vsw_desc_delay)); 6333 read_attempts++; 6334 goto vsw_recheck_desc; 6335 } 6336 6337 /* bounded - error - so NACK back */ 6338 RW_EXIT(&ldcp->lane_in.dlistrw); 6339 DERR(vswp, "%s(%lld): descriptor not READY " 6340 "(%d)", __func__, ldcp->ldc_id, 6341 pub_addr->hdr.dstate); 6342 SND_DRING_NACK(ldcp, dring_pkt); 6343 return; 6344 } 6345 6346 DTRACE_PROBE1(read_attempts, int, read_attempts); 6347 6348 range_end = pos; 6349 6350 /* 6351 * If we ACK'd the previous descriptor then now 6352 * record the new range start position for later 6353 * ACKs. 6354 */ 6355 if (prev_desc_ack) { 6356 range_start = pos; 6357 6358 D2(vswp, "%s(%lld): updating range start to be " 6359 "%d", __func__, ldcp->ldc_id, range_start); 6360 6361 prev_desc_ack = B_FALSE; 6362 } 6363 6364 /* 6365 * Data is padded to align on an 8 byte boundary, 6366 * datalen is the actual data length, i.e. minus that 6367 * padding. 6368 */ 6369 datalen = pub_addr->nbytes; 6370 6371 /* 6372 * Does peer wish us to ACK when we have finished 6373 * with this descriptor ? 6374 */ 6375 if (pub_addr->hdr.ack) 6376 ack_needed = B_TRUE; 6377 6378 D2(vswp, "%s(%lld): processing desc %lld at pos" 6379 " 0x%llx : dstate 0x%lx : datalen 0x%lx", 6380 __func__, ldcp->ldc_id, pos, pub_addr, 6381 pub_addr->hdr.dstate, datalen); 6382 6383 /* 6384 * Mark that we are starting to process descriptor. 6385 */ 6386 pub_addr->hdr.dstate = VIO_DESC_ACCEPTED; 6387 6388 mp = vio_allocb(ldcp->rxh); 6389 if (mp == NULL) { 6390 /* 6391 * No free receive buffers available, so 6392 * fall back on allocb(9F). Make sure that 6393 * we get a data buffer which is a multiple 6394 * of 8 as this is required by ldc_mem_copy. 6395 */ 6396 DTRACE_PROBE(allocb); 6397 if ((mp = allocb(datalen + VNET_IPALIGN + 8, 6398 BPRI_MED)) == NULL) { 6399 DERR(vswp, "%s(%ld): allocb failed", 6400 __func__, ldcp->ldc_id); 6401 pub_addr->hdr.dstate = VIO_DESC_DONE; 6402 (void) ldc_mem_dring_release(dp->handle, 6403 pos, pos); 6404 break; 6405 } 6406 } 6407 6408 /* 6409 * Ensure that we ask ldc for an aligned 6410 * number of bytes.
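* For example (illustrative values): if datalen + VNET_IPALIGN came to 61, then 61 & 0x7 == 5, so off = 8 - 5 = 3 and nbytes is rounded up to 64, a multiple of 8 as ldc_mem_copy requires.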
6411 */ 6412 nbytes = datalen + VNET_IPALIGN; 6413 if (nbytes & 0x7) { 6414 off = 8 - (nbytes & 0x7); 6415 nbytes += off; 6416 } 6417 6418 ncookies = pub_addr->ncookies; 6419 rv = ldc_mem_copy(ldcp->ldc_handle, 6420 (caddr_t)mp->b_rptr, 0, &nbytes, 6421 pub_addr->memcookie, ncookies, LDC_COPY_IN); 6422 6423 if (rv != 0) { 6424 DERR(vswp, "%s(%d): unable to copy in data " 6425 "from %d cookies in desc %d (rv %d)", 6426 __func__, ldcp->ldc_id, ncookies, pos, rv); 6427 freemsg(mp); 6428 6429 pub_addr->hdr.dstate = VIO_DESC_DONE; 6430 (void) ldc_mem_dring_release(dp->handle, 6431 pos, pos); 6432 break; 6433 } else { 6434 D2(vswp, "%s(%d): copied in %ld bytes" 6435 " using %d cookies", __func__, 6436 ldcp->ldc_id, nbytes, ncookies); 6437 } 6438 6439 /* adjust the read pointer to skip over the padding */ 6440 mp->b_rptr += VNET_IPALIGN; 6441 6442 /* point to the actual end of data */ 6443 mp->b_wptr = mp->b_rptr + datalen; 6444 6445 /* build a chain of received packets */ 6446 if (bp == NULL) { 6447 /* first pkt */ 6448 bp = mp; 6449 bp->b_next = bp->b_prev = NULL; 6450 bpt = bp; 6451 chain = 1; 6452 } else { 6453 mp->b_next = NULL; 6454 mp->b_prev = bpt; 6455 bpt->b_next = mp; 6456 bpt = mp; 6457 chain++; 6458 } 6459 6460 /* mark we are finished with this descriptor */ 6461 pub_addr->hdr.dstate = VIO_DESC_DONE; 6462 6463 (void) ldc_mem_dring_release(dp->handle, pos, pos); 6464 6465 /* 6466 * Send an ACK back to peer if requested. 6467 */ 6468 if (ack_needed) { 6469 ack_needed = B_FALSE; 6470 6471 dring_pkt->start_idx = range_start; 6472 dring_pkt->end_idx = range_end; 6473 6474 DERR(vswp, "%s(%lld): processed %d %d, ACK" 6475 " requested", __func__, ldcp->ldc_id, 6476 dring_pkt->start_idx, dring_pkt->end_idx); 6477 6478 dring_pkt->dring_process_state = VIO_DP_ACTIVE; 6479 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 6480 dring_pkt->tag.vio_sid = ldcp->local_session; 6481 6482 msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt, 6483 sizeof (vio_dring_msg_t), B_FALSE); 6484 6485 /* 6486 * Check if ACK was successfully sent. If not 6487 * we break and deal with that below. 6488 */ 6489 if (msg_rv != 0) 6490 break; 6491 6492 prev_desc_ack = B_TRUE; 6493 range_start = pos; 6494 } 6495 6496 /* next descriptor */ 6497 pos = (pos + 1) % len; 6498 cnt++; 6499 6500 /* 6501 * Break out of loop here and stop processing to 6502 * allow some other network device (or disk) to 6503 * get access to the cpu. 6504 */ 6505 if (chain > vsw_chain_len) { 6506 D3(vswp, "%s(%lld): switching chain of %d " 6507 "msgs", __func__, ldcp->ldc_id, chain); 6508 break; 6509 } 6510 } 6511 RW_EXIT(&ldcp->lane_in.dlistrw); 6512 6513 /* 6514 * If when we attempted to send the ACK we found that the 6515 * channel had been reset then now handle this. We deal with 6516 * it here as we cannot reset the channel while holding the 6517 * dlistrw lock, and we don't want to acquire/release it 6518 * continuously in the above loop, as a channel reset should 6519 * be a rare event. 
6520 */ 6521 if (msg_rv == ECONNRESET) { 6522 vsw_process_conn_evt(ldcp, VSW_CONN_RESET); 6523 break; 6524 } 6525 6526 /* send the chain of packets to be switched */ 6527 if (bp != NULL) { 6528 D3(vswp, "%s(%lld): switching chain of %d msgs", 6529 __func__, ldcp->ldc_id, chain); 6530 vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT, 6531 ldcp->ldc_port, NULL); 6532 } 6533 6534 DTRACE_PROBE1(msg_cnt, int, cnt); 6535 6536 /* 6537 * We are now finished so ACK back with the state 6538 * set to STOPPING so our peer knows we are finished 6539 */ 6540 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 6541 dring_pkt->tag.vio_sid = ldcp->local_session; 6542 6543 dring_pkt->dring_process_state = VIO_DP_STOPPED; 6544 6545 DTRACE_PROBE(stop_process_sent); 6546 6547 /* 6548 * We have not processed any more descriptors beyond 6549 * the last one we ACK'd. 6550 */ 6551 if (prev_desc_ack) 6552 range_start = range_end; 6553 6554 dring_pkt->start_idx = range_start; 6555 dring_pkt->end_idx = range_end; 6556 6557 D2(vswp, "%s(%lld) processed : %d : %d, now stopping", 6558 __func__, ldcp->ldc_id, dring_pkt->start_idx, 6559 dring_pkt->end_idx); 6560 6561 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 6562 sizeof (vio_dring_msg_t), B_TRUE); 6563 break; 6564 6565 case VIO_SUBTYPE_ACK: 6566 D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id); 6567 /* 6568 * Verify that the relevant descriptors are all 6569 * marked as DONE 6570 */ 6571 READ_ENTER(&ldcp->lane_out.dlistrw); 6572 if ((dp = vsw_ident2dring(&ldcp->lane_out, 6573 dring_pkt->dring_ident)) == NULL) { 6574 RW_EXIT(&ldcp->lane_out.dlistrw); 6575 DERR(vswp, "%s: unknown ident in ACK", __func__); 6576 return; 6577 } 6578 6579 pub_addr = (vnet_public_desc_t *)dp->pub_addr; 6580 priv_addr = (vsw_private_desc_t *)dp->priv_addr; 6581 6582 start = end = 0; 6583 start = dring_pkt->start_idx; 6584 end = dring_pkt->end_idx; 6585 len = dp->num_descriptors; 6586 6587 j = num = 0; 6588 /* calculate # descriptors taking into a/c wrap around */ 6589 num = end >= start ? end - start + 1: (len - start + 1) + end; 6590 6591 D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n", 6592 __func__, ldcp->ldc_id, start, end, num); 6593 6594 mutex_enter(&dp->dlock); 6595 dp->last_ack_recv = end; 6596 mutex_exit(&dp->dlock); 6597 6598 for (i = start; j < num; i = (i + 1) % len, j++) { 6599 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 6600 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 6601 6602 /* 6603 * If the last descriptor in a range has the ACK 6604 * bit set then we will get two messages from our 6605 * peer relating to it. The normal ACK msg and then 6606 * a subsequent STOP msg. The first message will have 6607 * resulted in the descriptor being reclaimed and 6608 * its state set to FREE so when we encounter a non 6609 * DONE descriptor we need to check to see if its 6610 * because we have just reclaimed it. 
6611 */ 6612 mutex_enter(&priv_addr->dstate_lock); 6613 if (pub_addr->hdr.dstate == VIO_DESC_DONE) { 6614 /* clear all the fields */ 6615 bzero(priv_addr->datap, priv_addr->datalen); 6616 priv_addr->datalen = 0; 6617 6618 pub_addr->hdr.dstate = VIO_DESC_FREE; 6619 pub_addr->hdr.ack = 0; 6620 6621 priv_addr->dstate = VIO_DESC_FREE; 6622 mutex_exit(&priv_addr->dstate_lock); 6623 6624 D3(vswp, "clearing descp %d : pub state " 6625 "0x%llx : priv state 0x%llx", i, 6626 pub_addr->hdr.dstate, priv_addr->dstate); 6627 6628 } else { 6629 mutex_exit(&priv_addr->dstate_lock); 6630 6631 if (dring_pkt->dring_process_state != 6632 VIO_DP_STOPPED) { 6633 DERR(vswp, "%s: descriptor %lld at pos " 6634 " 0x%llx not DONE (0x%lx)\n", 6635 __func__, i, pub_addr, 6636 pub_addr->hdr.dstate); 6637 RW_EXIT(&ldcp->lane_out.dlistrw); 6638 return; 6639 } 6640 } 6641 } 6642 6643 /* 6644 * If our peer is stopping processing descriptors then 6645 * we check to make sure it has processed all the descriptors 6646 * we have updated. If not then we send it a new message 6647 * to prompt it to restart. 6648 */ 6649 if (dring_pkt->dring_process_state == VIO_DP_STOPPED) { 6650 DTRACE_PROBE(stop_process_recv); 6651 D2(vswp, "%s(%lld): got stopping msg : %d : %d", 6652 __func__, ldcp->ldc_id, dring_pkt->start_idx, 6653 dring_pkt->end_idx); 6654 6655 /* 6656 * Check next descriptor in public section of ring. 6657 * If its marked as READY then we need to prompt our 6658 * peer to start processing the ring again. 6659 */ 6660 i = (end + 1) % len; 6661 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 6662 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 6663 6664 /* 6665 * Hold the restart lock across all of this to 6666 * make sure that its not possible for us to 6667 * decide that a msg needs to be sent in the future 6668 * but the sending code having already checked is 6669 * about to exit. 6670 */ 6671 mutex_enter(&dp->restart_lock); 6672 mutex_enter(&priv_addr->dstate_lock); 6673 if (pub_addr->hdr.dstate == VIO_DESC_READY) { 6674 6675 mutex_exit(&priv_addr->dstate_lock); 6676 6677 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; 6678 dring_pkt->tag.vio_sid = ldcp->local_session; 6679 6680 mutex_enter(&ldcp->lane_out.seq_lock); 6681 dring_pkt->seq_num = ldcp->lane_out.seq_num++; 6682 mutex_exit(&ldcp->lane_out.seq_lock); 6683 6684 dring_pkt->start_idx = (end + 1) % len; 6685 dring_pkt->end_idx = -1; 6686 6687 D2(vswp, "%s(%lld) : sending restart msg:" 6688 " %d : %d", __func__, ldcp->ldc_id, 6689 dring_pkt->start_idx, dring_pkt->end_idx); 6690 6691 msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt, 6692 sizeof (vio_dring_msg_t), B_FALSE); 6693 6694 } else { 6695 mutex_exit(&priv_addr->dstate_lock); 6696 dp->restart_reqd = B_TRUE; 6697 } 6698 mutex_exit(&dp->restart_lock); 6699 } 6700 RW_EXIT(&ldcp->lane_out.dlistrw); 6701 6702 /* only do channel reset after dropping dlistrw lock */ 6703 if (msg_rv == ECONNRESET) 6704 vsw_process_conn_evt(ldcp, VSW_CONN_RESET); 6705 6706 break; 6707 6708 case VIO_SUBTYPE_NACK: 6709 DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK", 6710 __func__, ldcp->ldc_id); 6711 /* 6712 * Something is badly wrong if we are getting NACK's 6713 * for our data pkts. So reset the channel. 
6714 */ 6715 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 6716 6717 break; 6718 6719 default: 6720 DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__, 6721 ldcp->ldc_id, dring_pkt->tag.vio_subtype); 6722 } 6723 6724 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 6725 } 6726 6727 /* 6728 * VIO_PKT_DATA (a.k.a raw data mode ) 6729 * 6730 * Note - currently not supported. Do nothing. 6731 */ 6732 static void 6733 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt) 6734 { 6735 _NOTE(ARGUNUSED(dpkt)) 6736 6737 D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id); 6738 DERR(NULL, "%s (%lld): currently unsupported", __func__, ldcp->ldc_id); 6739 D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id); 6740 } 6741 6742 /* 6743 * Process an in-band descriptor message (most likely from 6744 * OBP). 6745 */ 6746 static void 6747 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt) 6748 { 6749 vnet_ibnd_desc_t *ibnd_desc; 6750 dring_info_t *dp = NULL; 6751 vsw_private_desc_t *priv_addr = NULL; 6752 vsw_t *vswp = ldcp->ldc_vswp; 6753 mblk_t *mp = NULL; 6754 size_t nbytes = 0; 6755 size_t off = 0; 6756 uint64_t idx = 0; 6757 uint32_t num = 1, len, datalen = 0; 6758 uint64_t ncookies = 0; 6759 int i, rv; 6760 int j = 0; 6761 6762 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 6763 6764 ibnd_desc = (vnet_ibnd_desc_t *)pkt; 6765 6766 switch (ibnd_desc->hdr.tag.vio_subtype) { 6767 case VIO_SUBTYPE_INFO: 6768 D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 6769 6770 if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV)) 6771 return; 6772 6773 /* 6774 * Data is padded to align on a 8 byte boundary, 6775 * nbytes is actual data length, i.e. minus that 6776 * padding. 6777 */ 6778 datalen = ibnd_desc->nbytes; 6779 6780 D2(vswp, "%s(%lld): processing inband desc : " 6781 ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen); 6782 6783 ncookies = ibnd_desc->ncookies; 6784 6785 /* 6786 * allocb(9F) returns an aligned data block. We 6787 * need to ensure that we ask ldc for an aligned 6788 * number of bytes also. 
6789 */ 6790 nbytes = datalen; 6791 if (nbytes & 0x7) { 6792 off = 8 - (nbytes & 0x7); 6793 nbytes += off; 6794 } 6795 6796 mp = allocb(datalen, BPRI_MED); 6797 if (mp == NULL) { 6798 DERR(vswp, "%s(%lld): allocb failed", 6799 __func__, ldcp->ldc_id); 6800 return; 6801 } 6802 6803 rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr, 6804 0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies, 6805 LDC_COPY_IN); 6806 6807 if (rv != 0) { 6808 DERR(vswp, "%s(%d): unable to copy in data from " 6809 "%d cookie(s)", __func__, ldcp->ldc_id, ncookies); 6810 freemsg(mp); 6811 return; 6812 } 6813 6814 D2(vswp, "%s(%d): copied in %ld bytes using %d cookies", 6815 __func__, ldcp->ldc_id, nbytes, ncookies); 6816 6817 /* point to the actual end of data */ 6818 mp->b_wptr = mp->b_rptr + datalen; 6819 6820 /* 6821 * We ACK back every in-band descriptor message we process 6822 */ 6823 ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK; 6824 ibnd_desc->hdr.tag.vio_sid = ldcp->local_session; 6825 (void) vsw_send_msg(ldcp, (void *)ibnd_desc, 6826 sizeof (vnet_ibnd_desc_t), B_TRUE); 6827 6828 /* send the packet to be switched */ 6829 vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, 6830 ldcp->ldc_port, NULL); 6831 6832 break; 6833 6834 case VIO_SUBTYPE_ACK: 6835 D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 6836 6837 /* Verify the ACK is valid */ 6838 idx = ibnd_desc->hdr.desc_handle; 6839 6840 if (idx >= VSW_RING_NUM_EL) { 6841 cmn_err(CE_WARN, "!vsw%d: corrupted ACK received " 6842 "(idx %ld)", vswp->instance, idx); 6843 return; 6844 } 6845 6846 if ((dp = ldcp->lane_out.dringp) == NULL) { 6847 DERR(vswp, "%s: no dring found", __func__); 6848 return; 6849 } 6850 6851 len = dp->num_descriptors; 6852 /* 6853 * If the descriptor we are being ACK'ed for is not the 6854 * one we expected, then pkts were lost somwhere, either 6855 * when we tried to send a msg, or a previous ACK msg from 6856 * our peer. In either case we now reclaim the descriptors 6857 * in the range from the last ACK we received up to the 6858 * current ACK. 6859 */ 6860 if (idx != dp->last_ack_recv) { 6861 DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)", 6862 __func__, dp->last_ack_recv, idx); 6863 num = idx >= dp->last_ack_recv ? 6864 idx - dp->last_ack_recv + 1: 6865 (len - dp->last_ack_recv + 1) + idx; 6866 } 6867 6868 /* 6869 * When we sent the in-band message to our peer we 6870 * marked the copy in our private ring as READY. We now 6871 * check that the descriptor we are being ACK'ed for is in 6872 * fact READY, i.e. it is one we have shared with our peer. 6873 * 6874 * If its not we flag an error, but still reset the descr 6875 * back to FREE. 
6876 */ 6877 for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) { 6878 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 6879 mutex_enter(&priv_addr->dstate_lock); 6880 if (priv_addr->dstate != VIO_DESC_READY) { 6881 DERR(vswp, "%s: (%ld) desc at index %ld not " 6882 "READY (0x%lx)", __func__, 6883 ldcp->ldc_id, idx, priv_addr->dstate); 6884 DERR(vswp, "%s: bound %d: ncookies %ld : " 6885 "datalen %ld", __func__, 6886 priv_addr->bound, priv_addr->ncookies, 6887 priv_addr->datalen); 6888 } 6889 D2(vswp, "%s: (%lld) freeing descp at %lld", __func__, 6890 ldcp->ldc_id, idx); 6891 /* release resources associated with sent msg */ 6892 bzero(priv_addr->datap, priv_addr->datalen); 6893 priv_addr->datalen = 0; 6894 priv_addr->dstate = VIO_DESC_FREE; 6895 mutex_exit(&priv_addr->dstate_lock); 6896 } 6897 /* update to next expected value */ 6898 dp->last_ack_recv = (idx + 1) % dp->num_descriptors; 6899 6900 break; 6901 6902 case VIO_SUBTYPE_NACK: 6903 DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 6904 6905 /* 6906 * We should only get a NACK if our peer doesn't like 6907 * something about a message we have sent it. If this 6908 * happens we just release the resources associated with 6909 * the message. (We are relying on higher layers to decide 6910 * whether or not to resend. 6911 */ 6912 6913 /* limit check */ 6914 idx = ibnd_desc->hdr.desc_handle; 6915 6916 if (idx >= VSW_RING_NUM_EL) { 6917 DERR(vswp, "%s: corrupted NACK received (idx %lld)", 6918 __func__, idx); 6919 return; 6920 } 6921 6922 if ((dp = ldcp->lane_out.dringp) == NULL) { 6923 DERR(vswp, "%s: no dring found", __func__); 6924 return; 6925 } 6926 6927 priv_addr = (vsw_private_desc_t *)dp->priv_addr; 6928 6929 /* move to correct location in ring */ 6930 priv_addr += idx; 6931 6932 /* release resources associated with sent msg */ 6933 mutex_enter(&priv_addr->dstate_lock); 6934 bzero(priv_addr->datap, priv_addr->datalen); 6935 priv_addr->datalen = 0; 6936 priv_addr->dstate = VIO_DESC_FREE; 6937 mutex_exit(&priv_addr->dstate_lock); 6938 6939 break; 6940 6941 default: 6942 DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__, 6943 ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype); 6944 } 6945 6946 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 6947 } 6948 6949 static void 6950 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag) 6951 { 6952 _NOTE(ARGUNUSED(epkt)) 6953 6954 vsw_t *vswp = ldcp->ldc_vswp; 6955 uint16_t env = tag.vio_subtype_env; 6956 6957 D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id); 6958 6959 /* 6960 * Error vio_subtypes have yet to be defined. So for 6961 * the moment we can't do anything. 6962 */ 6963 D2(vswp, "%s: (%x) vio_subtype env", __func__, env); 6964 6965 D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id); 6966 } 6967 6968 /* 6969 * Switch the given ethernet frame when operating in layer 2 mode. 6970 * 6971 * vswp: pointer to the vsw instance 6972 * mp: pointer to chain of ethernet frame(s) to be switched 6973 * caller: identifies the source of this frame as: 6974 * 1. VSW_VNETPORT - a vsw port (connected to a vnet). 6975 * 2. VSW_PHYSDEV - the physical ethernet device 6976 * 3. VSW_LOCALDEV - vsw configured as a virtual interface 6977 * arg: argument provided by the caller. 6978 * 1. for VNETPORT - pointer to the corresponding vsw_port_t. 6979 * 2. for PHYSDEV - NULL 6980 * 3. 
for LOCALDEV - pointer to this vsw_t (self) 6981 */ 6982 void 6983 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, 6984 vsw_port_t *arg, mac_resource_handle_t mrh) 6985 { 6986 struct ether_header *ehp; 6987 vsw_port_t *port = NULL; 6988 mblk_t *bp, *ret_m; 6989 mblk_t *nmp = NULL; 6990 vsw_port_list_t *plist = &vswp->plist; 6991 6992 D1(vswp, "%s: enter (caller %d)", __func__, caller); 6993 6994 /* 6995 * PERF: rather than breaking up the chain here, scan it 6996 * to find all mblks heading to the same destination and then 6997 * pass that sub-chain to the lower transmit functions. 6998 */ 6999 7000 /* process the chain of packets */ 7001 bp = mp; 7002 while (bp) { 7003 mp = bp; 7004 bp = bp->b_next; 7005 mp->b_next = mp->b_prev = NULL; 7006 ehp = (struct ether_header *)mp->b_rptr; 7007 7008 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 7009 __func__, MBLKSIZE(mp), MBLKL(mp)); 7010 7011 READ_ENTER(&vswp->if_lockrw); 7012 if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) { 7013 /* 7014 * If destination is VSW_LOCALDEV (vsw as an eth 7015 * interface) and if the device is up & running, 7016 * send the packet up the stack on this host. 7017 * If the virtual interface is down, drop the packet. 7018 */ 7019 if (caller != VSW_LOCALDEV) { 7020 if (vswp->if_state & VSW_IF_UP) { 7021 RW_EXIT(&vswp->if_lockrw); 7022 mac_rx(vswp->if_mh, mrh, mp); 7023 } else { 7024 RW_EXIT(&vswp->if_lockrw); 7025 /* Interface down, drop pkt */ 7026 freemsg(mp); 7027 } 7028 } else { 7029 RW_EXIT(&vswp->if_lockrw); 7030 freemsg(mp); 7031 } 7032 continue; 7033 } 7034 RW_EXIT(&vswp->if_lockrw); 7035 7036 READ_ENTER(&plist->lockrw); 7037 port = vsw_lookup_fdb(vswp, ehp); 7038 if (port) { 7039 /* 7040 * Mark the port as in-use. 7041 */ 7042 mutex_enter(&port->ref_lock); 7043 port->ref_cnt++; 7044 mutex_exit(&port->ref_lock); 7045 RW_EXIT(&plist->lockrw); 7046 7047 /* 7048 * If plumbed and in promisc mode then copy msg 7049 * and send up the stack. 7050 */ 7051 READ_ENTER(&vswp->if_lockrw); 7052 if (VSW_U_P(vswp->if_state)) { 7053 RW_EXIT(&vswp->if_lockrw); 7054 nmp = copymsg(mp); 7055 if (nmp) 7056 mac_rx(vswp->if_mh, mrh, nmp); 7057 } else { 7058 RW_EXIT(&vswp->if_lockrw); 7059 } 7060 7061 /* 7062 * If the destination is in FDB, the packet 7063 * should be forwarded to the corresponding 7064 * vsw_port (connected to a vnet device - 7065 * VSW_VNETPORT) 7066 */ 7067 (void) vsw_portsend(port, mp); 7068 7069 /* 7070 * Decrement use count in port and check if 7071 * should wake delete thread. 7072 */ 7073 mutex_enter(&port->ref_lock); 7074 port->ref_cnt--; 7075 if (port->ref_cnt == 0) 7076 cv_signal(&port->ref_cv); 7077 mutex_exit(&port->ref_lock); 7078 } else { 7079 RW_EXIT(&plist->lockrw); 7080 /* 7081 * Destination not in FDB. 7082 * 7083 * If the destination is broadcast or 7084 * multicast forward the packet to all 7085 * (VNETPORTs, PHYSDEV, LOCALDEV), 7086 * except the caller. 7087 */ 7088 if (IS_BROADCAST(ehp)) { 7089 D3(vswp, "%s: BROADCAST pkt", __func__); 7090 (void) vsw_forward_all(vswp, mp, caller, arg); 7091 } else if (IS_MULTICAST(ehp)) { 7092 D3(vswp, "%s: MULTICAST pkt", __func__); 7093 (void) vsw_forward_grp(vswp, mp, caller, arg); 7094 } else { 7095 /* 7096 * If the destination is unicast, and came 7097 * from either a logical network device or 7098 * the switch itself when it is plumbed, then 7099 * send it out on the physical device and also 7100 * up the stack if the logical interface is 7101 * in promiscuous mode.
7102 * 7103 * NOTE: The assumption here is that if we 7104 * cannot find the destination in our fdb, its 7105 * a unicast address, and came from either a 7106 * vnet or down the stack (when plumbed) it 7107 * must be destinded for an ethernet device 7108 * outside our ldoms. 7109 */ 7110 if (caller == VSW_VNETPORT) { 7111 READ_ENTER(&vswp->if_lockrw); 7112 if (VSW_U_P(vswp->if_state)) { 7113 RW_EXIT(&vswp->if_lockrw); 7114 nmp = copymsg(mp); 7115 if (nmp) 7116 mac_rx(vswp->if_mh, 7117 mrh, nmp); 7118 } else { 7119 RW_EXIT(&vswp->if_lockrw); 7120 } 7121 if ((ret_m = vsw_tx_msg(vswp, mp)) 7122 != NULL) { 7123 DERR(vswp, "%s: drop mblks to " 7124 "phys dev", __func__); 7125 freemsg(ret_m); 7126 } 7127 7128 } else if (caller == VSW_PHYSDEV) { 7129 /* 7130 * Pkt seen because card in promisc 7131 * mode. Send up stack if plumbed in 7132 * promisc mode, else drop it. 7133 */ 7134 READ_ENTER(&vswp->if_lockrw); 7135 if (VSW_U_P(vswp->if_state)) { 7136 RW_EXIT(&vswp->if_lockrw); 7137 mac_rx(vswp->if_mh, mrh, mp); 7138 } else { 7139 RW_EXIT(&vswp->if_lockrw); 7140 freemsg(mp); 7141 } 7142 7143 } else if (caller == VSW_LOCALDEV) { 7144 /* 7145 * Pkt came down the stack, send out 7146 * over physical device. 7147 */ 7148 if ((ret_m = vsw_tx_msg(vswp, mp)) 7149 != NULL) { 7150 DERR(vswp, "%s: drop mblks to " 7151 "phys dev", __func__); 7152 freemsg(ret_m); 7153 } 7154 } 7155 } 7156 } 7157 } 7158 D1(vswp, "%s: exit\n", __func__); 7159 } 7160 7161 /* 7162 * Switch ethernet frame when in layer 3 mode (i.e. using IP 7163 * layer to do the routing). 7164 * 7165 * There is a large amount of overlap between this function and 7166 * vsw_switch_l2_frame. At some stage we need to revisit and refactor 7167 * both these functions. 7168 */ 7169 void 7170 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, 7171 vsw_port_t *arg, mac_resource_handle_t mrh) 7172 { 7173 struct ether_header *ehp; 7174 vsw_port_t *port = NULL; 7175 mblk_t *bp = NULL; 7176 vsw_port_list_t *plist = &vswp->plist; 7177 7178 D1(vswp, "%s: enter (caller %d)", __func__, caller); 7179 7180 /* 7181 * In layer 3 mode should only ever be switching packets 7182 * between IP layer and vnet devices. So make sure thats 7183 * who is invoking us. 7184 */ 7185 if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) { 7186 DERR(vswp, "%s: unexpected caller (%d)", __func__, caller); 7187 freemsgchain(mp); 7188 return; 7189 } 7190 7191 /* process the chain of packets */ 7192 bp = mp; 7193 while (bp) { 7194 mp = bp; 7195 bp = bp->b_next; 7196 mp->b_next = mp->b_prev = NULL; 7197 ehp = (struct ether_header *)mp->b_rptr; 7198 7199 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 7200 __func__, MBLKSIZE(mp), MBLKL(mp)); 7201 7202 READ_ENTER(&plist->lockrw); 7203 port = vsw_lookup_fdb(vswp, ehp); 7204 if (port) { 7205 /* 7206 * Mark port as in-use. 7207 */ 7208 mutex_enter(&port->ref_lock); 7209 port->ref_cnt++; 7210 mutex_exit(&port->ref_lock); 7211 RW_EXIT(&plist->lockrw); 7212 7213 D2(vswp, "%s: sending to target port", __func__); 7214 (void) vsw_portsend(port, mp); 7215 7216 /* 7217 * Finished with port so decrement ref count and 7218 * check if should wake delete thread. 
7219 */ 7220 mutex_enter(&port->ref_lock); 7221 port->ref_cnt--; 7222 if (port->ref_cnt == 0) 7223 cv_signal(&port->ref_cv); 7224 mutex_exit(&port->ref_lock); 7225 } else { 7226 RW_EXIT(&plist->lockrw); 7227 /* 7228 * Destination not in FDB 7229 * 7230 * If the destination is broadcast or 7231 * multicast forward the packet to all 7232 * (VNETPORTs, PHYSDEV, LOCALDEV), 7233 * except the caller. 7234 */ 7235 if (IS_BROADCAST(ehp)) { 7236 D2(vswp, "%s: BROADCAST pkt", __func__); 7237 (void) vsw_forward_all(vswp, mp, caller, arg); 7238 } else if (IS_MULTICAST(ehp)) { 7239 D2(vswp, "%s: MULTICAST pkt", __func__); 7240 (void) vsw_forward_grp(vswp, mp, caller, arg); 7241 } else { 7242 /* 7243 * Unicast pkt from vnet that we don't have 7244 * an FDB entry for, so must be destinded for 7245 * the outside world. Attempt to send up to the 7246 * IP layer to allow it to deal with it. 7247 */ 7248 if (caller == VSW_VNETPORT) { 7249 READ_ENTER(&vswp->if_lockrw); 7250 if (vswp->if_state & VSW_IF_UP) { 7251 RW_EXIT(&vswp->if_lockrw); 7252 D2(vswp, "%s: sending up", 7253 __func__); 7254 mac_rx(vswp->if_mh, mrh, mp); 7255 } else { 7256 RW_EXIT(&vswp->if_lockrw); 7257 /* Interface down, drop pkt */ 7258 D2(vswp, "%s I/F down", 7259 __func__); 7260 freemsg(mp); 7261 } 7262 } 7263 } 7264 } 7265 } 7266 7267 D1(vswp, "%s: exit", __func__); 7268 } 7269 7270 /* 7271 * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV), 7272 * except the caller (port on which frame arrived). 7273 */ 7274 static int 7275 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 7276 { 7277 vsw_port_list_t *plist = &vswp->plist; 7278 vsw_port_t *portp; 7279 mblk_t *nmp = NULL; 7280 mblk_t *ret_m = NULL; 7281 int skip_port = 0; 7282 7283 D1(vswp, "vsw_forward_all: enter\n"); 7284 7285 /* 7286 * Broadcast message from inside ldoms so send to outside 7287 * world if in either of layer 2 modes. 7288 */ 7289 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 7290 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 7291 ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) { 7292 7293 nmp = dupmsg(mp); 7294 if (nmp) { 7295 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 7296 DERR(vswp, "%s: dropping pkt(s) " 7297 "consisting of %ld bytes of data for" 7298 " physical device", __func__, MBLKL(ret_m)); 7299 freemsg(ret_m); 7300 } 7301 } 7302 } 7303 7304 if (caller == VSW_VNETPORT) 7305 skip_port = 1; 7306 7307 /* 7308 * Broadcast message from other vnet (layer 2 or 3) or outside 7309 * world (layer 2 only), send up stack if plumbed. 7310 */ 7311 if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) { 7312 READ_ENTER(&vswp->if_lockrw); 7313 if (vswp->if_state & VSW_IF_UP) { 7314 RW_EXIT(&vswp->if_lockrw); 7315 nmp = copymsg(mp); 7316 if (nmp) 7317 mac_rx(vswp->if_mh, NULL, nmp); 7318 } else { 7319 RW_EXIT(&vswp->if_lockrw); 7320 } 7321 } 7322 7323 /* send it to all VNETPORTs */ 7324 READ_ENTER(&plist->lockrw); 7325 for (portp = plist->head; portp != NULL; portp = portp->p_next) { 7326 D2(vswp, "vsw_forward_all: port %d", portp->p_instance); 7327 /* 7328 * Caution ! - don't reorder these two checks as arg 7329 * will be NULL if the caller is PHYSDEV. skip_port is 7330 * only set if caller is VNETPORT. 
7331 */ 7332 if ((skip_port) && (portp == arg)) 7333 continue; 7334 else { 7335 nmp = dupmsg(mp); 7336 if (nmp) { 7337 (void) vsw_portsend(portp, nmp); 7338 } else { 7339 DERR(vswp, "vsw_forward_all: nmp NULL"); 7340 } 7341 } 7342 } 7343 RW_EXIT(&plist->lockrw); 7344 7345 freemsg(mp); 7346 7347 D1(vswp, "vsw_forward_all: exit\n"); 7348 return (0); 7349 } 7350 7351 /* 7352 * Forward pkts to any devices or interfaces which have registered 7353 * an interest in them (i.e. multicast groups). 7354 */ 7355 static int 7356 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 7357 { 7358 struct ether_header *ehp = (struct ether_header *)mp->b_rptr; 7359 mfdb_ent_t *entp = NULL; 7360 mfdb_ent_t *tpp = NULL; 7361 vsw_port_t *port; 7362 uint64_t key = 0; 7363 mblk_t *nmp = NULL; 7364 mblk_t *ret_m = NULL; 7365 boolean_t check_if = B_TRUE; 7366 7367 /* 7368 * Convert address to hash table key 7369 */ 7370 KEY_HASH(key, ehp->ether_dhost); 7371 7372 D1(vswp, "%s: key 0x%llx", __func__, key); 7373 7374 /* 7375 * If pkt came from either a vnet or down the stack (if we are 7376 * plumbed) and we are in layer 2 mode, then we send the pkt out 7377 * over the physical adapter, and then check to see if any other 7378 * vnets are interested in it. 7379 */ 7380 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 7381 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 7382 ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) { 7383 nmp = dupmsg(mp); 7384 if (nmp) { 7385 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 7386 DERR(vswp, "%s: dropping pkt(s) consisting of " 7387 "%ld bytes of data for physical device", 7388 __func__, MBLKL(ret_m)); 7389 freemsg(ret_m); 7390 } 7391 } 7392 } 7393 7394 READ_ENTER(&vswp->mfdbrw); 7395 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key, 7396 (mod_hash_val_t *)&entp) != 0) { 7397 D3(vswp, "%s: no table entry found for addr 0x%llx", 7398 __func__, key); 7399 } else { 7400 /* 7401 * Send to list of devices associated with this address... 7402 */ 7403 for (tpp = entp; tpp != NULL; tpp = tpp->nextp) { 7404 7405 /* dont send to ourselves */ 7406 if ((caller == VSW_VNETPORT) && 7407 (tpp->d_addr == (void *)arg)) { 7408 port = (vsw_port_t *)tpp->d_addr; 7409 D3(vswp, "%s: not sending to ourselves" 7410 " : port %d", __func__, port->p_instance); 7411 continue; 7412 7413 } else if ((caller == VSW_LOCALDEV) && 7414 (tpp->d_type == VSW_LOCALDEV)) { 7415 D3(vswp, "%s: not sending back up stack", 7416 __func__); 7417 continue; 7418 } 7419 7420 if (tpp->d_type == VSW_VNETPORT) { 7421 port = (vsw_port_t *)tpp->d_addr; 7422 D3(vswp, "%s: sending to port %ld for addr " 7423 "0x%llx", __func__, port->p_instance, key); 7424 7425 nmp = dupmsg(mp); 7426 if (nmp) 7427 (void) vsw_portsend(port, nmp); 7428 } else { 7429 if (vswp->if_state & VSW_IF_UP) { 7430 nmp = copymsg(mp); 7431 if (nmp) 7432 mac_rx(vswp->if_mh, NULL, nmp); 7433 check_if = B_FALSE; 7434 D3(vswp, "%s: sending up stack" 7435 " for addr 0x%llx", __func__, key); 7436 } 7437 } 7438 } 7439 } 7440 7441 RW_EXIT(&vswp->mfdbrw); 7442 7443 /* 7444 * If the pkt came from either a vnet or from physical device, 7445 * and if we havent already sent the pkt up the stack then we 7446 * check now if we can/should (i.e. the interface is plumbed 7447 * and in promisc mode). 
7448 */ 7449 if ((check_if) && 7450 ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) { 7451 READ_ENTER(&vswp->if_lockrw); 7452 if (VSW_U_P(vswp->if_state)) { 7453 RW_EXIT(&vswp->if_lockrw); 7454 D3(vswp, "%s: (caller %d) finally sending up stack" 7455 " for addr 0x%llx", __func__, caller, key); 7456 nmp = copymsg(mp); 7457 if (nmp) 7458 mac_rx(vswp->if_mh, NULL, nmp); 7459 } else { 7460 RW_EXIT(&vswp->if_lockrw); 7461 } 7462 } 7463 7464 freemsg(mp); 7465 7466 D1(vswp, "%s: exit", __func__); 7467 7468 return (0); 7469 } 7470 7471 /* transmit the packet over the given port */ 7472 static int 7473 vsw_portsend(vsw_port_t *port, mblk_t *mp) 7474 { 7475 vsw_ldc_list_t *ldcl = &port->p_ldclist; 7476 vsw_ldc_t *ldcp; 7477 int status = 0; 7478 7479 7480 READ_ENTER(&ldcl->lockrw); 7481 /* 7482 * Note for now, we have a single channel. 7483 */ 7484 ldcp = ldcl->head; 7485 if (ldcp == NULL) { 7486 DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n"); 7487 freemsg(mp); 7488 RW_EXIT(&ldcl->lockrw); 7489 return (1); 7490 } 7491 7492 /* 7493 * Send the message out using the appropriate 7494 * transmit function which will free mblock when it 7495 * is finished with it. 7496 */ 7497 mutex_enter(&port->tx_lock); 7498 if (port->transmit != NULL) 7499 status = (*port->transmit)(ldcp, mp); 7500 else { 7501 freemsg(mp); 7502 } 7503 mutex_exit(&port->tx_lock); 7504 7505 RW_EXIT(&ldcl->lockrw); 7506 7507 return (status); 7508 } 7509 7510 /* 7511 * Send packet out via descriptor ring to a logical device. 7512 */ 7513 static int 7514 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp) 7515 { 7516 vio_dring_msg_t dring_pkt; 7517 dring_info_t *dp = NULL; 7518 vsw_private_desc_t *priv_desc = NULL; 7519 vnet_public_desc_t *pub = NULL; 7520 vsw_t *vswp = ldcp->ldc_vswp; 7521 mblk_t *bp; 7522 size_t n, size; 7523 caddr_t bufp; 7524 int idx; 7525 int status = LDC_TX_SUCCESS; 7526 7527 D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id); 7528 7529 /* TODO: make test a macro */ 7530 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 7531 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 7532 DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping " 7533 "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status, 7534 ldcp->lane_out.lstate); 7535 freemsg(mp); 7536 return (LDC_TX_FAILURE); 7537 } 7538 7539 /* 7540 * Note - using first ring only, this may change 7541 * in the future. 7542 */ 7543 READ_ENTER(&ldcp->lane_out.dlistrw); 7544 if ((dp = ldcp->lane_out.dringp) == NULL) { 7545 RW_EXIT(&ldcp->lane_out.dlistrw); 7546 DERR(vswp, "%s(%lld): no dring for outbound lane on" 7547 " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id); 7548 freemsg(mp); 7549 return (LDC_TX_FAILURE); 7550 } 7551 7552 size = msgsize(mp); 7553 if (size > (size_t)ETHERMAX) { 7554 RW_EXIT(&ldcp->lane_out.dlistrw); 7555 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 7556 ldcp->ldc_id, size); 7557 freemsg(mp); 7558 return (LDC_TX_FAILURE); 7559 } 7560 7561 /* 7562 * Find a free descriptor 7563 * 7564 * Note: for the moment we are assuming that we will only 7565 * have one dring going from the switch to each of its 7566 * peers. This may change in the future. 
7567 */ 7568 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 7569 D2(vswp, "%s(%lld): no descriptor available for ring " 7570 "at 0x%llx", __func__, ldcp->ldc_id, dp); 7571 7572 /* nothing more we can do */ 7573 status = LDC_TX_NORESOURCES; 7574 goto vsw_dringsend_free_exit; 7575 } else { 7576 D2(vswp, "%s(%lld): free private descriptor found at pos %ld " 7577 "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc); 7578 } 7579 7580 /* copy data into the descriptor */ 7581 bufp = priv_desc->datap; 7582 bufp += VNET_IPALIGN; 7583 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 7584 n = MBLKL(bp); 7585 bcopy(bp->b_rptr, bufp, n); 7586 bufp += n; 7587 } 7588 7589 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 7590 7591 pub = priv_desc->descp; 7592 pub->nbytes = priv_desc->datalen; 7593 7594 mutex_enter(&priv_desc->dstate_lock); 7595 pub->hdr.dstate = VIO_DESC_READY; 7596 mutex_exit(&priv_desc->dstate_lock); 7597 7598 /* 7599 * Determine whether or not we need to send a message to our 7600 * peer prompting them to read our newly updated descriptor(s). 7601 */ 7602 mutex_enter(&dp->restart_lock); 7603 if (dp->restart_reqd) { 7604 dp->restart_reqd = B_FALSE; 7605 mutex_exit(&dp->restart_lock); 7606 7607 /* 7608 * Send a vio_dring_msg to peer to prompt them to read 7609 * the updated descriptor ring. 7610 */ 7611 dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA; 7612 dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 7613 dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA; 7614 dring_pkt.tag.vio_sid = ldcp->local_session; 7615 7616 /* Note - for now using first ring */ 7617 dring_pkt.dring_ident = dp->ident; 7618 7619 mutex_enter(&ldcp->lane_out.seq_lock); 7620 dring_pkt.seq_num = ldcp->lane_out.seq_num++; 7621 mutex_exit(&ldcp->lane_out.seq_lock); 7622 7623 /* 7624 * If last_ack_recv is -1 then we know we've not 7625 * received any ack's yet, so this must be the first 7626 * msg sent, so set the start to the begining of the ring. 7627 */ 7628 mutex_enter(&dp->dlock); 7629 if (dp->last_ack_recv == -1) { 7630 dring_pkt.start_idx = 0; 7631 } else { 7632 dring_pkt.start_idx = 7633 (dp->last_ack_recv + 1) % dp->num_descriptors; 7634 } 7635 dring_pkt.end_idx = -1; 7636 mutex_exit(&dp->dlock); 7637 7638 D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__, 7639 ldcp->ldc_id, dp, dring_pkt.dring_ident); 7640 D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n", 7641 __func__, ldcp->ldc_id, dring_pkt.start_idx, 7642 dring_pkt.end_idx, dring_pkt.seq_num); 7643 7644 RW_EXIT(&ldcp->lane_out.dlistrw); 7645 7646 (void) vsw_send_msg(ldcp, (void *)&dring_pkt, 7647 sizeof (vio_dring_msg_t), B_TRUE); 7648 7649 /* free the message block */ 7650 freemsg(mp); 7651 return (status); 7652 7653 } else { 7654 mutex_exit(&dp->restart_lock); 7655 D2(vswp, "%s(%lld): updating descp %d", __func__, 7656 ldcp->ldc_id, idx); 7657 } 7658 7659 vsw_dringsend_free_exit: 7660 7661 RW_EXIT(&ldcp->lane_out.dlistrw); 7662 7663 /* free the message block */ 7664 freemsg(mp); 7665 7666 D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); 7667 return (status); 7668 } 7669 7670 /* 7671 * Send an in-band descriptor message over ldc. 
7672 */ 7673 static int 7674 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp) 7675 { 7676 vsw_t *vswp = ldcp->ldc_vswp; 7677 vnet_ibnd_desc_t ibnd_msg; 7678 vsw_private_desc_t *priv_desc = NULL; 7679 dring_info_t *dp = NULL; 7680 size_t n, size = 0; 7681 caddr_t bufp; 7682 mblk_t *bp; 7683 int idx, i; 7684 int status = LDC_TX_SUCCESS; 7685 static int warn_msg = 1; 7686 7687 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 7688 7689 ASSERT(mp != NULL); 7690 7691 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 7692 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 7693 DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt", 7694 __func__, ldcp->ldc_id, ldcp->ldc_status, 7695 ldcp->lane_out.lstate); 7696 freemsg(mp); 7697 return (LDC_TX_FAILURE); 7698 } 7699 7700 /* 7701 * only expect single dring to exist, which we use 7702 * as an internal buffer, rather than a transfer channel. 7703 */ 7704 READ_ENTER(&ldcp->lane_out.dlistrw); 7705 if ((dp = ldcp->lane_out.dringp) == NULL) { 7706 DERR(vswp, "%s(%lld): no dring for outbound lane", 7707 __func__, ldcp->ldc_id); 7708 DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__, 7709 ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate); 7710 RW_EXIT(&ldcp->lane_out.dlistrw); 7711 freemsg(mp); 7712 return (LDC_TX_FAILURE); 7713 } 7714 7715 size = msgsize(mp); 7716 if (size > (size_t)ETHERMAX) { 7717 RW_EXIT(&ldcp->lane_out.dlistrw); 7718 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 7719 ldcp->ldc_id, size); 7720 freemsg(mp); 7721 return (LDC_TX_FAILURE); 7722 } 7723 7724 /* 7725 * Find a free descriptor in our buffer ring 7726 */ 7727 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 7728 RW_EXIT(&ldcp->lane_out.dlistrw); 7729 if (warn_msg) { 7730 DERR(vswp, "%s(%lld): no descriptor available for ring " 7731 "at 0x%llx", __func__, ldcp->ldc_id, dp); 7732 warn_msg = 0; 7733 } 7734 7735 /* nothing more we can do */ 7736 status = LDC_TX_NORESOURCES; 7737 goto vsw_descrsend_free_exit; 7738 } else { 7739 D2(vswp, "%s(%lld): free private descriptor found at pos " 7740 "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc); 7741 warn_msg = 1; 7742 } 7743 7744 /* copy data into the descriptor */ 7745 bufp = priv_desc->datap; 7746 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 7747 n = MBLKL(bp); 7748 bcopy(bp->b_rptr, bufp, n); 7749 bufp += n; 7750 } 7751 7752 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 7753 7754 /* create and send the in-band descp msg */ 7755 ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA; 7756 ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO; 7757 ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA; 7758 ibnd_msg.hdr.tag.vio_sid = ldcp->local_session; 7759 7760 mutex_enter(&ldcp->lane_out.seq_lock); 7761 ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++; 7762 mutex_exit(&ldcp->lane_out.seq_lock); 7763 7764 /* 7765 * Copy the mem cookies describing the data from the 7766 * private region of the descriptor ring into the inband 7767 * descriptor. 
7768 */ 7769 for (i = 0; i < priv_desc->ncookies; i++) { 7770 bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i], 7771 sizeof (ldc_mem_cookie_t)); 7772 } 7773 7774 ibnd_msg.hdr.desc_handle = idx; 7775 ibnd_msg.ncookies = priv_desc->ncookies; 7776 ibnd_msg.nbytes = size; 7777 7778 RW_EXIT(&ldcp->lane_out.dlistrw); 7779 7780 (void) vsw_send_msg(ldcp, (void *)&ibnd_msg, 7781 sizeof (vnet_ibnd_desc_t), B_TRUE); 7782 7783 vsw_descrsend_free_exit: 7784 7785 /* free the allocated message blocks */ 7786 freemsg(mp); 7787 7788 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 7789 return (status); 7790 } 7791 7792 static void 7793 vsw_send_ver(void *arg) 7794 { 7795 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; 7796 vsw_t *vswp = ldcp->ldc_vswp; 7797 lane_t *lp = &ldcp->lane_out; 7798 vio_ver_msg_t ver_msg; 7799 7800 D1(vswp, "%s enter", __func__); 7801 7802 ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 7803 ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 7804 ver_msg.tag.vio_subtype_env = VIO_VER_INFO; 7805 ver_msg.tag.vio_sid = ldcp->local_session; 7806 7807 ver_msg.ver_major = vsw_versions[0].ver_major; 7808 ver_msg.ver_minor = vsw_versions[0].ver_minor; 7809 ver_msg.dev_class = VDEV_NETWORK_SWITCH; 7810 7811 lp->lstate |= VSW_VER_INFO_SENT; 7812 lp->ver_major = ver_msg.ver_major; 7813 lp->ver_minor = ver_msg.ver_minor; 7814 7815 DUMP_TAG(ver_msg.tag); 7816 7817 (void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE); 7818 7819 D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id); 7820 } 7821 7822 static void 7823 vsw_send_attr(vsw_ldc_t *ldcp) 7824 { 7825 vsw_t *vswp = ldcp->ldc_vswp; 7826 lane_t *lp = &ldcp->lane_out; 7827 vnet_attr_msg_t attr_msg; 7828 7829 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); 7830 7831 /* 7832 * Subtype is set to INFO by default 7833 */ 7834 attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 7835 attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 7836 attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO; 7837 attr_msg.tag.vio_sid = ldcp->local_session; 7838 7839 /* payload copied from default settings for lane */ 7840 attr_msg.mtu = lp->mtu; 7841 attr_msg.addr_type = lp->addr_type; 7842 attr_msg.xfer_mode = lp->xfer_mode; 7843 attr_msg.ack_freq = lp->xfer_mode; 7844 7845 READ_ENTER(&vswp->if_lockrw); 7846 bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL); 7847 RW_EXIT(&vswp->if_lockrw); 7848 7849 ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT; 7850 7851 DUMP_TAG(attr_msg.tag); 7852 7853 (void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE); 7854 7855 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); 7856 } 7857 7858 /* 7859 * Create dring info msg (which also results in the creation of 7860 * a dring). 7861 */ 7862 static vio_dring_reg_msg_t * 7863 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp) 7864 { 7865 vio_dring_reg_msg_t *mp; 7866 dring_info_t *dp; 7867 vsw_t *vswp = ldcp->ldc_vswp; 7868 7869 D1(vswp, "vsw_create_dring_info_pkt enter\n"); 7870 7871 /* 7872 * If we can't create a dring, obviously no point sending 7873 * a message. 
7874 */ 7875 if ((dp = vsw_create_dring(ldcp)) == NULL) 7876 return (NULL); 7877 7878 mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP); 7879 7880 mp->tag.vio_msgtype = VIO_TYPE_CTRL; 7881 mp->tag.vio_subtype = VIO_SUBTYPE_INFO; 7882 mp->tag.vio_subtype_env = VIO_DRING_REG; 7883 mp->tag.vio_sid = ldcp->local_session; 7884 7885 /* payload */ 7886 mp->num_descriptors = dp->num_descriptors; 7887 mp->descriptor_size = dp->descriptor_size; 7888 mp->options = dp->options; 7889 mp->ncookies = dp->ncookies; 7890 bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t)); 7891 7892 mp->dring_ident = 0; 7893 7894 D1(vswp, "vsw_create_dring_info_pkt exit\n"); 7895 7896 return (mp); 7897 } 7898 7899 static void 7900 vsw_send_dring_info(vsw_ldc_t *ldcp) 7901 { 7902 vio_dring_reg_msg_t *dring_msg; 7903 vsw_t *vswp = ldcp->ldc_vswp; 7904 7905 D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id); 7906 7907 dring_msg = vsw_create_dring_info_pkt(ldcp); 7908 if (dring_msg == NULL) { 7909 cmn_err(CE_WARN, "!vsw%d: %s: error creating msg", 7910 vswp->instance, __func__); 7911 return; 7912 } 7913 7914 ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT; 7915 7916 DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg); 7917 7918 (void) vsw_send_msg(ldcp, dring_msg, 7919 sizeof (vio_dring_reg_msg_t), B_TRUE); 7920 7921 kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t)); 7922 7923 D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id); 7924 } 7925 7926 static void 7927 vsw_send_rdx(vsw_ldc_t *ldcp) 7928 { 7929 vsw_t *vswp = ldcp->ldc_vswp; 7930 vio_rdx_msg_t rdx_msg; 7931 7932 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); 7933 7934 rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 7935 rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 7936 rdx_msg.tag.vio_subtype_env = VIO_RDX; 7937 rdx_msg.tag.vio_sid = ldcp->local_session; 7938 7939 ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT; 7940 7941 DUMP_TAG(rdx_msg.tag); 7942 7943 (void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE); 7944 7945 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); 7946 } 7947 7948 /* 7949 * Generic routine to send message out over ldc channel. 7950 * 7951 * It is possible that when we attempt to write over the ldc channel 7952 * that we get notified that it has been reset. Depending on the value 7953 * of the handle_reset flag we either handle that event here or simply 7954 * notify the caller that the channel was reset. 7955 */ 7956 static int 7957 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset) 7958 { 7959 int rv; 7960 size_t msglen = size; 7961 vio_msg_tag_t *tag = (vio_msg_tag_t *)msgp; 7962 vsw_t *vswp = ldcp->ldc_vswp; 7963 7964 D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes", 7965 ldcp->ldc_id, size); 7966 7967 D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype); 7968 D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype); 7969 D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env); 7970 7971 mutex_enter(&ldcp->ldc_txlock); 7972 do { 7973 msglen = size; 7974 rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen); 7975 } while (rv == EWOULDBLOCK && --vsw_wretries > 0); 7976 7977 if ((rv != 0) || (msglen != size)) { 7978 DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) " 7979 "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen); 7980 } 7981 mutex_exit(&ldcp->ldc_txlock); 7982 7983 /* 7984 * If channel has been reset we either handle it here or 7985 * simply report back that it has been reset and let caller 7986 * decide what to do. 
7987 */ 7988 if (rv == ECONNRESET) { 7989 DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id); 7990 7991 /* 7992 * N.B - must never be holding the dlistrw lock when 7993 * we do a reset of the channel. 7994 */ 7995 if (handle_reset) { 7996 vsw_process_conn_evt(ldcp, VSW_CONN_RESET); 7997 } 7998 } 7999 8000 return (rv); 8001 } 8002 8003 /* 8004 * Add an entry into FDB, for the given mac address and port_id. 8005 * Returns 0 on success, 1 on failure. 8006 * 8007 * Lock protecting FDB must be held by calling process. 8008 */ 8009 static int 8010 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port) 8011 { 8012 uint64_t addr = 0; 8013 8014 D1(vswp, "%s: enter", __func__); 8015 8016 KEY_HASH(addr, port->p_macaddr); 8017 8018 D2(vswp, "%s: key = 0x%llx", __func__, addr); 8019 8020 /* 8021 * Note: duplicate keys will be rejected by mod_hash. 8022 */ 8023 if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr, 8024 (mod_hash_val_t)port) != 0) { 8025 DERR(vswp, "%s: unable to add entry into fdb.", __func__); 8026 return (1); 8027 } 8028 8029 D1(vswp, "%s: exit", __func__); 8030 return (0); 8031 } 8032 8033 /* 8034 * Remove an entry from FDB. 8035 * Returns 0 on success, 1 on failure. 8036 */ 8037 static int 8038 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port) 8039 { 8040 uint64_t addr = 0; 8041 8042 D1(vswp, "%s: enter", __func__); 8043 8044 KEY_HASH(addr, port->p_macaddr); 8045 8046 D2(vswp, "%s: key = 0x%llx", __func__, addr); 8047 8048 (void) mod_hash_destroy(vswp->fdb, (mod_hash_val_t)addr); 8049 8050 D1(vswp, "%s: enter", __func__); 8051 8052 return (0); 8053 } 8054 8055 /* 8056 * Search fdb for a given mac address. 8057 * Returns pointer to the entry if found, else returns NULL. 8058 */ 8059 static vsw_port_t * 8060 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp) 8061 { 8062 uint64_t key = 0; 8063 vsw_port_t *port = NULL; 8064 8065 D1(vswp, "%s: enter", __func__); 8066 8067 KEY_HASH(key, ehp->ether_dhost); 8068 8069 D2(vswp, "%s: key = 0x%llx", __func__, key); 8070 8071 if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key, 8072 (mod_hash_val_t *)&port) != 0) { 8073 D2(vswp, "%s: no port found", __func__); 8074 return (NULL); 8075 } 8076 8077 D1(vswp, "%s: exit", __func__); 8078 8079 return (port); 8080 } 8081 8082 /* 8083 * Add or remove multicast address(es). 8084 * 8085 * Returns 0 on success, 1 on failure. 8086 */ 8087 static int 8088 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port) 8089 { 8090 mcst_addr_t *mcst_p = NULL; 8091 vsw_t *vswp = port->p_vswp; 8092 uint64_t addr = 0x0; 8093 int i; 8094 8095 D1(vswp, "%s: enter", __func__); 8096 8097 D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count); 8098 8099 for (i = 0; i < mcst_pkt->count; i++) { 8100 /* 8101 * Convert address into form that can be used 8102 * as hash table key. 8103 */ 8104 KEY_HASH(addr, mcst_pkt->mca[i]); 8105 8106 /* 8107 * Add or delete the specified address/port combination. 8108 */ 8109 if (mcst_pkt->set == 0x1) { 8110 D3(vswp, "%s: adding multicast address 0x%llx for " 8111 "port %ld", __func__, addr, port->p_instance); 8112 if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 8113 /* 8114 * Update the list of multicast 8115 * addresses contained within the 8116 * port structure to include this new 8117 * one. 
8118 */ 8119 mcst_p = kmem_zalloc(sizeof (mcst_addr_t), 8120 KM_NOSLEEP); 8121 if (mcst_p == NULL) { 8122 DERR(vswp, "%s: unable to alloc mem", 8123 __func__); 8124 (void) vsw_del_mcst(vswp, 8125 VSW_VNETPORT, addr, port); 8126 return (1); 8127 } 8128 8129 mcst_p->nextp = NULL; 8130 mcst_p->addr = addr; 8131 ether_copy(&mcst_pkt->mca[i], &mcst_p->mca); 8132 8133 /* 8134 * Program the address into HW. If the addr 8135 * has already been programmed then the MAC 8136 * just increments a ref counter (which is 8137 * used when the address is being deleted) 8138 */ 8139 mutex_enter(&vswp->mac_lock); 8140 if (vswp->mh != NULL) { 8141 if (mac_multicst_add(vswp->mh, 8142 (uchar_t *)&mcst_pkt->mca[i])) { 8143 mutex_exit(&vswp->mac_lock); 8144 cmn_err(CE_WARN, "!vsw%d: " 8145 "unable to add multicast " 8146 "address: %s\n", 8147 vswp->instance, 8148 ether_sprintf((void *) 8149 &mcst_p->mca)); 8150 (void) vsw_del_mcst(vswp, 8151 VSW_VNETPORT, addr, port); 8152 kmem_free(mcst_p, 8153 sizeof (*mcst_p)); 8154 return (1); 8155 } 8156 mcst_p->mac_added = B_TRUE; 8157 } 8158 mutex_exit(&vswp->mac_lock); 8159 8160 mutex_enter(&port->mca_lock); 8161 mcst_p->nextp = port->mcap; 8162 port->mcap = mcst_p; 8163 mutex_exit(&port->mca_lock); 8164 8165 } else { 8166 DERR(vswp, "%s: error adding multicast " 8167 "address 0x%llx for port %ld", 8168 __func__, addr, port->p_instance); 8169 return (1); 8170 } 8171 } else { 8172 /* 8173 * Delete an entry from the multicast hash 8174 * table and update the address list 8175 * appropriately. 8176 */ 8177 if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 8178 D3(vswp, "%s: deleting multicast address " 8179 "0x%llx for port %ld", __func__, addr, 8180 port->p_instance); 8181 8182 mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr); 8183 ASSERT(mcst_p != NULL); 8184 8185 /* 8186 * Remove the address from HW. The address 8187 * will actually only be removed once the ref 8188 * count within the MAC layer has dropped to 8189 * zero. I.e. we can safely call this fn even 8190 * if other ports are interested in this 8191 * address. 8192 */ 8193 mutex_enter(&vswp->mac_lock); 8194 if (vswp->mh != NULL && mcst_p->mac_added) { 8195 if (mac_multicst_remove(vswp->mh, 8196 (uchar_t *)&mcst_pkt->mca[i])) { 8197 mutex_exit(&vswp->mac_lock); 8198 cmn_err(CE_WARN, "!vsw%d: " 8199 "unable to remove mcast " 8200 "address: %s\n", 8201 vswp->instance, 8202 ether_sprintf((void *) 8203 &mcst_p->mca)); 8204 kmem_free(mcst_p, 8205 sizeof (*mcst_p)); 8206 return (1); 8207 } 8208 mcst_p->mac_added = B_FALSE; 8209 } 8210 mutex_exit(&vswp->mac_lock); 8211 kmem_free(mcst_p, sizeof (*mcst_p)); 8212 8213 } else { 8214 DERR(vswp, "%s: error deleting multicast " 8215 "addr 0x%llx for port %ld", 8216 __func__, addr, port->p_instance); 8217 return (1); 8218 } 8219 } 8220 } 8221 D1(vswp, "%s: exit", __func__); 8222 return (0); 8223 } 8224 8225 /* 8226 * Add a new multicast entry. 8227 * 8228 * Search hash table based on address. If match found then 8229 * update associated val (which is chain of ports), otherwise 8230 * create new key/val (addr/port) pair and insert into table. 8231 */ 8232 static int 8233 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 8234 { 8235 int dup = 0; 8236 int rv = 0; 8237 mfdb_ent_t *ment = NULL; 8238 mfdb_ent_t *tmp_ent = NULL; 8239 mfdb_ent_t *new_ent = NULL; 8240 void *tgt = NULL; 8241 8242 if (devtype == VSW_VNETPORT) { 8243 /* 8244 * Being invoked from a vnet. 
8245 */ 8246 ASSERT(arg != NULL); 8247 tgt = arg; 8248 D2(NULL, "%s: port %d : address 0x%llx", __func__, 8249 ((vsw_port_t *)arg)->p_instance, addr); 8250 } else { 8251 /* 8252 * We are being invoked via the m_multicst mac entry 8253 * point. 8254 */ 8255 D2(NULL, "%s: address 0x%llx", __func__, addr); 8256 tgt = (void *)vswp; 8257 } 8258 8259 WRITE_ENTER(&vswp->mfdbrw); 8260 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 8261 (mod_hash_val_t *)&ment) != 0) { 8262 8263 /* address not currently in table */ 8264 ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 8265 ment->d_addr = (void *)tgt; 8266 ment->d_type = devtype; 8267 ment->nextp = NULL; 8268 8269 if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr, 8270 (mod_hash_val_t)ment) != 0) { 8271 DERR(vswp, "%s: hash table insertion failed", __func__); 8272 kmem_free(ment, sizeof (mfdb_ent_t)); 8273 rv = 1; 8274 } else { 8275 D2(vswp, "%s: added initial entry for 0x%llx to " 8276 "table", __func__, addr); 8277 } 8278 } else { 8279 /* 8280 * Address in table. Check to see if specified port 8281 * is already associated with the address. If not add 8282 * it now. 8283 */ 8284 tmp_ent = ment; 8285 while (tmp_ent != NULL) { 8286 if (tmp_ent->d_addr == (void *)tgt) { 8287 if (devtype == VSW_VNETPORT) { 8288 DERR(vswp, "%s: duplicate port entry " 8289 "found for portid %ld and key " 8290 "0x%llx", __func__, 8291 ((vsw_port_t *)arg)->p_instance, 8292 addr); 8293 } else { 8294 DERR(vswp, "%s: duplicate entry found" 8295 "for key 0x%llx", __func__, addr); 8296 } 8297 rv = 1; 8298 dup = 1; 8299 break; 8300 } 8301 tmp_ent = tmp_ent->nextp; 8302 } 8303 8304 /* 8305 * Port not on list so add it to end now. 8306 */ 8307 if (0 == dup) { 8308 D2(vswp, "%s: added entry for 0x%llx to table", 8309 __func__, addr); 8310 new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 8311 new_ent->d_addr = (void *)tgt; 8312 new_ent->d_type = devtype; 8313 new_ent->nextp = NULL; 8314 8315 tmp_ent = ment; 8316 while (tmp_ent->nextp != NULL) 8317 tmp_ent = tmp_ent->nextp; 8318 8319 tmp_ent->nextp = new_ent; 8320 } 8321 } 8322 8323 RW_EXIT(&vswp->mfdbrw); 8324 return (rv); 8325 } 8326 8327 /* 8328 * Remove a multicast entry from the hashtable. 8329 * 8330 * Search hash table based on address. If match found, scan 8331 * list of ports associated with address. If specified port 8332 * found remove it from list. 
8333 */ 8334 static int 8335 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 8336 { 8337 mfdb_ent_t *ment = NULL; 8338 mfdb_ent_t *curr_p, *prev_p; 8339 void *tgt = NULL; 8340 8341 D1(vswp, "%s: enter", __func__); 8342 8343 if (devtype == VSW_VNETPORT) { 8344 tgt = (vsw_port_t *)arg; 8345 D2(vswp, "%s: removing port %d from mFDB for address" 8346 " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr); 8347 } else { 8348 D2(vswp, "%s: removing entry", __func__); 8349 tgt = (void *)vswp; 8350 } 8351 8352 WRITE_ENTER(&vswp->mfdbrw); 8353 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 8354 (mod_hash_val_t *)&ment) != 0) { 8355 D2(vswp, "%s: address 0x%llx not in table", __func__, addr); 8356 RW_EXIT(&vswp->mfdbrw); 8357 return (1); 8358 } 8359 8360 prev_p = curr_p = ment; 8361 8362 while (curr_p != NULL) { 8363 if (curr_p->d_addr == (void *)tgt) { 8364 if (devtype == VSW_VNETPORT) { 8365 D2(vswp, "%s: port %d found", __func__, 8366 ((vsw_port_t *)tgt)->p_instance); 8367 } else { 8368 D2(vswp, "%s: instance found", __func__); 8369 } 8370 8371 if (prev_p == curr_p) { 8372 /* 8373 * head of list, if no other element is in 8374 * list then destroy this entry, otherwise 8375 * just replace it with updated value. 8376 */ 8377 ment = curr_p->nextp; 8378 if (ment == NULL) { 8379 (void) mod_hash_destroy(vswp->mfdb, 8380 (mod_hash_val_t)addr); 8381 } else { 8382 (void) mod_hash_replace(vswp->mfdb, 8383 (mod_hash_key_t)addr, 8384 (mod_hash_val_t)ment); 8385 } 8386 } else { 8387 /* 8388 * Not head of list, no need to do 8389 * replacement, just adjust list pointers. 8390 */ 8391 prev_p->nextp = curr_p->nextp; 8392 } 8393 break; 8394 } 8395 8396 prev_p = curr_p; 8397 curr_p = curr_p->nextp; 8398 } 8399 8400 RW_EXIT(&vswp->mfdbrw); 8401 8402 D1(vswp, "%s: exit", __func__); 8403 8404 if (curr_p == NULL) 8405 return (1); 8406 kmem_free(curr_p, sizeof (mfdb_ent_t)); 8407 return (0); 8408 } 8409 8410 /* 8411 * Port is being deleted, but has registered an interest in one 8412 * or more multicast groups. Using the list of addresses maintained 8413 * within the port structure find the appropriate entry in the hash 8414 * table and remove this port from the list of interested ports. 8415 */ 8416 static void 8417 vsw_del_mcst_port(vsw_port_t *port) 8418 { 8419 mcst_addr_t *mcap = NULL; 8420 vsw_t *vswp = port->p_vswp; 8421 8422 D1(vswp, "%s: enter", __func__); 8423 8424 mutex_enter(&port->mca_lock); 8425 8426 while ((mcap = port->mcap) != NULL) { 8427 8428 port->mcap = mcap->nextp; 8429 8430 mutex_exit(&port->mca_lock); 8431 8432 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 8433 mcap->addr, port); 8434 8435 /* 8436 * Remove the address from HW. The address 8437 * will actually only be removed once the ref 8438 * count within the MAC layer has dropped to 8439 * zero. I.e. we can safely call this fn even 8440 * if other ports are interested in this 8441 * address. 8442 */ 8443 mutex_enter(&vswp->mac_lock); 8444 if (vswp->mh != NULL && mcap->mac_added) { 8445 (void) mac_multicst_remove(vswp->mh, 8446 (uchar_t *)&mcap->mca); 8447 } 8448 mutex_exit(&vswp->mac_lock); 8449 8450 kmem_free(mcap, sizeof (*mcap)); 8451 8452 mutex_enter(&port->mca_lock); 8453 8454 } 8455 8456 mutex_exit(&port->mca_lock); 8457 8458 D1(vswp, "%s: exit", __func__); 8459 } 8460 8461 /* 8462 * This vsw instance is detaching, but has registered an interest in one 8463 * or more multicast groups. 
Using the list of addresses maintained 8464 * within the vsw structure, find the appropriate entry in the hash 8465 * table and remove this instance from the list of interested ports. 8466 */ 8467 static void 8468 vsw_del_mcst_vsw(vsw_t *vswp) 8469 { 8470 mcst_addr_t *next_p = NULL; 8471 8472 D1(vswp, "%s: enter", __func__); 8473 8474 mutex_enter(&vswp->mca_lock); 8475 8476 while (vswp->mcap != NULL) { 8477 DERR(vswp, "%s: deleting addr 0x%llx", 8478 __func__, vswp->mcap->addr); 8479 (void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL); 8480 8481 next_p = vswp->mcap->nextp; 8482 kmem_free(vswp->mcap, sizeof (mcst_addr_t)); 8483 vswp->mcap = next_p; 8484 } 8485 8486 vswp->mcap = NULL; 8487 mutex_exit(&vswp->mca_lock); 8488 8489 D1(vswp, "%s: exit", __func__); 8490 } 8491 8492 /* 8493 * Remove the specified address from the list of addresses maintained 8494 * in the given port or vsw instance node. 8495 */ 8496 static mcst_addr_t * 8497 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr) 8498 { 8499 vsw_t *vswp = NULL; 8500 vsw_port_t *port = NULL; 8501 mcst_addr_t *prev_p = NULL; 8502 mcst_addr_t *curr_p = NULL; 8503 8504 D1(NULL, "%s: enter : devtype %d : addr 0x%llx", 8505 __func__, devtype, addr); 8506 8507 if (devtype == VSW_VNETPORT) { 8508 port = (vsw_port_t *)arg; 8509 mutex_enter(&port->mca_lock); 8510 prev_p = curr_p = port->mcap; 8511 } else { 8512 vswp = (vsw_t *)arg; 8513 mutex_enter(&vswp->mca_lock); 8514 prev_p = curr_p = vswp->mcap; 8515 } 8516 8517 while (curr_p != NULL) { 8518 if (curr_p->addr == addr) { 8519 D2(NULL, "%s: address found", __func__); 8520 /* match found */ 8521 if (prev_p == curr_p) { 8522 /* list head */ 8523 if (devtype == VSW_VNETPORT) 8524 port->mcap = curr_p->nextp; 8525 else 8526 vswp->mcap = curr_p->nextp; 8527 } else { 8528 prev_p->nextp = curr_p->nextp; 8529 } 8530 break; 8531 } else { 8532 prev_p = curr_p; 8533 curr_p = curr_p->nextp; 8534 } 8535 } 8536 8537 if (devtype == VSW_VNETPORT) 8538 mutex_exit(&port->mca_lock); 8539 else 8540 mutex_exit(&vswp->mca_lock); 8541 8542 D1(NULL, "%s: exit", __func__); 8543 8544 return (curr_p); 8545 } 8546 8547 /* 8548 * Creates a descriptor ring (dring) and links it into the 8549 * list of outbound drings for this channel. 8550 * 8551 * Returns NULL if creation failed. 8552 */ 8553 static dring_info_t * 8554 vsw_create_dring(vsw_ldc_t *ldcp) 8555 { 8556 vsw_private_desc_t *priv_addr = NULL; 8557 vsw_t *vswp = ldcp->ldc_vswp; 8558 ldc_mem_info_t minfo; 8559 dring_info_t *dp, *tp; 8560 int i; 8561 8562 dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 8563 8564 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 8565 8566 /* create public section of ring */ 8567 if ((ldc_mem_dring_create(VSW_RING_NUM_EL, 8568 VSW_PUB_SIZE, &dp->handle)) != 0) { 8569 8570 DERR(vswp, "vsw_create_dring(%lld): ldc dring create " 8571 "failed", ldcp->ldc_id); 8572 goto create_fail_exit; 8573 } 8574 8575 ASSERT(dp->handle != NULL); 8576 8577 /* 8578 * Get the base address of the public section of the ring.
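 *
 * [Illustrative note, not part of the driver: the ring ends up
 * with VSW_RING_NUM_EL public descriptors (exported to the peer
 * over LDC) and a parallel array of private descriptors seen
 * only by this end. The two are indexed one-to-one, so entry i
 * can be reached from either side:
 *
 *	vnet_public_desc_t *pub =
 *	    (vnet_public_desc_t *)dp->pub_addr + i;
 *	vsw_private_desc_t *priv =
 *	    (vsw_private_desc_t *)dp->priv_addr + i;
 *
 * vsw_setup_ring() below records this pairing in priv->descp.]
 *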
8579 */ 8580 if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) { 8581 DERR(vswp, "vsw_create_dring(%lld): dring info failed\n", 8582 ldcp->ldc_id); 8583 goto dring_fail_exit; 8584 } else { 8585 ASSERT(minfo.vaddr != 0); 8586 dp->pub_addr = minfo.vaddr; 8587 } 8588 8589 dp->num_descriptors = VSW_RING_NUM_EL; 8590 dp->descriptor_size = VSW_PUB_SIZE; 8591 dp->options = VIO_TX_DRING; 8592 dp->ncookies = 1; /* guaranteed by ldc */ 8593 8594 /* 8595 * create private portion of ring 8596 */ 8597 dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc( 8598 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP); 8599 8600 if (vsw_setup_ring(ldcp, dp)) { 8601 DERR(vswp, "%s: unable to setup ring", __func__); 8602 goto dring_fail_exit; 8603 } 8604 8605 /* haven't used any descriptors yet */ 8606 dp->end_idx = 0; 8607 dp->last_ack_recv = -1; 8608 8609 /* bind dring to the channel */ 8610 if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle, 8611 LDC_SHADOW_MAP, LDC_MEM_RW, 8612 &dp->cookie[0], &dp->ncookies)) != 0) { 8613 DERR(vswp, "vsw_create_dring: unable to bind to channel " 8614 "%lld", ldcp->ldc_id); 8615 goto dring_fail_exit; 8616 } 8617 8618 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 8619 dp->restart_reqd = B_TRUE; 8620 8621 /* 8622 * Only ever create rings for outgoing lane. Link it onto 8623 * end of list. 8624 */ 8625 WRITE_ENTER(&ldcp->lane_out.dlistrw); 8626 if (ldcp->lane_out.dringp == NULL) { 8627 D2(vswp, "vsw_create_dring: adding first outbound ring"); 8628 ldcp->lane_out.dringp = dp; 8629 } else { 8630 tp = ldcp->lane_out.dringp; 8631 while (tp->next != NULL) 8632 tp = tp->next; 8633 8634 tp->next = dp; 8635 } 8636 RW_EXIT(&ldcp->lane_out.dlistrw); 8637 8638 return (dp); 8639 8640 dring_fail_exit: 8641 (void) ldc_mem_dring_destroy(dp->handle); 8642 8643 create_fail_exit: 8644 if (dp->priv_addr != NULL) { 8645 priv_addr = dp->priv_addr; 8646 for (i = 0; i < VSW_RING_NUM_EL; i++) { 8647 if (priv_addr->memhandle != NULL) 8648 (void) ldc_mem_free_handle( 8649 priv_addr->memhandle); 8650 priv_addr++; 8651 } 8652 kmem_free(dp->priv_addr, 8653 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 8654 } 8655 mutex_destroy(&dp->dlock); 8656 8657 kmem_free(dp, sizeof (dring_info_t)); 8658 return (NULL); 8659 } 8660 8661 /* 8662 * Create a ring consisting of just a private portion and link 8663 * it into the list of rings for the outbound lane. 8664 * 8665 * These types of rings are used primarily for temporary data 8666 * storage (i.e. as data buffers). 8667 */ 8668 void 8669 vsw_create_privring(vsw_ldc_t *ldcp) 8670 { 8671 dring_info_t *dp, *tp; 8672 vsw_t *vswp = ldcp->ldc_vswp; 8673 8674 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 8675 8676 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 8677 8678 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 8679 8680 /* no public section */ 8681 dp->pub_addr = NULL; 8682 8683 dp->priv_addr = kmem_zalloc( 8684 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP); 8685 8686 dp->num_descriptors = VSW_RING_NUM_EL; 8687 8688 if (vsw_setup_ring(ldcp, dp)) { 8689 DERR(vswp, "%s: setup of ring failed", __func__); 8690 kmem_free(dp->priv_addr, 8691 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 8692 mutex_destroy(&dp->dlock); 8693 kmem_free(dp, sizeof (dring_info_t)); 8694 return; 8695 } 8696 8697 /* haven't used any descriptors yet */ 8698 dp->end_idx = 0; 8699 8700 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 8701 dp->restart_reqd = B_TRUE; 8702 8703 /* 8704 * Only ever create rings for outgoing lane.
Link it onto 8705 * end of list. 8706 */ 8707 WRITE_ENTER(&ldcp->lane_out.dlistrw); 8708 if (ldcp->lane_out.dringp == NULL) { 8709 D2(vswp, "%s: adding first outbound privring", __func__); 8710 ldcp->lane_out.dringp = dp; 8711 } else { 8712 tp = ldcp->lane_out.dringp; 8713 while (tp->next != NULL) 8714 tp = tp->next; 8715 8716 tp->next = dp; 8717 } 8718 RW_EXIT(&ldcp->lane_out.dlistrw); 8719 8720 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 8721 } 8722 8723 /* 8724 * Set up the descriptors in the dring. Returns 0 on success, 1 on 8725 * failure. 8726 */ 8727 int 8728 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp) 8729 { 8730 vnet_public_desc_t *pub_addr = NULL; 8731 vsw_private_desc_t *priv_addr = NULL; 8732 vsw_t *vswp = ldcp->ldc_vswp; 8733 uint64_t *tmpp; 8734 uint64_t offset = 0; 8735 uint32_t ncookies = 0; 8736 static char *name = "vsw_setup_ring"; 8737 int i, j, nc, rv; 8738 8739 priv_addr = dp->priv_addr; 8740 pub_addr = dp->pub_addr; 8741 8742 /* public section may be null but private should never be */ 8743 ASSERT(priv_addr != NULL); 8744 8745 /* 8746 * Allocate the region of memory which will be used to hold 8747 * the data the descriptors will refer to. 8748 */ 8749 dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ); 8750 dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP); 8751 8752 D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name, 8753 dp->data_sz, dp->data_addr); 8754 8755 tmpp = (uint64_t *)dp->data_addr; 8756 offset = VSW_RING_EL_DATA_SZ / sizeof (*tmpp); 8757 8758 /* 8759 * Initialise some of the private and public (if they exist) 8760 * descriptor fields. 8761 */ 8762 for (i = 0; i < VSW_RING_NUM_EL; i++) { 8763 mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL); 8764 8765 if ((ldc_mem_alloc_handle(ldcp->ldc_handle, 8766 &priv_addr->memhandle)) != 0) { 8767 DERR(vswp, "%s: alloc mem handle failed", name); 8768 goto setup_ring_cleanup; 8769 } 8770 8771 priv_addr->datap = (void *)tmpp; 8772 8773 rv = ldc_mem_bind_handle(priv_addr->memhandle, 8774 (caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ, 8775 LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W, 8776 &(priv_addr->memcookie[0]), &ncookies); 8777 if (rv != 0) { 8778 DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed " 8779 "(rv %d)", name, ldcp->ldc_id, rv); 8780 goto setup_ring_cleanup; 8781 } 8782 priv_addr->bound = 1; 8783 8784 D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx", 8785 name, i, priv_addr->memcookie[0].addr, 8786 priv_addr->memcookie[0].size); 8787 8788 if (ncookies > (uint32_t)VSW_MAX_COOKIES) { 8789 DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned " 8790 "invalid num of cookies (%d) for size 0x%llx", 8791 name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ); 8792 8793 goto setup_ring_cleanup; 8794 } else { 8795 for (j = 1; j < ncookies; j++) { 8796 rv = ldc_mem_nextcookie(priv_addr->memhandle, 8797 &(priv_addr->memcookie[j])); 8798 if (rv != 0) { 8799 DERR(vswp, "%s: ldc_mem_nextcookie " 8800 "failed rv (%d)", name, rv); 8801 goto setup_ring_cleanup; 8802 } 8803 D3(vswp, "%s: memcookie %d : addr 0x%llx : " 8804 "size 0x%llx", name, j, 8805 priv_addr->memcookie[j].addr, 8806 priv_addr->memcookie[j].size); 8807 } 8808 8809 } 8810 priv_addr->ncookies = ncookies; 8811 priv_addr->dstate = VIO_DESC_FREE; 8812 8813 if (pub_addr != NULL) { 8814 8815 /* link pub and private sides */ 8816 priv_addr->descp = pub_addr; 8817 8818 pub_addr->ncookies = priv_addr->ncookies; 8819 8820 for (nc = 0; nc < pub_addr->ncookies; nc++) { 8821 bcopy(&priv_addr->memcookie[nc], 8822 &pub_addr->memcookie[nc],
8823 sizeof (ldc_mem_cookie_t)); 8824 } 8825 8826 pub_addr->hdr.dstate = VIO_DESC_FREE; 8827 pub_addr++; 8828 } 8829 8830 /* 8831 * move to next element in the dring and the next 8832 * position in the data buffer. 8833 */ 8834 priv_addr++; 8835 tmpp += offset; 8836 } 8837 8838 return (0); 8839 8840 setup_ring_cleanup: 8841 priv_addr = dp->priv_addr; 8842 8843 for (j = 0; j < i; j++) { 8844 (void) ldc_mem_unbind_handle(priv_addr->memhandle); 8845 (void) ldc_mem_free_handle(priv_addr->memhandle); 8846 priv_addr->memhandle = NULL; /* prevent double free in caller */ 8847 mutex_destroy(&priv_addr->dstate_lock); 8848 8849 priv_addr++; 8850 } 8851 kmem_free(dp->data_addr, dp->data_sz); 8852 8853 return (1); 8854 } 8855 8856 /* 8857 * Searches the private section of a ring for a free descriptor, 8858 * starting at the location of the last free descriptor found 8859 * previously. 8860 * 8861 * Returns 0 if free descriptor is available, and updates state 8862 * of private descriptor to VIO_DESC_READY, otherwise returns 1. 8863 * 8864 * FUTURE: might need to return contiguous range of descriptors 8865 * as dring info msg assumes all will be contiguous. 8866 */ 8867 static int 8868 vsw_dring_find_free_desc(dring_info_t *dringp, 8869 vsw_private_desc_t **priv_p, int *idx) 8870 { 8871 vsw_private_desc_t *addr = NULL; 8872 int num = VSW_RING_NUM_EL; 8873 int ret = 1; 8874 8875 D1(NULL, "%s enter\n", __func__); 8876 8877 ASSERT(dringp->priv_addr != NULL); 8878 8879 D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld", 8880 __func__, dringp, dringp->end_idx); 8881 8882 addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx; 8883 8884 mutex_enter(&addr->dstate_lock); 8885 if (addr->dstate == VIO_DESC_FREE) { 8886 addr->dstate = VIO_DESC_READY; 8887 *priv_p = addr; 8888 *idx = dringp->end_idx; 8889 dringp->end_idx = (dringp->end_idx + 1) % num; 8890 ret = 0; 8891 8892 } 8893 mutex_exit(&addr->dstate_lock); 8894 8895 /* ring full */ 8896 if (ret == 1) { 8897 D2(NULL, "%s: no descriptors free: started at %d", __func__, 8898 dringp->end_idx); 8899 } 8900 8901 D1(NULL, "%s: exit\n", __func__); 8902 8903 return (ret); 8904 } 8905 8906 /* 8907 * Map from a dring identifier to the ring itself. Returns 8908 * pointer to ring or NULL if no match found. 8909 * 8910 * Should be called with dlistrw rwlock held as reader. 8911 */ 8912 static dring_info_t * 8913 vsw_ident2dring(lane_t *lane, uint64_t ident) 8914 { 8915 dring_info_t *dp = NULL; 8916 8917 if ((dp = lane->dringp) == NULL) { 8918 return (NULL); 8919 } else { 8920 if (dp->ident == ident) 8921 return (dp); 8922 8923 while (dp != NULL) { 8924 if (dp->ident == ident) 8925 break; 8926 dp = dp->next; 8927 } 8928 } 8929 8930 return (dp); 8931 } 8932 8933 /* 8934 * Set the default lane attributes. These are copied into 8935 * the attr msg we send to our peer. If they are not acceptable 8936 * then (currently) the handshake ends. 8937 */ 8938 static void 8939 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp) 8940 { 8941 bzero(lp, sizeof (lane_t)); 8942 8943 READ_ENTER(&vswp->if_lockrw); 8944 ether_copy(&(vswp->if_addr), &(lp->addr)); 8945 RW_EXIT(&vswp->if_lockrw); 8946 8947 lp->mtu = VSW_MTU; 8948 lp->addr_type = ADDR_TYPE_MAC; 8949 lp->xfer_mode = VIO_DRING_MODE; 8950 lp->ack_freq = 0; /* for shared mode */ 8951 8952 mutex_enter(&lp->seq_lock); 8953 lp->seq_num = VNET_ISS; 8954 mutex_exit(&lp->seq_lock); 8955 } 8956 8957 /* 8958 * Verify that the attributes are acceptable. 8959 * 8960 * FUTURE: If some attributes are not acceptable, change them to 8961 * our desired values.
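 *
 * [Illustrative sketch only, not part of the driver: the
 * handshake code is expected to apply this check when an
 * attribute-info message arrives, along the lines of:
 *
 *	vnet_attr_msg_t *attr_pkt = (vnet_attr_msg_t *)pkt;
 *
 *	if (vsw_check_attr(attr_pkt, ldcp->ldc_port) == 0)
 *		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
 *	else
 *		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
 *	then send attr_pkt back to the peer
 *
 * Field and constant names (vio_subtype, VIO_SUBTYPE_ACK/NACK,
 * ldc_port) follow the VIO mailbox conventions used elsewhere
 * in this file.]
 *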
8962 */ 8963 static int 8964 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port) 8965 { 8966 int ret = 0; 8967 8968 D1(NULL, "vsw_check_attr enter\n"); 8969 8970 /* 8971 * Note we currently only support in-band descriptors 8972 * and descriptor rings, not packet based transfer (VIO_PKT_MODE) 8973 */ 8974 if ((pkt->xfer_mode != VIO_DESC_MODE) && 8975 (pkt->xfer_mode != VIO_DRING_MODE)) { 8976 D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode); 8977 ret = 1; 8978 } 8979 8980 /* Only support MAC addresses at the moment. */ 8981 if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) { 8982 D2(NULL, "vsw_check_attr: invalid addr_type %x, " 8983 "or address 0x%llx\n", pkt->addr_type, pkt->addr); 8984 ret = 1; 8985 } 8986 8987 /* 8988 * MAC address supplied by device should match that stored 8989 * in the vsw-port OBP node. Need to decide what to do if they 8990 * don't match; for the moment just warn but don't fail. 8991 */ 8992 if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) { 8993 DERR(NULL, "vsw_check_attr: device supplied address " 8994 "0x%llx doesn't match node address 0x%llx\n", 8995 pkt->addr, port->p_macaddr); 8996 } 8997 8998 /* 8999 * Ack freq only makes sense in pkt mode, in shared 9000 * mode the ring descriptors say whether or not to 9001 * send back an ACK. 9002 */ 9003 if ((pkt->xfer_mode == VIO_DRING_MODE) && 9004 (pkt->ack_freq > 0)) { 9005 D2(NULL, "vsw_check_attr: non-zero ack freq " 9006 "in SHM mode\n"); 9007 ret = 1; 9008 } 9009 9010 /* 9011 * Note: for the moment we only support ETHER 9012 * frames. This may change in the future. 9013 */ 9014 if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) { 9015 D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n", 9016 pkt->mtu); 9017 ret = 1; 9018 } 9019 9020 D1(NULL, "vsw_check_attr exit\n"); 9021 9022 return (ret); 9023 } 9024 9025 /* 9026 * Returns 1 if there is a problem, 0 otherwise. 9027 */ 9028 static int 9029 vsw_check_dring_info(vio_dring_reg_msg_t *pkt) 9030 { 9031 9032 9033 int ret = 0; 9034 9035 D1(NULL, "vsw_check_dring_info enter\n"); 9036 9037 if ((pkt->num_descriptors == 0) || 9038 (pkt->descriptor_size == 0) || 9039 (pkt->ncookies != 1)) { 9040 DERR(NULL, "vsw_check_dring_info: invalid dring msg"); 9041 ret = 1; 9042 } 9043 9044 D1(NULL, "vsw_check_dring_info exit\n"); 9045 9046 return (ret); 9047 } 9048 9049 /* 9050 * Returns 1 if two memory cookies match. Otherwise returns 0. 9051 */ 9052 static int 9053 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2) 9054 { 9055 if ((m1->addr != m2->addr) || 9056 (m1->size != m2->size)) { 9057 return (0); 9058 } else { 9059 return (1); 9060 } 9061 } 9062 9063 /* 9064 * Returns 1 if ring described in reg message matches that 9065 * described by dring_info structure. Otherwise returns 0. 9066 */ 9067 static int 9068 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg) 9069 { 9070 if ((msg->descriptor_size != dp->descriptor_size) || 9071 (msg->num_descriptors != dp->num_descriptors) || 9072 (msg->ncookies != dp->ncookies) || 9073 !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) { 9074 return (0); 9075 } else { 9076 return (1); 9077 } 9078 9079 } 9080 9081 static caddr_t 9082 vsw_print_ethaddr(uint8_t *a, char *ebuf) 9083 { 9084 (void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x", 9085 a[0], a[1], a[2], a[3], a[4], a[5]); 9086 return (ebuf); 9087 } 9088 9089 /* 9090 * Reset and free all the resources associated with 9091 * the channel.
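 *
 * [Illustrative sketch only, not part of the driver: the checks
 * above combine naturally when a dring registration arrives.
 * With dlistrw held as reader, a handler might do:
 *
 *	if (vsw_check_dring_info(dring_pkt) != 0) {
 *		NACK the registration;
 *	} else if ((dp = vsw_ident2dring(&ldcp->lane_in,
 *	    dring_pkt->dring_ident)) != NULL &&
 *	    vsw_dring_match(dp, dring_pkt)) {
 *		already registered, so ACK without remapping;
 *	}
 *
 * where dring_pkt is the incoming vio_dring_reg_msg_t.]
 *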
9092 */ 9093 static void 9094 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir) 9095 { 9096 dring_info_t *dp, *dpp; 9097 lane_t *lp = NULL; 9098 int rv = 0; 9099 9100 ASSERT(ldcp != NULL); 9101 9102 D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id); 9103 9104 if (dir == INBOUND) { 9105 D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane" 9106 " of channel %lld", __func__, ldcp->ldc_id); 9107 lp = &ldcp->lane_in; 9108 } else { 9109 D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane" 9110 " of channel %lld", __func__, ldcp->ldc_id); 9111 lp = &ldcp->lane_out; 9112 } 9113 9114 lp->lstate = VSW_LANE_INACTIV; 9115 mutex_enter(&lp->seq_lock); 9116 lp->seq_num = VNET_ISS; 9117 mutex_exit(&lp->seq_lock); 9118 if (lp->dringp) { 9119 if (dir == INBOUND) { 9120 WRITE_ENTER(&lp->dlistrw); 9121 dp = lp->dringp; 9122 while (dp != NULL) { 9123 dpp = dp->next; 9124 if (dp->handle != NULL) 9125 (void) ldc_mem_dring_unmap(dp->handle); 9126 kmem_free(dp, sizeof (dring_info_t)); 9127 dp = dpp; 9128 } 9129 RW_EXIT(&lp->dlistrw); 9130 } else { 9131 /* 9132 * unbind, destroy exported dring, free dring struct 9133 */ 9134 WRITE_ENTER(&lp->dlistrw); 9135 dp = lp->dringp; 9136 rv = vsw_free_ring(dp); 9137 RW_EXIT(&lp->dlistrw); 9138 } 9139 if (rv == 0) { 9140 lp->dringp = NULL; 9141 } 9142 } 9143 9144 D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id); 9145 } 9146 9147 /* 9148 * Free ring and all associated resources. 9149 * 9150 * Should be called with dlistrw rwlock held as writer. 9151 */ 9152 static int 9153 vsw_free_ring(dring_info_t *dp) 9154 { 9155 vsw_private_desc_t *paddr = NULL; 9156 dring_info_t *dpp; 9157 int i, rv = 1; 9158 9159 while (dp != NULL) { 9160 mutex_enter(&dp->dlock); 9161 dpp = dp->next; 9162 if (dp->priv_addr != NULL) { 9163 /* 9164 * First unbind and free the memory handles 9165 * stored in each descriptor within the ring. 9166 */ 9167 for (i = 0; i < VSW_RING_NUM_EL; i++) { 9168 paddr = (vsw_private_desc_t *) 9169 dp->priv_addr + i; 9170 if (paddr->memhandle != NULL) { 9171 if (paddr->bound == 1) { 9172 rv = ldc_mem_unbind_handle( 9173 paddr->memhandle); 9174 9175 if (rv != 0) { 9176 DERR(NULL, "error " 9177 "unbinding handle for " 9178 "ring 0x%llx at pos %d", 9179 dp, i); 9180 mutex_exit(&dp->dlock); 9181 return (rv); 9182 } 9183 paddr->bound = 0; 9184 } 9185 9186 rv = ldc_mem_free_handle( 9187 paddr->memhandle); 9188 if (rv != 0) { 9189 DERR(NULL, "error freeing " 9190 "handle for ring 0x%llx " 9191 "at pos %d", dp, i); 9192 mutex_exit(&dp->dlock); 9193 return (rv); 9194 } 9195 paddr->memhandle = NULL; 9196 } 9197 mutex_destroy(&paddr->dstate_lock); 9198 } 9199 kmem_free(dp->priv_addr, 9200 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 9201 } 9202 9203 /* 9204 * Now unbind and destroy the ring itself. 
9205 */ 9206 if (dp->handle != NULL) { 9207 (void) ldc_mem_dring_unbind(dp->handle); 9208 (void) ldc_mem_dring_destroy(dp->handle); 9209 } 9210 9211 if (dp->data_addr != NULL) { 9212 kmem_free(dp->data_addr, dp->data_sz); 9213 } 9214 9215 mutex_exit(&dp->dlock); 9216 mutex_destroy(&dp->dlock); 9217 mutex_destroy(&dp->restart_lock); 9218 kmem_free(dp, sizeof (dring_info_t)); 9219 9220 dp = dpp; 9221 } 9222 return (0); 9223 } 9224 9225 /* 9226 * Debugging routines 9227 */ 9228 static void 9229 display_state(void) 9230 { 9231 vsw_t *vswp; 9232 vsw_port_list_t *plist; 9233 vsw_port_t *port; 9234 vsw_ldc_list_t *ldcl; 9235 vsw_ldc_t *ldcp; 9236 9237 cmn_err(CE_NOTE, "***** system state *****"); 9238 9239 for (vswp = vsw_head; vswp; vswp = vswp->next) { 9240 plist = &vswp->plist; 9241 READ_ENTER(&plist->lockrw); 9242 cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n", 9243 vswp->instance, plist->num_ports); 9244 9245 for (port = plist->head; port != NULL; port = port->p_next) { 9246 ldcl = &port->p_ldclist; 9247 cmn_err(CE_CONT, "port %d : %d ldcs attached\n", 9248 port->p_instance, ldcl->num_ldcs); 9249 READ_ENTER(&ldcl->lockrw); 9250 ldcp = ldcl->head; 9251 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 9252 cmn_err(CE_CONT, "chan %lu : dev %d : " 9253 "status %d : phase %u\n", 9254 ldcp->ldc_id, ldcp->dev_class, 9255 ldcp->ldc_status, ldcp->hphase); 9256 cmn_err(CE_CONT, "chan %lu : lsession %lu : " 9257 "psession %lu\n", ldcp->ldc_id, 9258 ldcp->local_session, ldcp->peer_session); 9259 9260 cmn_err(CE_CONT, "Inbound lane:\n"); 9261 display_lane(&ldcp->lane_in); 9262 cmn_err(CE_CONT, "Outbound lane:\n"); 9263 display_lane(&ldcp->lane_out); 9264 } 9265 RW_EXIT(&ldcl->lockrw); 9266 } 9267 RW_EXIT(&plist->lockrw); 9268 } 9269 cmn_err(CE_NOTE, "***** system state *****"); 9270 } 9271 9272 static void 9273 display_lane(lane_t *lp) 9274 { 9275 dring_info_t *drp; 9276 9277 cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n", 9278 lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu); 9279 cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n", 9280 lp->addr_type, lp->addr, lp->xfer_mode); 9281 cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp); 9282 9283 cmn_err(CE_CONT, "Dring info:\n"); 9284 for (drp = lp->dringp; drp != NULL; drp = drp->next) { 9285 cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n", 9286 drp->num_descriptors, drp->descriptor_size); 9287 cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle); 9288 cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n", 9289 (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr); 9290 cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n", 9291 drp->ident, drp->end_idx); 9292 display_ring(drp); 9293 } 9294 } 9295 9296 static void 9297 display_ring(dring_info_t *dringp) 9298 { 9299 uint64_t i; 9300 uint64_t priv_count = 0; 9301 uint64_t pub_count = 0; 9302 vnet_public_desc_t *pub_addr = NULL; 9303 vsw_private_desc_t *priv_addr = NULL; 9304 9305 for (i = 0; i < VSW_RING_NUM_EL; i++) { 9306 if (dringp->pub_addr != NULL) { 9307 pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i; 9308 9309 if (pub_addr->hdr.dstate == VIO_DESC_FREE) 9310 pub_count++; 9311 } 9312 9313 if (dringp->priv_addr != NULL) { 9314 priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i; 9315 9316 if (priv_addr->dstate == VIO_DESC_FREE) 9317 priv_count++; 9318 } 9319 } 9320 cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n", 9321 i, priv_count, pub_count); 9322 } 9323 9324 static void 9325 dump_flags(uint64_t state) 9326 { 9327 int i; 9328 9329 typedef struct 
flag_name { 9330 int flag_val; 9331 char *flag_name; 9332 } flag_name_t; 9333 9334 flag_name_t flags[] = { 9335 VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT", 9336 VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV", 9337 VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV", 9338 VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT", 9339 VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV", 9340 VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT", 9341 VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT", 9342 VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV", 9343 VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT", 9344 VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV", 9345 VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT", 9346 VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV", 9347 VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT", 9348 VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV", 9349 VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT", 9350 VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV", 9351 VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT", 9352 VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV", 9353 VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT", 9354 VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV", 9355 VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT", 9356 VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV", 9357 VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT", 9358 VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV", 9359 VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT", 9360 VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV", 9361 VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT", 9362 VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV", 9363 VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT", 9364 VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV", 9365 VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"}; 9366 9367 DERR(NULL, "DUMP_FLAGS: %llx\n", state); 9368 for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) { 9369 if (state & flags[i].flag_val) 9370 DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name); 9371 } 9372 } 9373
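/*
 * [Illustrative note, not part of the driver: the debug routines
 * above are intended for manual invocation, e.g. from kmdb via
 * ::call display_state, or from temporary instrumentation. A
 * typical use of dump_flags() is to decode the lstate word of a
 * lane, for example:
 *
 *	dump_flags(ldcp->lane_out.lstate);
 *
 * which prints one line per handshake flag currently set.]
 */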