/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strlog.h>
#include <sys/strsubr.h>
#include <sys/cmn_err.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/kstat.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <net/if.h>
#include <sys/varargs.h>
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/mac.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
#include <sys/vsw.h>
#include <sys/vio_mailbox.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/vio_util.h>
#include <sys/sdt.h>

/*
 * Function prototypes.
 */
static int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
static int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
static int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static void vsw_get_md_properties(vsw_t *vswp);
static int vsw_get_physaddr(vsw_t *);
static int vsw_setup_layer2(vsw_t *);
static int vsw_setup_layer3(vsw_t *);

/* MAC Ring table functions. */
static void vsw_mac_ring_tbl_init(vsw_t *vswp);
static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
static void vsw_queue_worker(vsw_mac_ring_t *rrp);
static void vsw_queue_stop(vsw_queue_t *vqp);
static vsw_queue_t *vsw_queue_create(void);
static void vsw_queue_destroy(vsw_queue_t *vqp);

/* MAC layer routines */
static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
    mac_resource_t *mrp);
static int vsw_get_hw_maddr(vsw_t *);
static int vsw_set_hw(vsw_t *, vsw_port_t *);
static int vsw_set_hw_promisc(vsw_t *, vsw_port_t *);
static int vsw_unset_hw(vsw_t *, vsw_port_t *);
static int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *);
static int vsw_reconfig_hw(vsw_t *);
static int vsw_mac_attach(vsw_t *vswp);
static void vsw_mac_detach(vsw_t *vswp);

static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
static int vsw_mac_register(vsw_t *);
static int vsw_mac_unregister(vsw_t *);
static int vsw_m_stat(void *, uint_t, uint64_t *);
static void vsw_m_stop(void *arg);
static int vsw_m_start(void *arg);
static int vsw_m_unicst(void *arg, const uint8_t *);
static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
static int vsw_m_promisc(void *arg, boolean_t);
static mblk_t *vsw_m_tx(void *arg, mblk_t *);

/* MDEG routines */
static void vsw_mdeg_register(vsw_t *vswp);
static void vsw_mdeg_unregister(vsw_t *vswp);
static int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);

/* Port add/deletion routines */
static int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
static int vsw_port_attach(vsw_t *vswp, int p_instance,
    uint64_t *ldcids, int nids, struct ether_addr *macaddr);
static int vsw_detach_ports(vsw_t *vswp);
static int vsw_port_detach(vsw_t *vswp, int p_instance);
static int vsw_port_delete(vsw_port_t *port);
static int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
static int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
static int vsw_init_ldcs(vsw_port_t *port);
static int vsw_uninit_ldcs(vsw_port_t *port);
static int vsw_ldc_init(vsw_ldc_t *ldcp);
static int vsw_ldc_uninit(vsw_ldc_t *ldcp);
static int vsw_drain_ldcs(vsw_port_t *port);
static int vsw_drain_port_taskq(vsw_port_t *port);
static void vsw_marker_task(void *);
static vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
static int vsw_plist_del_node(vsw_t *, vsw_port_t *port);

/* Interrupt routines */
static uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);

/* Handshake routines */
static void vsw_restart_ldc(vsw_ldc_t *);
static void vsw_restart_handshake(vsw_ldc_t *);
static void vsw_handle_reset(vsw_ldc_t *);
static int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
static void vsw_next_milestone(vsw_ldc_t *);
static int vsw_supported_version(vio_ver_msg_t *);

/* Data processing routines */
static void vsw_process_pkt(void *);
static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_ctrl_pkt(void *);
static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);

/* Switching/data transmit routines */
static void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port, mac_resource_handle_t);
static void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port, mac_resource_handle_t);
static int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port);
static int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port);
static int vsw_portsend(vsw_port_t *, mblk_t *);
static int vsw_dringsend(vsw_ldc_t *, mblk_t *);
static int vsw_descrsend(vsw_ldc_t *, mblk_t *);

/* Packet creation routines */
static void vsw_send_ver(void *);
static void vsw_send_attr(vsw_ldc_t *);
static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
static void vsw_send_dring_info(vsw_ldc_t *);
static void vsw_send_rdx(vsw_ldc_t *);

static void vsw_send_msg(vsw_ldc_t *, void *, int);

/* Forwarding database (FDB) routines */
static int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
static int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
static vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
static int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
static int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
static int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
static void vsw_del_addr(uint8_t, void *, uint64_t);
static void vsw_del_mcst_port(vsw_port_t *);
static void vsw_del_mcst_vsw(vsw_t *);

/* Dring routines */
static dring_info_t *vsw_create_dring(vsw_ldc_t *);
static void vsw_create_privring(vsw_ldc_t *);
static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
    int *);
static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);

static void vsw_set_lane_attr(vsw_t *, lane_t *);
static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
static int vsw_check_dring_info(vio_dring_reg_msg_t *);

/* Misc support routines */
static caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
static int vsw_free_ring(dring_info_t *);


/* Debugging routines */
static void dump_flags(uint64_t);
static void display_state(void);
static void display_lane(lane_t *);
static void display_ring(dring_info_t *);

int	vsw_num_handshakes = 3;		/* # of handshake attempts */
int	vsw_wretries = 100;		/* # of write attempts */
int	vsw_chain_len = 150;		/* max # of mblks in msg chain */
int	vsw_desc_delay = 0;		/* delay in us */
int	vsw_read_attempts = 5;		/* # of reads of descriptor */

uint32_t	vsw_mblk_size = VSW_MBLK_SIZE;
uint32_t	vsw_num_mblks = VSW_NUM_MBLKS;


/*
 * Mode specific frame switching function.
 */
void (*vsw_switch_frame)(vsw_t *, mblk_t *, int, vsw_port_t *,
    mac_resource_handle_t);

static mac_callbacks_t vsw_m_callbacks = {
	0,
	vsw_m_stat,
	vsw_m_start,
	vsw_m_stop,
	vsw_m_promisc,
	vsw_m_multicst,
	vsw_m_unicst,
	vsw_m_tx,
	NULL,
	NULL,
	NULL
};

static struct cb_ops vsw_cb_ops = {
	nulldev,			/* cb_open */
	nulldev,			/* cb_close */
	nodev,				/* cb_strategy */
	nodev,				/* cb_print */
	nodev,				/* cb_dump */
	nodev,				/* cb_read */
	nodev,				/* cb_write */
	nodev,				/* cb_ioctl */
	nodev,				/* cb_devmap */
	nodev,				/* cb_mmap */
	nodev,				/* cb_segmap */
	nochpoll,			/* cb_chpoll */
	ddi_prop_op,			/* cb_prop_op */
	NULL,				/* cb_stream */
	D_MP,				/* cb_flag */
	CB_REV,				/* rev */
	nodev,				/* int (*cb_aread)() */
	nodev				/* int (*cb_awrite)() */
};

static struct dev_ops vsw_ops = {
	DEVO_REV,			/* devo_rev */
	0,				/* devo_refcnt */
	vsw_getinfo,			/* devo_getinfo */
	nulldev,			/* devo_identify */
	nulldev,			/* devo_probe */
	vsw_attach,			/* devo_attach */
	vsw_detach,			/* devo_detach */
	nodev,				/* devo_reset */
	&vsw_cb_ops,			/* devo_cb_ops */
	(struct bus_ops *)NULL,		/* devo_bus_ops */
	ddi_power			/* devo_power */
};

extern struct mod_ops mod_driverops;
static struct modldrv vswmodldrv = {
	&mod_driverops,
	"sun4v Virtual Switch Driver %I%",
	&vsw_ops,
};
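
/*
 * Lock ordering: the LDC callback lock is always acquired before the
 * transmit lock, and the two are released in the reverse order.
 */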
#define	LDC_ENTER_LOCK(ldcp)	\
				mutex_enter(&((ldcp)->ldc_cblock));\
				mutex_enter(&((ldcp)->ldc_txlock));
#define	LDC_EXIT_LOCK(ldcp)	\
				mutex_exit(&((ldcp)->ldc_txlock));\
				mutex_exit(&((ldcp)->ldc_cblock));

/* Driver soft state ptr */
static void	*vsw_state;

/*
 * Linked list of "vsw_t" structures - one per instance.
 */
vsw_t		*vsw_head = NULL;
krwlock_t	vsw_rw;

/*
 * Property names
 */
static char vdev_propname[] = "virtual-device";
static char vsw_propname[] = "virtual-network-switch";
static char physdev_propname[] = "vsw-phys-dev";
static char smode_propname[] = "vsw-switch-mode";
static char macaddr_propname[] = "local-mac-address";
static char remaddr_propname[] = "remote-mac-address";
static char ldcids_propname[] = "ldc-ids";
static char chan_propname[] = "channel-endpoint";
static char id_propname[] = "id";
static char reg_propname[] = "reg";

/* supported versions */
static ver_sup_t vsw_versions[] = { {1, 0} };

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t vport_prop_match[] = {
	{ MDET_PROP_VAL,	"id" },
	{ MDET_LIST_END,	NULL }
};

static mdeg_node_match_t vport_match = { "virtual-device-port",
					vport_prop_match };

/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vsw instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t vsw_prop_template[] = {
	{ MDET_PROP_STR,	"name",		vsw_propname },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL,		NULL }
};

#define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);

/*
 * From /etc/system enable/disable thread per ring. This is a mode
 * selection that is done at vsw driver attach time.
 */
boolean_t vsw_multi_ring_enable = B_FALSE;
int vsw_mac_rx_rings = VSW_MAC_RX_RINGS;

/*
 * Print debug messages - set to 0x1f to enable all msgs
 * or 0x0 to turn all off.
 */
int vswdbg = 0x0;

/*
 * debug levels:
 * 0x01:	Function entry/exit tracing
 * 0x02:	Internal function messages
 * 0x04:	Verbose internal messages
 * 0x08:	Warning messages
 * 0x10:	Error messages
 */

static void
vswdebug(vsw_t *vswp, const char *fmt, ...)
{
	char buf[512];
	va_list ap;

	va_start(ap, fmt);
	(void) vsprintf(buf, fmt, ap);
	va_end(ap);

	if (vswp == NULL)
		cmn_err(CE_CONT, "%s\n", buf);
	else
		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
}

/*
 * For the moment the state dump routines have their own
 * private flag.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

#define	DUMP_TAG(tag) \
{ \
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype); \
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env); \
}

#define	DUMP_TAG_PTR(tag) \
{ \
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype); \
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env); \
}

#define	DUMP_FLAGS(flags)	dump_flags(flags);
#define	DISPLAY_STATE()		display_state()

#else

#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */

#ifdef DEBUG

#define	D1		\
if (vswdbg & 0x01)	\
	vswdebug

#define	D2		\
if (vswdbg & 0x02)	\
	vswdebug

#define	D3		\
if (vswdbg & 0x04)	\
	vswdebug

#define	DWARN		\
if (vswdbg & 0x08)	\
	vswdebug

#define	DERR		\
if (vswdbg & 0x10)	\
	vswdebug

#else

#define	DERR	if (0)	vswdebug
#define	DWARN	if (0)	vswdebug
#define	D1	if (0)	vswdebug
#define	D2	if (0)	vswdebug
#define	D3	if (0)	vswdebug

#endif	/* DEBUG */

static struct modlinkage modlinkage = {
	MODREV_1,
	&vswmodldrv,
	NULL
};

int
_init(void)
{
	int status;

	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);

	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
	if (status != 0) {
		return (status);
	}

	mac_init_ops(&vsw_ops, "vsw");
	status = mod_install(&modlinkage);
	if (status != 0) {
		ddi_soft_state_fini(&vsw_state);
	}
	return (status);
}

int
_fini(void)
{
	int status;

	status = mod_remove(&modlinkage);
	if (status != 0)
		return (status);
	mac_fini_ops(&vsw_ops);
	ddi_soft_state_fini(&vsw_state);

	rw_destroy(&vsw_rw);

	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
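
/*
 * Attach tracks how far initialization has got via the 'progress'
 * bitmask, so that the vsw_attach_fail path only tears down the
 * resources that were actually set up.
 */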
static int
vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	vsw_t		*vswp;
	int		instance, i;
	char		hashname[MAXNAMELEN];
	char		qname[TASKQ_NAMELEN];
	int		rv = 1;
	enum		{ PROG_init = 0x00,
			    PROG_if_lock = 0x01,
			    PROG_fdb = 0x02,
			    PROG_mfdb = 0x04,
			    PROG_report_dev = 0x08,
			    PROG_plist = 0x10,
			    PROG_taskq = 0x20}
			progress;

	progress = PROG_init;

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	case DDI_PM_RESUME:
	default:
		return (DDI_FAILURE);
	}

	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
		return (DDI_FAILURE);
	}
	vswp = ddi_get_soft_state(vsw_state, instance);

	if (vswp == NULL) {
		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
		goto vsw_attach_fail;
	}

	vswp->dip = dip;
	vswp->instance = instance;
	ddi_set_driver_private(dip, (caddr_t)vswp);

	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
	progress |= PROG_if_lock;

	/*
	 * Get the various properties such as physical device name
	 * (vsw-phys-dev), switch mode, etc. from the MD.
	 */
	vsw_get_md_properties(vswp);

	/* setup the unicast forwarding database */
	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
	    vswp->instance);
	D2(vswp, "creating unicast hash table (%s)...", hashname);
	vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
	    mod_hash_null_valdtor, sizeof (void *));

	progress |= PROG_fdb;

	/* setup the multicast forwarding database */
	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
	    vswp->instance);
	D2(vswp, "creating multicast hash table (%s)...", hashname);
	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
	vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
	    mod_hash_null_valdtor, sizeof (void *));

	progress |= PROG_mfdb;

	/*
	 * Create the lock protecting the list of multicast addresses
	 * which could be added via the m_multicst() entry point when
	 * plumbed.
	 */
	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
	vswp->mcap = NULL;

	ddi_report_dev(vswp->dip);

	progress |= PROG_report_dev;

	WRITE_ENTER(&vsw_rw);
	vswp->next = vsw_head;
	vsw_head = vswp;
	RW_EXIT(&vsw_rw);

	/* setup the port list */
	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
	vswp->plist.head = NULL;

	progress |= PROG_plist;

	/*
	 * Create the taskq which will process all the VIO
	 * control messages.
	 */
	(void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
	    TASKQ_DEFAULTPRI, 0)) == NULL) {
		cmn_err(CE_WARN, "Unable to create task queue");
		goto vsw_attach_fail;
	}

	progress |= PROG_taskq;

	/* select best switching mode */
	for (i = 0; i < vswp->smode_num; i++) {
		vswp->smode_idx = i;
		switch (vswp->smode[i]) {
		case VSW_LAYER2:
		case VSW_LAYER2_PROMISC:
			rv = vsw_setup_layer2(vswp);
			break;

		case VSW_LAYER3:
			rv = vsw_setup_layer3(vswp);
			break;

		default:
			DERR(vswp, "unknown switch mode");
			rv = 1;
			break;
		}

		if (rv == 0)
			break;
	}

	if (rv == 1) {
		cmn_err(CE_WARN, "Unable to setup switching mode");
		goto vsw_attach_fail;
	}

	D2(vswp, "Operating in mode %d", vswp->smode[vswp->smode_idx]);

	/*
	 * Register with the MAC layer as a network device so
	 * we can be plumbed if desired.
	 *
	 * Do this in both layer 2 and layer 3 mode.
	 */
	vswp->if_state &= ~VSW_IF_UP;
	if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
		if (vsw_mac_register(vswp) != 0) {
			cmn_err(CE_WARN, "Unable to register as provider "
			    "with MAC layer, continuing with attach");
		}
	}

	/* prevent auto-detaching */
	if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
	    DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "Unable to set \"%s\" property for "
		    "instance %u", DDI_NO_AUTODETACH, instance);
	}

	/*
	 * Now we have everything set up, register for MD change
	 * events.
	 */
	vsw_mdeg_register(vswp);

	return (DDI_SUCCESS);

vsw_attach_fail:
	DERR(NULL, "vsw_attach: failed");

	if (progress & PROG_taskq)
		ddi_taskq_destroy(vswp->taskq_p);

	if (progress & PROG_plist)
		rw_destroy(&vswp->plist.lockrw);

	if (progress & PROG_report_dev) {
		ddi_remove_minor_node(dip, NULL);
		mutex_destroy(&vswp->mca_lock);
	}

	if (progress & PROG_mfdb) {
		mod_hash_destroy_hash(vswp->mfdb);
		vswp->mfdb = NULL;
		rw_destroy(&vswp->mfdbrw);
	}

	if (progress & PROG_fdb) {
		mod_hash_destroy_hash(vswp->fdb);
		vswp->fdb = NULL;
	}

	if (progress & PROG_if_lock)
		rw_destroy(&vswp->if_lockrw);

	ddi_soft_state_free(vsw_state, instance);
	return (DDI_FAILURE);
}

static int
vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	vio_mblk_pool_t		*poolp, *npoolp;
	vsw_t			**vswpp, *vswp;
	int			instance;

	instance = ddi_get_instance(dip);
	vswp = ddi_get_soft_state(vsw_state, instance);

	if (vswp == NULL) {
		return (DDI_FAILURE);
	}

	switch (cmd) {
	case DDI_DETACH:
		break;
	case DDI_SUSPEND:
	case DDI_PM_SUSPEND:
	default:
		return (DDI_FAILURE);
	}

	D2(vswp, "detaching instance %d", instance);

	if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
		if (vsw_mac_unregister(vswp) != 0) {
			cmn_err(CE_WARN, "Unable to detach from MAC layer");
			return (DDI_FAILURE);
		}
		rw_destroy(&vswp->if_lockrw);
	}

	vsw_mdeg_unregister(vswp);

	/* remove mac layer callback */
	if ((vswp->mh != NULL) && (vswp->mrh != NULL)) {
		mac_rx_remove(vswp->mh, vswp->mrh);
		vswp->mrh = NULL;
	}

	if (vsw_detach_ports(vswp) != 0) {
		cmn_err(CE_WARN, "Unable to detach ports");
		return (DDI_FAILURE);
	}

	/*
	 * Now that the ports have been deleted, stop and close
	 * the physical device.
	 */
	if (vswp->mh != NULL) {
		if (vswp->mstarted)
			mac_stop(vswp->mh);
		if (vswp->mresources)
			mac_resource_set(vswp->mh, NULL, NULL);
		mac_close(vswp->mh);

		vswp->mh = NULL;
		vswp->txinfo = NULL;
	}

	/*
	 * Destroy any free pools that may still exist.
	 */
	poolp = vswp->rxh;
	while (poolp != NULL) {
		npoolp = vswp->rxh = poolp->nextp;
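		/*
		 * If the pool cannot be destroyed yet (e.g. its mblks
		 * are still in use), put it back at the head of the
		 * list and fail the detach.
		 */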
		if (vio_destroy_mblks(poolp) != 0) {
			vswp->rxh = poolp;
			return (DDI_FAILURE);
		}
		poolp = npoolp;
	}

	/*
	 * Remove this instance from any entries it may be on in
	 * the hash table by using the list of addresses maintained
	 * in the vsw_t structure.
	 */
	vsw_del_mcst_vsw(vswp);

	vswp->mcap = NULL;
	mutex_destroy(&vswp->mca_lock);

	/*
	 * By now any pending tasks have finished and the underlying
	 * ldc's have been destroyed, so it's safe to delete the control
	 * message taskq.
	 */
	if (vswp->taskq_p != NULL)
		ddi_taskq_destroy(vswp->taskq_p);

	/*
	 * At this stage all the data pointers in the hash table
	 * should be NULL, as all the ports have been removed and will
	 * have deleted themselves from the port lists which the data
	 * pointers point to. Hence we can destroy the table using the
	 * default destructors.
	 */
	D2(vswp, "vsw_detach: destroying hash tables..");
	mod_hash_destroy_hash(vswp->fdb);
	vswp->fdb = NULL;

	WRITE_ENTER(&vswp->mfdbrw);
	mod_hash_destroy_hash(vswp->mfdb);
	vswp->mfdb = NULL;
	RW_EXIT(&vswp->mfdbrw);
	rw_destroy(&vswp->mfdbrw);

	ddi_remove_minor_node(dip, NULL);

	rw_destroy(&vswp->plist.lockrw);
	WRITE_ENTER(&vsw_rw);
	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
		if (*vswpp == vswp) {
			*vswpp = vswp->next;
			break;
		}
	}
	RW_EXIT(&vsw_rw);
	ddi_soft_state_free(vsw_state, instance);

	return (DDI_SUCCESS);
}

static int
vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	_NOTE(ARGUNUSED(dip))

	vsw_t	*vswp = NULL;
	dev_t	dev = (dev_t)arg;
	int	instance;

	instance = getminor(dev);

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
			*result = NULL;
			return (DDI_FAILURE);
		}
		*result = vswp->dip;
		return (DDI_SUCCESS);

	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)(uintptr_t)instance;
		return (DDI_SUCCESS);

	default:
		*result = NULL;
		return (DDI_FAILURE);
	}
}

/*
 * Get the properties from our MD node.
 */
static void
vsw_get_md_properties(vsw_t *vswp)
{
	md_t		*mdp = NULL;
	int		num_nodes = 0;
	int		len = 0, listsz = 0;
	int		num_vdev = 0;
	int		i, idx;
	boolean_t	found_node = B_FALSE;
	char		*smode = NULL;
	char		*curr_mode = NULL;
	char		*physname = NULL;
	char		*node_name = NULL;
	char		*dev;
	uint64_t	macaddr = 0;
	uint64_t	md_inst, obp_inst;
	mde_cookie_t	*listp = NULL;
	mde_cookie_t	rootnode;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Further down we compare the obp 'reg' property to the
	 * 'cfg-handle' property in the vsw MD node to determine
	 * if the node refers to this particular instance. So if
	 * we can't read the obp value then there is no point
	 * in proceeding further.
	 */
	if (ddi_prop_exists(DDI_DEV_T_ANY, vswp->dip,
	    DDI_PROP_DONTPASS, reg_propname) != 1) {
		cmn_err(CE_WARN, "Unable to read %s property "
		    "from OBP device node", reg_propname);
		return;
	}

	obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
	    DDI_PROP_DONTPASS, reg_propname, 0);

	D2(vswp, "%s: obp_inst 0x%llx", __func__, obp_inst);

	if ((mdp = md_get_handle()) == NULL) {
		DERR(vswp, "%s: unable to init MD", __func__);
		return;
	}

	if ((num_nodes = md_node_count(mdp)) <= 0) {
		DERR(vswp, "%s: invalid number of nodes found %d",
		    __func__, num_nodes);
		(void) md_fini_handle(mdp);
		return;
	}

	D2(vswp, "%s: %d nodes in total in MD", __func__, num_nodes);

	/* allocate enough space for node list */
	listsz = num_nodes * sizeof (mde_cookie_t);
	listp = kmem_zalloc(listsz, KM_SLEEP);

	rootnode = md_root_node(mdp);

	/* Get the list of virtual devices */
	num_vdev = md_scan_dag(mdp, rootnode,
	    md_find_name(mdp, vdev_propname),
	    md_find_name(mdp, "fwd"), listp);

	if (num_vdev <= 0) {
		DERR(vswp, "%s: didn't find any virtual-device nodes in MD",
		    __func__);
		goto md_prop_exit;
	}

	D2(vswp, "%s: %d virtual-device nodes found", __func__, num_vdev);

	/* Look for the virtual switch nodes in the list */
	for (idx = 0; idx < num_vdev; idx++) {
		if (md_get_prop_str(mdp, listp[idx],
		    "name", &node_name) != 0) {
			DERR(vswp, "%s: unable to get node name", __func__);
			continue;
		}

		if (strcmp(node_name, vsw_propname) == 0) {
			/* Virtual switch node */
			if (md_get_prop_val(mdp, listp[idx],
			    "cfg-handle", &md_inst) != 0) {
				DERR(vswp, "%s: unable to get cfg-handle from"
				    " node %d", __func__, idx);
				goto md_prop_exit;
			} else if (md_inst == obp_inst) {
				D2(vswp, "%s: found matching node (%d)"
				    " 0x%llx == 0x%llx", __func__, idx,
				    md_inst, obp_inst);
				found_node = B_TRUE;
				break;
			}
		}
	}

	if (!found_node) {
		DWARN(vswp, "%s: couldn't find correct vsw node", __func__);
		goto md_prop_exit;
	}

	/*
	 * Now, having found the correct node, get the various properties.
	 */

	if (md_get_prop_data(mdp, listp[idx], physdev_propname,
	    (uint8_t **)(&physname), &len) != 0) {
		cmn_err(CE_WARN, "%s: unable to get name(s) of physical "
		    "device(s) from MD", __func__);
	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
		cmn_err(CE_WARN, "%s is too long a device name", physname);
	} else {
		(void) strncpy(vswp->physname, physname, strlen(physname) + 1);
		vswp->mdprops |= VSW_MD_PHYSNAME;
		D2(vswp, "%s: using first device specified (%s)",
		    __func__, vswp->physname);
	}

#ifdef DEBUG
	/*
	 * As a temporary measure to aid testing we check to see if there
	 * is a vsw.conf file present. If there is we use the value of the
	 * vsw_physname property in the file as the name of the physical
	 * device, overriding the value from the MD.
	 *
	 * There may be multiple devices listed, but for the moment
	 * we just use the first one.
	 */
	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
	    "vsw_physname", &dev) == DDI_PROP_SUCCESS) {
		if ((strlen(dev) + 1) > LIFNAMSIZ) {
			cmn_err(CE_WARN, "%s is too long a device name", dev);
		} else {
			cmn_err(CE_NOTE, "%s: using device name (%s) from "
			    "config file", __func__, dev);

			(void) strncpy(vswp->physname, dev, strlen(dev) + 1);
			vswp->mdprops |= VSW_MD_PHYSNAME;
		}

		ddi_prop_free(dev);
	}
#endif

	/* mac address for vswitch device itself */
	if (md_get_prop_val(mdp, listp[idx],
	    macaddr_propname, &macaddr) != 0) {
		cmn_err(CE_WARN, "!Unable to get MAC address from MD");

		/*
		 * Fallback to using the mac address of the physical
		 * device.
		 */
		if (vsw_get_physaddr(vswp) == 0) {
			cmn_err(CE_NOTE, "!Using MAC address from physical "
			    "device (%s)", vswp->physname);
		}
	} else {
		WRITE_ENTER(&vswp->if_lockrw);
		for (i = ETHERADDRL - 1; i >= 0; i--) {
			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
			macaddr >>= 8;
		}
		RW_EXIT(&vswp->if_lockrw);
		vswp->mdprops |= VSW_MD_MACADDR;
	}

	/*
	 * Get the switch-mode property. The modes are listed in
	 * decreasing order of preference, i.e. the preferred mode is
	 * the first item in the list.
	 */
	len = 0;
	vswp->smode_num = 0;
	if (md_get_prop_data(mdp, listp[idx], smode_propname,
	    (uint8_t **)(&smode), &len) != 0) {
		/*
		 * Unable to get switch-mode property from MD, nothing
		 * more we can do.
		 */
		cmn_err(CE_WARN, "!unable to get switch mode property");
		goto md_prop_exit;
	}

	curr_mode = smode;
	/*
	 * Modes of operation:
	 * 'switched'	 - layer 2 switching, underlying HW in
	 *			programmed mode.
	 * 'promiscuous' - layer 2 switching, underlying HW in
	 *			promiscuous mode.
	 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
	 *			in non-promiscuous mode.
	 */
	while ((curr_mode < (smode + len)) && (vswp->smode_num < NUM_SMODES)) {
		D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
		if (strcmp(curr_mode, "switched") == 0) {
			vswp->smode[vswp->smode_num++] = VSW_LAYER2;
		} else if (strcmp(curr_mode, "promiscuous") == 0) {
			vswp->smode[vswp->smode_num++] = VSW_LAYER2_PROMISC;
		} else if (strcmp(curr_mode, "routed") == 0) {
			vswp->smode[vswp->smode_num++] = VSW_LAYER3;
		} else {
			cmn_err(CE_WARN, "Unknown switch mode %s, setting to"
			    " default switched mode", curr_mode);
			vswp->smode[vswp->smode_num++] = VSW_LAYER2;
		}
		curr_mode += strlen(curr_mode) + 1;
	}

	D2(vswp, "%d switching modes specified", vswp->smode_num);

	if (vswp->smode_num > 0)
		vswp->mdprops |= VSW_MD_SMODE;

md_prop_exit:
	(void) md_fini_handle(mdp);

	kmem_free(listp, listsz);

	D1(vswp, "%s: exit", __func__);
}

/*
 * Get the mac address of the physical device.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_get_physaddr(vsw_t *vswp)
{
	mac_handle_t	mh;
	char		drv[LIFNAMSIZ];
	uint_t		ddi_instance;

	D1(vswp, "%s: enter", __func__);

	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS)
		return (1);

	if (mac_open(vswp->physname, ddi_instance, &mh) != 0) {
		cmn_err(CE_WARN, "!mac_open %s failed", vswp->physname);
		return (1);
	}

	WRITE_ENTER(&vswp->if_lockrw);
	mac_unicst_get(mh, vswp->if_addr.ether_addr_octet);
	RW_EXIT(&vswp->if_lockrw);

	mac_close(mh);

	vswp->mdprops |= VSW_DEV_MACADDR;

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Check to see if the card supports the setting of multiple unicst
 * addresses.
 *
 * Returns 0 if the card supports the programming of multiple unicast
 * addresses and there are free address slots available, otherwise
 * returns 1.
 */
static int
vsw_get_hw_maddr(vsw_t *vswp)
{
	D1(vswp, "%s: enter", __func__);

	if (vswp->mh == NULL) {
		return (1);
	}

	if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
		DWARN(vswp, "Unable to get capabilities of"
		    " underlying device (%s)", vswp->physname);
		return (1);
	}

	if (vswp->maddr.maddr_naddrfree == 0) {
		cmn_err(CE_WARN,
		    "!device %s has no free unicast address slots",
		    vswp->physname);
		return (1);
	}

	D2(vswp, "%s: %d addrs : %d free", __func__,
	    vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Setup for layer 2 switching.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_setup_layer2(vsw_t *vswp)
{
	D1(vswp, "%s: enter", __func__);

	vsw_switch_frame = vsw_switch_l2_frame;

	/*
	 * Attempt to link into the MAC layer so we can get
	 * and send packets out over the physical adapter.
	 */
	if (vswp->mdprops & VSW_MD_PHYSNAME) {
		if (vsw_mac_attach(vswp) != 0) {
			/*
			 * Registration with the MAC layer has failed,
			 * so return 1 so that we can fall back to the
			 * next preferred switching method.
			 */
			cmn_err(CE_WARN, "!Unable to join as MAC layer "
			    "client");
			return (1);
		}

		if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
			/*
			 * Verify that underlying device can support multiple
			 * unicast mac addresses, and has free capacity.
			 */
			if (vsw_get_hw_maddr(vswp) != 0) {
				cmn_err(CE_WARN, "!unable to setup switching");
				vsw_mac_detach(vswp);
				return (1);
			}
		}

	} else {
		/*
		 * No physical device name found in MD which is
		 * required for layer 2.
		 */
		cmn_err(CE_WARN, "!no physical device name specified");
		return (1);
	}

	D1(vswp, "%s: exit", __func__);

	return (0);
}

static int
vsw_setup_layer3(vsw_t *vswp)
{
	D1(vswp, "%s: enter", __func__);

	D2(vswp, "%s: operating in layer 3 mode", __func__);
	vsw_switch_frame = vsw_switch_l3_frame;

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Link into the MAC layer to gain access to the services provided by
 * the underlying physical device driver (which should also have
 * registered with the MAC layer).
 *
 * Only when in layer 2 mode.
 */
static int
vsw_mac_attach(vsw_t *vswp)
{
	char	drv[LIFNAMSIZ];
	uint_t	ddi_instance;

	D1(vswp, "%s: enter", __func__);

	vswp->mh = NULL;
	vswp->mrh = NULL;
	vswp->mstarted = B_FALSE;
	vswp->mresources = B_FALSE;

	ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);

	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "invalid device name: %s", vswp->physname);
		goto mac_fail_exit;
	}
	if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) {
		cmn_err(CE_WARN, "mac_open %s failed", vswp->physname);
		goto mac_fail_exit;
	}

	ASSERT(vswp->mh != NULL);

	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);

	if (vsw_multi_ring_enable) {
		vsw_mac_ring_tbl_init(vswp);

		/*
		 * Register our receive callback.
		 */
		vswp->mrh = mac_rx_add(vswp->mh,
		    vsw_rx_queue_cb, (void *)vswp);

		/*
		 * Register our mac resource callback.
		 */
		mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
		vswp->mresources = B_TRUE;

		/*
		 * Get the ring resources available to us from
		 * the mac below us.
		 */
		mac_resources(vswp->mh);
	} else {
		/*
		 * Just register our rx callback function
		 */
		vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
	}

	ASSERT(vswp->mrh != NULL);

	/* Get the MAC tx fn */
	vswp->txinfo = mac_tx_get(vswp->mh);

	/* start the interface */
	if (mac_start(vswp->mh) != 0) {
		cmn_err(CE_WARN, "could not start mac interface");
		goto mac_fail_exit;
	}

	vswp->mstarted = B_TRUE;

	D1(vswp, "%s: exit", __func__);
	return (0);

mac_fail_exit:
	vsw_mac_detach(vswp);

	D1(vswp, "%s: exit", __func__);
	return (1);
}

static void
vsw_mac_detach(vsw_t *vswp)
{
	D1(vswp, "vsw_mac_detach: enter");

	ASSERT(vswp != NULL);

	if (vsw_multi_ring_enable) {
		vsw_mac_ring_tbl_destroy(vswp);
	}

	if (vswp->mh != NULL) {
		if (vswp->mstarted)
			mac_stop(vswp->mh);
		if (vswp->mrh != NULL)
			mac_rx_remove(vswp->mh, vswp->mrh);
		if (vswp->mresources)
			mac_resource_set(vswp->mh, NULL, NULL);
		mac_close(vswp->mh);
	}

	vswp->mrh = NULL;
	vswp->mh = NULL;
	vswp->txinfo = NULL;
	vswp->mstarted = B_FALSE;

	D1(vswp, "vsw_mac_detach: exit");
}

/*
 * Depending on the mode specified and on the capabilities and capacity
 * of the underlying device, set up the physical device.
 *
 * If in layer 3 mode, then do nothing.
 *
 * If in layer 2 programmed mode attempt to program the unicast address
 * associated with the port into the physical device. If this is not
 * possible due to resource exhaustion or simply because the device does
 * not support multiple unicast addresses then, if required, fall back
 * onto putting the card into promisc mode.
 *
 * If in promisc mode then simply set the card into promisc mode.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_set_hw(vsw_t *vswp, vsw_port_t *port)
{
	mac_multi_addr_t	mac_addr;
	void			*mah;
	int			err;

	D1(vswp, "%s: enter", __func__);

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
		return (0);

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
		return (vsw_set_hw_promisc(vswp, port));
	}

	if (vswp->maddr.maddr_handle == NULL)
		return (1);

	mah = vswp->maddr.maddr_handle;

	/*
	 * Attempt to program the unicast address into the HW.
	 */
	mac_addr.mma_addrlen = ETHERADDRL;
	ether_copy(&port->p_macaddr, &mac_addr.mma_addr);

	err = vswp->maddr.maddr_add(mah, &mac_addr);
	if (err != 0) {
		cmn_err(CE_WARN, "!failed to program addr "
		    "%x:%x:%x:%x:%x:%x for port %d into device %s "
		    ": err %d", port->p_macaddr.ether_addr_octet[0],
		    port->p_macaddr.ether_addr_octet[1],
		    port->p_macaddr.ether_addr_octet[2],
		    port->p_macaddr.ether_addr_octet[3],
		    port->p_macaddr.ether_addr_octet[4],
		    port->p_macaddr.ether_addr_octet[5],
		    port->p_instance, vswp->physname, err);

		/*
		 * Mark that an attempt should be made to re-config
		 * sometime in the future if a port is deleted.
		 */
		vswp->recfg_reqd = B_TRUE;

		/*
		 * Only 1 mode specified, nothing more to do.
		 */
		if (vswp->smode_num == 1)
			return (err);

		/*
		 * If promiscuous mode was the next mode specified,
		 * try to set the card into that mode.
		 */
		if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
		    (vswp->smode[vswp->smode_idx + 1]
		    == VSW_LAYER2_PROMISC)) {
			vswp->smode_idx += 1;
			return (vsw_set_hw_promisc(vswp, port));
		}
		return (err);
	}

	port->addr_slot = mac_addr.mma_slot;
	port->addr_set = VSW_ADDR_HW;

	D2(vswp, "programmed addr %x:%x:%x:%x:%x:%x for port %d "
	    "into slot %d of device %s",
	    port->p_macaddr.ether_addr_octet[0],
	    port->p_macaddr.ether_addr_octet[1],
	    port->p_macaddr.ether_addr_octet[2],
	    port->p_macaddr.ether_addr_octet[3],
	    port->p_macaddr.ether_addr_octet[4],
	    port->p_macaddr.ether_addr_octet[5],
	    port->p_instance, port->addr_slot, vswp->physname);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * If in layer 3 mode do nothing.
 *
 * If in layer 2 switched mode remove the address from the physical
 * device.
 *
 * If in layer 2 promiscuous mode disable promisc mode.
 *
 * Returns 0 on success, non-zero on failure.
 */
static int
vsw_unset_hw(vsw_t *vswp, vsw_port_t *port)
{
	int		err;
	void		*mah;

	D1(vswp, "%s: enter", __func__);

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
		return (0);

	if (port->addr_set == VSW_ADDR_PROMISC) {
		return (vsw_unset_hw_promisc(vswp, port));
	}

	if (port->addr_set == VSW_ADDR_HW) {
		if (vswp->mh == NULL)
			return (1);

		if (vswp->maddr.maddr_handle == NULL)
			return (1);

		mah = vswp->maddr.maddr_handle;

		err = vswp->maddr.maddr_remove(mah, port->addr_slot);
		if (err != 0) {
			cmn_err(CE_WARN, "!Unable to remove addr "
			    "%x:%x:%x:%x:%x:%x for port %d from device %s"
			    " : (err %d)",
			    port->p_macaddr.ether_addr_octet[0],
			    port->p_macaddr.ether_addr_octet[1],
			    port->p_macaddr.ether_addr_octet[2],
			    port->p_macaddr.ether_addr_octet[3],
			    port->p_macaddr.ether_addr_octet[4],
			    port->p_macaddr.ether_addr_octet[5],
			    port->p_instance, vswp->physname, err);
			return (err);
		}

		port->addr_set = VSW_ADDR_UNSET;

		D2(vswp, "removed addr %x:%x:%x:%x:%x:%x for "
		    "port %d from device %s",
		    port->p_macaddr.ether_addr_octet[0],
		    port->p_macaddr.ether_addr_octet[1],
		    port->p_macaddr.ether_addr_octet[2],
		    port->p_macaddr.ether_addr_octet[3],
		    port->p_macaddr.ether_addr_octet[4],
		    port->p_macaddr.ether_addr_octet[5],
		    port->p_instance, vswp->physname);
	}

	D1(vswp, "%s: exit", __func__);
	return (0);
}

/*
 * Set network card into promisc mode.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port)
{
	D1(vswp, "%s: enter", __func__);

	if (vswp->mh == NULL)
		return (1);
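
	/*
	 * promisc_cnt counts how many ports currently rely on the
	 * device being promiscuous; only the first reference actually
	 * switches the device into promiscuous mode.
	 */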
	if (vswp->promisc_cnt++ == 0) {
		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
			vswp->promisc_cnt--;
			return (1);
		}
		cmn_err(CE_NOTE, "!switching device %s into promiscuous mode",
		    vswp->physname);
	}
	port->addr_set = VSW_ADDR_PROMISC;

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Turn off promiscuous mode on network card.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port)
{
	vsw_port_list_t	*plist = &vswp->plist;

	D1(vswp, "%s: enter", __func__);

	if (vswp->mh == NULL)
		return (1);

	ASSERT(port->addr_set == VSW_ADDR_PROMISC);

	if (--vswp->promisc_cnt == 0) {
		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
			vswp->promisc_cnt++;
			return (1);
		}

		/*
		 * We are exiting promisc mode either because we had
		 * failed over to it from switched mode due to HW
		 * resource issues, or because the user wanted the card
		 * in promisc mode for all the ports and the last port
		 * is now being deleted. Tweak the message accordingly.
		 */
		if (plist->num_ports != 0) {
			cmn_err(CE_NOTE, "!switching device %s back to "
			    "programmed mode", vswp->physname);
		} else {
			cmn_err(CE_NOTE, "!switching device %s out of "
			    "promiscuous mode", vswp->physname);
		}
	}
	port->addr_set = VSW_ADDR_UNSET;

	D1(vswp, "%s: exit", __func__);
	return (0);
}

/*
 * Determine whether or not we are operating in our preferred mode and,
 * if not, whether the physical resources now allow us to operate in it.
 *
 * Should only be invoked after the port which is being deleted has been
 * removed from the port list.
 */
static int
vsw_reconfig_hw(vsw_t *vswp)
{
	vsw_port_list_t		*plist = &vswp->plist;
	mac_multi_addr_t	mac_addr;
	vsw_port_t		*tp;
	void			*mah;
	int			rv = 0;
	int			s_idx;

	D1(vswp, "%s: enter", __func__);

	if (vswp->maddr.maddr_handle == NULL)
		return (1);

	/*
	 * Check if there are now sufficient HW resources to
	 * attempt a re-config.
	 */
	if (plist->num_ports > vswp->maddr.maddr_naddrfree)
		return (1);

	/*
	 * If we are in layer 2 (i.e. switched) or would like to be
	 * in layer 2 then check if any ports need to be programmed
	 * into the HW.
	 *
	 * This can happen in two cases - switched was specified as
	 * the preferred mode of operation but we exhausted the HW
	 * resources and so failed over to the next specified mode,
	 * or switched was the only mode specified so after HW
	 * resources were exhausted there was nothing more we
	 * could do.
	 */
	if (vswp->smode_idx > 0)
		s_idx = vswp->smode_idx - 1;
	else
		s_idx = vswp->smode_idx;

	if (vswp->smode[s_idx] == VSW_LAYER2) {
		mah = vswp->maddr.maddr_handle;

		D2(vswp, "%s: attempting reconfig..", __func__);

		/*
		 * Scan the port list for any port whose address has not
		 * been programmed into HW - there should be a max of one.
		 */
		for (tp = plist->head; tp != NULL; tp = tp->p_next) {
			if (tp->addr_set != VSW_ADDR_HW) {
				mac_addr.mma_addrlen = ETHERADDRL;
				ether_copy(&tp->p_macaddr, &mac_addr.mma_addr);

				rv = vswp->maddr.maddr_add(mah, &mac_addr);
				if (rv != 0) {
					DWARN(vswp, "Error setting addr in "
					    "HW for port %d err %d",
					    tp->p_instance, rv);
					goto reconfig_err_exit;
				}
				tp->addr_slot = mac_addr.mma_slot;

				D2(vswp, "re-programmed port %d "
				    "addr %x:%x:%x:%x:%x:%x into slot %d"
				    " of device %s", tp->p_instance,
				    tp->p_macaddr.ether_addr_octet[0],
				    tp->p_macaddr.ether_addr_octet[1],
				    tp->p_macaddr.ether_addr_octet[2],
				    tp->p_macaddr.ether_addr_octet[3],
				    tp->p_macaddr.ether_addr_octet[4],
				    tp->p_macaddr.ether_addr_octet[5],
				    tp->addr_slot, vswp->physname);

				/*
				 * If up to now we had to put the card into
				 * promisc mode to see this address, we
				 * can now safely disable promisc mode.
				 */
				if (tp->addr_set == VSW_ADDR_PROMISC)
					(void) vsw_unset_hw_promisc(vswp, tp);

				tp->addr_set = VSW_ADDR_HW;
			}
		}

		/* no further re-config needed */
		vswp->recfg_reqd = B_FALSE;

		vswp->smode_idx = s_idx;

		return (0);
	}

reconfig_err_exit:
	return (rv);
}

static void
vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
{
	ringp->ring_state = VSW_MAC_RING_FREE;
	ringp->ring_arg = NULL;
	ringp->ring_blank = NULL;
	ringp->ring_vqp = NULL;
	ringp->ring_vswp = vswp;
}

static void
vsw_mac_ring_tbl_init(vsw_t *vswp)
{
	int	i;

	mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);

	vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
	vswp->mac_ring_tbl =
	    kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t),
	    KM_SLEEP);

	for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
		vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]);
}
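
/*
 * Tear down the MAC ring table: stop and destroy the queue of any
 * entry still in use, reset each entry and then free the table itself.
 */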
static void
vsw_mac_ring_tbl_destroy(vsw_t *vswp)
{
	int	i;

	mutex_enter(&vswp->mac_ring_lock);
	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
		if (vswp->mac_ring_tbl[i].ring_state != VSW_MAC_RING_FREE) {
			/*
			 * Destroy the queue.
			 */
			vsw_queue_stop(vswp->mac_ring_tbl[i].ring_vqp);
			vsw_queue_destroy(vswp->mac_ring_tbl[i].ring_vqp);

			/*
			 * Re-initialize the structure.
			 */
			vsw_mac_ring_tbl_entry_init(vswp,
			    &vswp->mac_ring_tbl[i]);
		}
	}
	mutex_exit(&vswp->mac_ring_lock);

	mutex_destroy(&vswp->mac_ring_lock);
	kmem_free(vswp->mac_ring_tbl,
	    vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));
	vswp->mac_ring_tbl_sz = 0;
}

/*
 * Handle resource add callbacks from the driver below.
 */
static mac_resource_handle_t
vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
{
	vsw_t		*vswp = (vsw_t *)arg;
	mac_rx_fifo_t	*mrfp = (mac_rx_fifo_t *)mrp;
	vsw_mac_ring_t	*ringp;
	vsw_queue_t	*vqp;
	int		i;

	ASSERT(vswp != NULL);
	ASSERT(mrp != NULL);
	ASSERT(vswp->mac_ring_tbl != NULL);

	D1(vswp, "%s: enter", __func__);

	/*
	 * Check to make sure we have the correct resource type.
	 */
	if (mrp->mr_type != MAC_RX_FIFO)
		return (NULL);

	/*
	 * Find an open entry in the ring table.
	 */
	mutex_enter(&vswp->mac_ring_lock);
	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
		ringp = &vswp->mac_ring_tbl[i];

		/*
		 * Check for an empty slot; if found, set up the queue
		 * and worker thread.
		 */
		if (ringp->ring_state == VSW_MAC_RING_FREE) {
			/*
			 * Create the queue for this ring.
			 */
			vqp = vsw_queue_create();

			/*
			 * Initialize the ring data structure.
			 */
			ringp->ring_vqp = vqp;
			ringp->ring_arg = mrfp->mrf_arg;
			ringp->ring_blank = mrfp->mrf_blank;
			ringp->ring_state = VSW_MAC_RING_INUSE;

			/*
			 * Create the worker thread.
			 */
			vqp->vq_worker = thread_create(NULL, 0,
			    vsw_queue_worker, ringp, 0, &p0,
			    TS_RUN, minclsyspri);
			if (vqp->vq_worker == NULL) {
				vsw_queue_destroy(vqp);
				vsw_mac_ring_tbl_entry_init(vswp, ringp);
				ringp = NULL;
			}

			mutex_exit(&vswp->mac_ring_lock);
			D1(vswp, "%s: exit", __func__);
			return ((mac_resource_handle_t)ringp);
		}
	}
	mutex_exit(&vswp->mac_ring_lock);

	/*
	 * No slots in the ring table available.
	 */
	D1(vswp, "%s: exit", __func__);
	return (NULL);
}

static void
vsw_queue_stop(vsw_queue_t *vqp)
{
	mutex_enter(&vqp->vq_lock);

	if (vqp->vq_state == VSW_QUEUE_RUNNING) {
		vqp->vq_state = VSW_QUEUE_STOP;
		cv_signal(&vqp->vq_cv);

		while (vqp->vq_state != VSW_QUEUE_DRAINED)
			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
	}

	mutex_exit(&vqp->vq_lock);
}

static vsw_queue_t *
vsw_queue_create(void)
{
	vsw_queue_t *vqp;

	vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP);

	mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL);
	vqp->vq_first = NULL;
	vqp->vq_last = NULL;
	vqp->vq_state = VSW_QUEUE_STOP;

	return (vqp);
}

static void
vsw_queue_destroy(vsw_queue_t *vqp)
{
	cv_destroy(&vqp->vq_cv);
	mutex_destroy(&vqp->vq_lock);
	kmem_free(vqp, sizeof (vsw_queue_t));
}
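
/*
 * Worker thread for a receive queue. The thread runs while the queue
 * is in the VSW_QUEUE_RUNNING state, switching any chain of mblks
 * queued by vsw_rx_queue_cb(). When vsw_queue_stop() requests a stop,
 * any remaining queued chain is processed before the thread marks the
 * queue VSW_QUEUE_DRAINED, signals the stopper and exits.
 */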
static void
vsw_queue_worker(vsw_mac_ring_t *rrp)
{
	mblk_t		*mp;
	vsw_queue_t	*vqp = rrp->ring_vqp;
	vsw_t		*vswp = rrp->ring_vswp;

	mutex_enter(&vqp->vq_lock);

	ASSERT(vqp->vq_state == VSW_QUEUE_STOP);

	/*
	 * Set the state to running, since the thread is now active.
	 */
	vqp->vq_state = VSW_QUEUE_RUNNING;

	while (vqp->vq_state == VSW_QUEUE_RUNNING) {
		/*
		 * Wait for work to do, or for the state to change
		 * away from running.
		 */
		while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
		    (vqp->vq_first == NULL)) {
			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
		}

		/*
		 * Process packets that we received from the interface.
		 */
		if (vqp->vq_first != NULL) {
			mp = vqp->vq_first;

			vqp->vq_first = NULL;
			vqp->vq_last = NULL;

			mutex_exit(&vqp->vq_lock);

			/* switch the chain of packets received */
			vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);

			mutex_enter(&vqp->vq_lock);
		}
	}

	/*
	 * We are drained; signal that we are done.
	 */
	vqp->vq_state = VSW_QUEUE_DRAINED;
	cv_signal(&vqp->vq_cv);

	/*
	 * Release the queue lock.
	 */
	mutex_exit(&vqp->vq_lock);

	/*
	 * Exit the thread
	 */
	thread_exit();
}

/*
 * static void
 * vsw_rx_queue_cb() - Receive callback routine when
 * vsw_multi_ring_enable is non-zero. Queue the packets
 * to a packet queue for a worker thread to process.
 */
static void
vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
{
	vsw_mac_ring_t	*ringp = (vsw_mac_ring_t *)mrh;
	vsw_t		*vswp = (vsw_t *)arg;
	vsw_queue_t	*vqp;
	mblk_t		*bp, *last;

	ASSERT(mrh != NULL);
	ASSERT(vswp != NULL);
	ASSERT(mp != NULL);

	D1(vswp, "%s: enter", __func__);

	/*
	 * Find the last element in the mblk chain.
	 */
	bp = mp;
	do {
		last = bp;
		bp = bp->b_next;
	} while (bp != NULL);

	/* Get the queue for the packets */
	vqp = ringp->ring_vqp;

	/*
	 * Grab the lock so that we can queue the packets.
	 */
	mutex_enter(&vqp->vq_lock);

	if (vqp->vq_state != VSW_QUEUE_RUNNING) {
		freemsg(mp);
		goto vsw_rx_queue_cb_exit;
	}

	/*
	 * Add the mblk chain to the queue. If there are already
	 * mblks on the queue, add the new chain to the end.
	 */
	if (vqp->vq_first == NULL)
		vqp->vq_first = mp;
	else
		vqp->vq_last->b_next = mp;

	vqp->vq_last = last;

	/*
	 * Signal the worker thread that there is work to
	 * do.
	 */
	cv_signal(&vqp->vq_cv);

	/*
	 * Let go of the lock and exit.
	 */
vsw_rx_queue_cb_exit:
	mutex_exit(&vqp->vq_lock);
	D1(vswp, "%s: exit", __func__);
}

/*
 * Receive callback routine. Invoked by the MAC layer when there
 * are pkts being passed up from the physical device.
 *
 * PERF: It may be more efficient when the card is in promisc
 * mode to check the dest address of the pkts here (against
 * the FDB) rather than checking later. Needs to be investigated.
 */
static void
vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
{
	_NOTE(ARGUNUSED(mrh))

	vsw_t	*vswp = (vsw_t *)arg;

	ASSERT(vswp != NULL);

	D1(vswp, "vsw_rx_cb: enter");

	/* switch the chain of packets received */
	vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);

	D1(vswp, "vsw_rx_cb: exit");
}

/*
 * Send a message out over the physical device via the MAC layer.
 *
 * Returns any mblks that it was unable to transmit.
 */
static mblk_t *
vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
{
	const mac_txinfo_t	*mtp;
	mblk_t			*nextp;

	if (vswp->mh == NULL) {
		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
		return (mp);
	} else {
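		/*
		 * Send each mblk in the chain individually. If the MAC
		 * layer's tx routine hands an mblk back it could not be
		 * sent, so reattach the remainder of the chain and
		 * return it to the caller.
		 */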
		for (;;) {
			nextp = mp->b_next;
			mp->b_next = NULL;

			mtp = vswp->txinfo;
			if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
				mp->b_next = nextp;
				break;
			}

			if ((mp = nextp) == NULL)
				break;
		}
	}

	return (mp);
}

/*
 * Register with the MAC layer as a network device, so we
 * can be plumbed if necessary.
 */
static int
vsw_mac_register(vsw_t *vswp)
{
	mac_register_t	*macp;
	int		rv;

	D1(vswp, "%s: enter", __func__);

	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
		return (EINVAL);
	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
	macp->m_driver = vswp;
	macp->m_dip = vswp->dip;
	macp->m_src_addr = (uint8_t *)&vswp->if_addr;
	macp->m_callbacks = &vsw_m_callbacks;
	macp->m_min_sdu = 0;
	macp->m_max_sdu = ETHERMTU;
	rv = mac_register(macp, &vswp->if_mh);
	mac_free(macp);
	if (rv == 0)
		vswp->if_state |= VSW_IF_REG;

	D1(vswp, "%s: exit", __func__);

	return (rv);
}

static int
vsw_mac_unregister(vsw_t *vswp)
{
	int	rv = 0;

	D1(vswp, "%s: enter", __func__);

	WRITE_ENTER(&vswp->if_lockrw);

	if (vswp->if_state & VSW_IF_REG) {
		rv = mac_unregister(vswp->if_mh);
		if (rv != 0) {
			DWARN(vswp, "%s: unable to unregister from MAC "
			    "framework", __func__);

			RW_EXIT(&vswp->if_lockrw);
			D1(vswp, "%s: fail exit", __func__);
			return (rv);
		}

		/* mark i/f as down and unregistered */
		vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG);
	}
	RW_EXIT(&vswp->if_lockrw);

	vswp->mdprops &= ~(VSW_MD_MACADDR | VSW_DEV_MACADDR);

	D1(vswp, "%s: exit", __func__);

	return (rv);
}

static int
vsw_m_stat(void *arg, uint_t stat, uint64_t *val)
{
	vsw_t	*vswp = (vsw_t *)arg;

	D1(vswp, "%s: enter", __func__);

	if (vswp->mh == NULL)
		return (EINVAL);

	/* return stats from underlying device */
	*val = mac_stat_get(vswp->mh, stat);
	return (0);
}

static void
vsw_m_stop(void *arg)
{
	vsw_t	*vswp = (vsw_t *)arg;

	D1(vswp, "%s: enter", __func__);

	WRITE_ENTER(&vswp->if_lockrw);
	vswp->if_state &= ~VSW_IF_UP;
	RW_EXIT(&vswp->if_lockrw);

	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
}

static int
vsw_m_start(void *arg)
{
	vsw_t	*vswp = (vsw_t *)arg;

	D1(vswp, "%s: enter", __func__);

	WRITE_ENTER(&vswp->if_lockrw);
	vswp->if_state |= VSW_IF_UP;
	RW_EXIT(&vswp->if_lockrw);

	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
	return (0);
}

/*
 * Change the local interface address.
 */
static int
vsw_m_unicst(void *arg, const uint8_t *macaddr)
{
	vsw_t	*vswp = (vsw_t *)arg;

	D1(vswp, "%s: enter", __func__);

	WRITE_ENTER(&vswp->if_lockrw);
	ether_copy(macaddr, &vswp->if_addr);
	RW_EXIT(&vswp->if_lockrw);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

static int
vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
{
	vsw_t		*vswp = (vsw_t *)arg;
	mcst_addr_t	*mcst_p = NULL;
	uint64_t	addr = 0x0;
	int		i, ret = 0;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Convert address into form that can be used
	 * as hash table key.
	 */
	for (i = 0; i < ETHERADDRL; i++) {
		addr = (addr << 8) | mca[i];
	}

	D2(vswp, "%s: addr = 0x%llx", __func__, addr);

	if (add) {
		D2(vswp, "%s: adding multicast", __func__);
		if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
			/*
			 * Update the list of multicast addresses
			 * contained within the vsw_t structure to
			 * include this new one.
			 */
			 */
			mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
			if (mcst_p == NULL) {
				DERR(vswp, "%s unable to alloc mem", __func__);
				return (1);
			}
			mcst_p->addr = addr;

			mutex_enter(&vswp->mca_lock);
			mcst_p->nextp = vswp->mcap;
			vswp->mcap = mcst_p;
			mutex_exit(&vswp->mca_lock);

			/*
			 * Call into the underlying driver to program the
			 * address into HW.
			 */
			if (vswp->mh != NULL) {
				ret = mac_multicst_add(vswp->mh, mca);
				if (ret != 0) {
					cmn_err(CE_WARN, "!unable to add "
					    "multicast address");
					goto vsw_remove_addr;
				}
			}
		} else {
			cmn_err(CE_WARN, "!unable to add multicast address");
		}
		return (ret);
	}

vsw_remove_addr:

	D2(vswp, "%s: removing multicast", __func__);
	/*
	 * Remove the address from the hash table..
	 */
	if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {

		/*
		 * ..and then from the list maintained in the
		 * vsw_t structure.
		 */
		vsw_del_addr(VSW_LOCALDEV, vswp, addr);

		if (vswp->mh != NULL)
			(void) mac_multicst_remove(vswp->mh, mca);
	}

	D1(vswp, "%s: exit", __func__);

	return (0);
}

static int
vsw_m_promisc(void *arg, boolean_t on)
{
	vsw_t		*vswp = (vsw_t *)arg;

	D1(vswp, "%s: enter", __func__);

	WRITE_ENTER(&vswp->if_lockrw);
	if (on)
		vswp->if_state |= VSW_IF_PROMISC;
	else
		vswp->if_state &= ~VSW_IF_PROMISC;
	RW_EXIT(&vswp->if_lockrw);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

static mblk_t *
vsw_m_tx(void *arg, mblk_t *mp)
{
	vsw_t		*vswp = (vsw_t *)arg;

	D1(vswp, "%s: enter", __func__);

	vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);

	D1(vswp, "%s: exit", __func__);

	return (NULL);
}
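/*
 * Illustrative sketch (not from the original source): vsw_m_callbacks,
 * referenced in vsw_mac_register() above, is defined elsewhere in the
 * driver. A registration vector wiring up the entry points above would
 * plausibly look like the following; the exact field layout here is an
 * assumption based on the mac_callbacks_t interface of this era, not a
 * copy of the driver's actual definition.
 */
#if 0
static mac_callbacks_t	vsw_m_callbacks_example = {
	0,			/* mc_callbacks: no optional callbacks */
	vsw_m_stat,		/* mc_getstat */
	vsw_m_start,		/* mc_start */
	vsw_m_stop,		/* mc_stop */
	vsw_m_promisc,		/* mc_setpromisc */
	vsw_m_multicst,		/* mc_multicst */
	vsw_m_unicst,		/* mc_unicst */
	vsw_m_tx		/* mc_tx */
};
#endif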
/*
 * Register for machine description (MD) updates.
 */
static void
vsw_mdeg_register(vsw_t *vswp)
{
	mdeg_prop_spec_t	*pspecp;
	mdeg_node_spec_t	*inst_specp;
	mdeg_handle_t		mdeg_hdl;
	size_t			templatesz;
	int			inst, rv;

	D1(vswp, "%s: enter", __func__);

	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
	    DDI_PROP_DONTPASS, reg_propname, -1);
	if (inst == -1) {
		DERR(vswp, "%s: unable to get %s property",
		    __func__, reg_propname);
		return;
	}

	D2(vswp, "%s: instance %d registering with mdeg", __func__, inst);

	/*
	 * Allocate and initialize a per-instance copy
	 * of the global property spec array that will
	 * uniquely identify this vsw instance.
	 */
	templatesz = sizeof (vsw_prop_template);
	pspecp = kmem_zalloc(templatesz, KM_SLEEP);

	bcopy(vsw_prop_template, pspecp, templatesz);

	VSW_SET_MDEG_PROP_INST(pspecp, inst);

	/* initialize the complete prop spec structure */
	inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
	inst_specp->namep = "virtual-device";
	inst_specp->specp = pspecp;

	/* perform the registration */
	rv = mdeg_register(inst_specp, &vport_match, vsw_mdeg_cb,
	    (void *)vswp, &mdeg_hdl);

	if (rv != MDEG_SUCCESS) {
		DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
		kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
		kmem_free(pspecp, templatesz);
		return;
	}

	/* save off data that will be needed later */
	vswp->inst_spec = inst_specp;
	vswp->mdeg_hdl = mdeg_hdl;

	D1(vswp, "%s: exit", __func__);
}

static void
vsw_mdeg_unregister(vsw_t *vswp)
{
	D1(vswp, "vsw_mdeg_unregister: enter");

	(void) mdeg_unregister(vswp->mdeg_hdl);

	if (vswp->inst_spec != NULL) {
		if (vswp->inst_spec->specp != NULL) {
			kmem_free(vswp->inst_spec->specp,
			    sizeof (vsw_prop_template));
			vswp->inst_spec->specp = NULL;
		}

		kmem_free(vswp->inst_spec, sizeof (mdeg_node_spec_t));
		vswp->inst_spec = NULL;
	}

	D1(vswp, "vsw_mdeg_unregister: exit");
}

static int
vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
{
	vsw_t		*vswp;
	int		idx;
	md_t		*mdp;
	mde_cookie_t	node;
	uint64_t	inst;

	if (resp == NULL)
		return (MDEG_FAILURE);

	vswp = (vsw_t *)cb_argp;

	D1(vswp, "%s: added %d : removed %d : matched %d",
	    __func__, resp->added.nelem, resp->removed.nelem,
	    resp->match_prev.nelem);

	/* process added ports */
	for (idx = 0; idx < resp->added.nelem; idx++) {
		mdp = resp->added.mdp;
		node = resp->added.mdep[idx];

		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);

		if (vsw_port_add(vswp, mdp, &node) != 0) {
			cmn_err(CE_WARN, "Unable to add new port (0x%lx)",
			    node);
		}
	}

	/* process removed ports */
	for (idx = 0; idx < resp->removed.nelem; idx++) {
		mdp = resp->removed.mdp;
		node = resp->removed.mdep[idx];

		if (md_get_prop_val(mdp, node, id_propname, &inst)) {
			DERR(vswp, "%s: prop(%s) not found port(%d)",
			    __func__, id_propname, idx);
			continue;
		}

		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);

		if (vsw_port_detach(vswp, inst) != 0) {
			cmn_err(CE_WARN, "Unable to remove port %ld", inst);
		}
	}

	/*
	 * Currently no support for updating already active ports.
	 * So, ignore the match_curr and match_prev arrays for now.
	 */

	D1(vswp, "%s: exit", __func__);

	return (MDEG_SUCCESS);
}
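/*
 * Illustrative sketch (an assumption, not the driver's actual table):
 * vsw_prop_template and VSW_SET_MDEG_PROP_INST are defined elsewhere in
 * the driver. A property-spec template of this general shape, with the
 * per-instance value patched in before mdeg_register(), is the usual
 * mdeg idiom; the node name and index used below are hypothetical.
 */
#if 0
static mdeg_prop_spec_t	example_prop_template[] = {
	{ MDET_PROP_STR,	"name",		"virtual-network-switch" },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL,		NULL }
};

/* patch the per-instance value before calling mdeg_register() */
#define	EXAMPLE_SET_MDEG_PROP_INST(specp, val)	\
	((specp)[1].ps_val = (val))
#endif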
/*
 * Add a new port to the system.
 *
 * Returns 0 on success, 1 on failure.
 */
int
vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node)
{
	uint64_t		ldc_id;
	uint8_t			*addrp;
	int			i, addrsz;
	int			num_nodes = 0, nchan = 0;
	int			listsz = 0;
	mde_cookie_t		*listp = NULL;
	struct ether_addr	ea;
	uint64_t		macaddr;
	uint64_t		inst = 0;
	vsw_port_t		*port;

	if (md_get_prop_val(mdp, *node, id_propname, &inst)) {
		DWARN(vswp, "%s: prop(%s) not found", __func__,
		    id_propname);
		return (1);
	}

	/*
	 * Find the channel endpoint node(s) (which should be under this
	 * port node) which contain the channel id(s).
	 */
	if ((num_nodes = md_node_count(mdp)) <= 0) {
		DERR(vswp, "%s: invalid number of nodes found (%d)",
		    __func__, num_nodes);
		return (1);
	}

	/* allocate enough space for node list */
	listsz = num_nodes * sizeof (mde_cookie_t);
	listp = kmem_zalloc(listsz, KM_SLEEP);

	nchan = md_scan_dag(mdp, *node,
	    md_find_name(mdp, chan_propname),
	    md_find_name(mdp, "fwd"), listp);

	if (nchan <= 0) {
		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
		kmem_free(listp, listsz);
		return (1);
	}

	D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname);

	/* use property from first node found */
	if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) {
		DWARN(vswp, "%s: prop(%s) not found\n", __func__,
		    id_propname);
		kmem_free(listp, listsz);
		return (1);
	}

	/* don't need the list any more */
	kmem_free(listp, listsz);

	D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id);

	/* read mac-address property */
	if (md_get_prop_data(mdp, *node, remaddr_propname,
	    &addrp, &addrsz)) {
		DWARN(vswp, "%s: prop(%s) not found",
		    __func__, remaddr_propname);
		return (1);
	}

	if (addrsz < ETHERADDRL) {
		DWARN(vswp, "%s: invalid address size", __func__);
		return (1);
	}

	macaddr = *((uint64_t *)addrp);
	D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr);

	for (i = ETHERADDRL - 1; i >= 0; i--) {
		ea.ether_addr_octet[i] = macaddr & 0xFF;
		macaddr >>= 8;
	}

	if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) {
		DERR(vswp, "%s: failed to attach port", __func__);
		return (1);
	}

	port = vsw_lookup_port(vswp, (int)inst);

	/* we just successfully created the port, so it should exist */
	ASSERT(port != NULL);

	return (0);
}
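/*
 * Illustrative sketch (not part of the driver): the loop above unpacks
 * the low 48 bits of a uint64_t MAC value into the ether_addr octets,
 * most significant octet first. For example, 0x0003BA123456 becomes
 * 00:03:ba:12:34:56:
 */
#if 0
static void
example_unpack_mac(uint64_t macaddr, struct ether_addr *ea)
{
	int	i;

	for (i = ETHERADDRL - 1; i >= 0; i--) {
		ea->ether_addr_octet[i] = macaddr & 0xFF;
		macaddr >>= 8;
	}
}
#endif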
/*
 * Attach the specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
struct ether_addr *macaddr)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*port, **prev_port;
	int		i;

	D1(vswp, "%s: enter : port %d", __func__, p_instance);

	/* port already exists? */
	READ_ENTER(&plist->lockrw);
	for (port = plist->head; port != NULL; port = port->p_next) {
		if (port->p_instance == p_instance) {
			DWARN(vswp, "%s: port instance %d already attached",
			    __func__, p_instance);
			RW_EXIT(&plist->lockrw);
			return (1);
		}
	}
	RW_EXIT(&plist->lockrw);

	port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
	port->p_vswp = vswp;
	port->p_instance = p_instance;
	port->p_ldclist.num_ldcs = 0;
	port->p_ldclist.head = NULL;
	port->addr_set = VSW_ADDR_UNSET;

	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);

	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);

	mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL);

	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
	port->state = VSW_PORT_INIT;

	if (nids > VSW_PORT_MAX_LDCS) {
		D2(vswp, "%s: using first %d of %d ldc ids",
		    __func__, VSW_PORT_MAX_LDCS, nids);
		nids = VSW_PORT_MAX_LDCS;
	}

	D2(vswp, "%s: %d nids", __func__, nids);
	for (i = 0; i < nids; i++) {
		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
			DERR(vswp, "%s: ldc_attach failed", __func__);

			rw_destroy(&port->p_ldclist.lockrw);

			cv_destroy(&port->ref_cv);
			mutex_destroy(&port->ref_lock);

			cv_destroy(&port->state_cv);
			mutex_destroy(&port->state_lock);

			mutex_destroy(&port->tx_lock);
			mutex_destroy(&port->mca_lock);
			kmem_free(port, sizeof (vsw_port_t));
			return (1);
		}
	}

	ether_copy(macaddr, &port->p_macaddr);

	WRITE_ENTER(&plist->lockrw);

	/* create the fdb entry for this port/mac address */
	(void) vsw_add_fdb(vswp, port);

	(void) vsw_set_hw(vswp, port);

	/* link it into the list of ports for this vsw instance */
	prev_port = (vsw_port_t **)(&plist->head);
	port->p_next = *prev_port;
	*prev_port = port;
	plist->num_ports++;
	RW_EXIT(&plist->lockrw);

	/*
	 * Initialise the port and any ldc's under it.
	 */
	(void) vsw_init_ldcs(port);

	D1(vswp, "%s: exit", __func__);
	return (0);
}

/*
 * Detach the specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_port_detach(vsw_t *vswp, int p_instance)
{
	vsw_port_t	*port = NULL;
	vsw_port_list_t	*plist = &vswp->plist;

	D1(vswp, "%s: enter: port id %d", __func__, p_instance);

	WRITE_ENTER(&plist->lockrw);

	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
		RW_EXIT(&plist->lockrw);
		return (1);
	}

	if (vsw_plist_del_node(vswp, port)) {
		RW_EXIT(&plist->lockrw);
		return (1);
	}

	/* Remove the address if it was programmed into HW. */
	(void) vsw_unset_hw(vswp, port);

	/* Remove the fdb entry for this port/mac address */
	(void) vsw_del_fdb(vswp, port);

	/* Remove any multicast addresses.. */
	vsw_del_mcst_port(port);

	/*
	 * No longer need to hold the writer lock on the port list now
	 * that we have unlinked the target port from the list.
	 */
	RW_EXIT(&plist->lockrw);

	READ_ENTER(&plist->lockrw);

	if (vswp->recfg_reqd)
		(void) vsw_reconfig_hw(vswp);

	RW_EXIT(&plist->lockrw);

	if (vsw_port_delete(port)) {
		return (1);
	}

	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
	return (0);
}

/*
 * Detach all active ports.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_detach_ports(vsw_t *vswp)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*port = NULL;

	D1(vswp, "%s: enter", __func__);

	WRITE_ENTER(&plist->lockrw);

	while ((port = plist->head) != NULL) {
		if (vsw_plist_del_node(vswp, port)) {
			DERR(vswp, "%s: Error deleting port %d"
			    " from port list", __func__,
			    port->p_instance);
			RW_EXIT(&plist->lockrw);
			return (1);
		}

		/* Remove the address if it was programmed into HW. */
		(void) vsw_unset_hw(vswp, port);

		/* Remove the fdb entry for this port/mac address */
		(void) vsw_del_fdb(vswp, port);

		/* Remove any multicast addresses.. */
		vsw_del_mcst_port(port);

		/*
		 * No longer need to hold the lock on the port list
		 * now that we have unlinked the target port from the
		 * list.
		 */
		RW_EXIT(&plist->lockrw);
		if (vsw_port_delete(port)) {
			DERR(vswp, "%s: Error deleting port %d",
			    __func__, port->p_instance);
			return (1);
		}
		WRITE_ENTER(&plist->lockrw);
	}
	RW_EXIT(&plist->lockrw);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Delete the specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_port_delete(vsw_port_t *port)
{
	vsw_ldc_list_t	*ldcl;
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);

	(void) vsw_uninit_ldcs(port);

	/*
	 * Wait for any pending ctrl msg tasks which reference this
	 * port to finish.
	 */
	if (vsw_drain_port_taskq(port))
		return (1);

	/*
	 * Wait for the port reference count to hit zero.
	 */
	mutex_enter(&port->ref_lock);
	while (port->ref_cnt != 0)
		cv_wait(&port->ref_cv, &port->ref_lock);
	mutex_exit(&port->ref_lock);

	/*
	 * Wait for any active callbacks to finish.
	 */
	if (vsw_drain_ldcs(port))
		return (1);

	ldcl = &port->p_ldclist;
	WRITE_ENTER(&ldcl->lockrw);
	while (ldcl->num_ldcs > 0) {
		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
			cmn_err(CE_WARN, "unable to detach ldc %ld",
			    ldcl->head->ldc_id);
			RW_EXIT(&ldcl->lockrw);
			return (1);
		}
	}
	RW_EXIT(&ldcl->lockrw);

	rw_destroy(&port->p_ldclist.lockrw);

	mutex_destroy(&port->mca_lock);
	mutex_destroy(&port->tx_lock);
	cv_destroy(&port->ref_cv);
	mutex_destroy(&port->ref_lock);

	cv_destroy(&port->state_cv);
	mutex_destroy(&port->state_lock);

	kmem_free(port, sizeof (vsw_port_t));

	D1(vswp, "%s: exit", __func__);

	return (0);
}
/*
 * Attach a logical domain channel (ldc) under a specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
{
	vsw_t		*vswp = port->p_vswp;
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	vsw_ldc_t	*ldcp = NULL;
	ldc_attr_t	attr;
	ldc_status_t	istatus;
	int		status = DDI_FAILURE;
	int		rv;
	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
			    PROG_callback = 0x2 }
			progress;

	progress = PROG_init;

	D1(vswp, "%s: enter", __func__);

	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
	if (ldcp == NULL) {
		DERR(vswp, "%s: kmem_zalloc failed", __func__);
		return (1);
	}
	ldcp->ldc_id = ldc_id;

	/* allocate pool of receive mblks */
	rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh));
	if (rv) {
		DWARN(vswp, "%s: unable to create free mblk pool for"
		    " channel %ld (rv %d)", __func__, ldc_id, rv);
		kmem_free(ldcp, sizeof (vsw_ldc_t));
		return (1);
	}

	progress |= PROG_mblks;

	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);

	/* required for handshake with peer */
	ldcp->local_session = (uint64_t)ddi_get_lbolt();
	ldcp->peer_session = 0;
	ldcp->session_status = 0;

	mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL);
	ldcp->hss_id = 1;	/* Initial handshake session id */

	/* only set for outbound lane, inbound set by peer */
	mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL);
	vsw_set_lane_attr(vswp, &ldcp->lane_out);

	attr.devclass = LDC_DEV_NT_SVC;
	attr.instance = ddi_get_instance(vswp->dip);
	attr.mode = LDC_MODE_UNRELIABLE;
	attr.mtu = VSW_LDC_MTU;
	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
	if (status != 0) {
		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
		    __func__, ldc_id, status);
		goto ldc_attach_fail;
	}

	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
	if (status != 0) {
		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
		    __func__, ldc_id, status);
		(void) ldc_fini(ldcp->ldc_handle);
		goto ldc_attach_fail;
	}

	progress |= PROG_callback;

	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);

	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
		DERR(vswp, "%s: ldc_status failed", __func__);
		mutex_destroy(&ldcp->status_lock);
		goto ldc_attach_fail;
	}

	ldcp->ldc_status = istatus;
	ldcp->ldc_port = port;
	ldcp->ldc_vswp = vswp;

	/* link it into the list of channels for this port */
	WRITE_ENTER(&ldcl->lockrw);
	ldcp->ldc_next = ldcl->head;
	ldcl->head = ldcp;
	ldcl->num_ldcs++;
	RW_EXIT(&ldcl->lockrw);

	D1(vswp, "%s: exit", __func__);
	return (0);

ldc_attach_fail:
	mutex_destroy(&ldcp->ldc_txlock);
	mutex_destroy(&ldcp->ldc_cblock);

	cv_destroy(&ldcp->drain_cv);

	if (progress & PROG_callback) {
		(void) ldc_unreg_callback(ldcp->ldc_handle);
	}

	if ((progress & PROG_mblks) && (ldcp->rxh != NULL)) {
		if (vio_destroy_mblks(ldcp->rxh) != 0) {
			/*
			 * Something odd has happened, as the destroy
			 * will only fail if some mblks have been allocated
			 * from the pool already
			 * (which shouldn't happen)
			 * and have not been returned.
			 *
			 * Add the pool pointer to a list maintained in
			 * the device instance. Another attempt will be made
			 * to free the pool when the device itself detaches.
			 */
			cmn_err(CE_WARN, "Creation of ldc channel %ld failed"
			    " and cannot destroy associated mblk pool",
			    ldc_id);
			ldcp->rxh->nextp = vswp->rxh;
			vswp->rxh = ldcp->rxh;
		}
	}
	mutex_destroy(&ldcp->drain_cv_lock);
	mutex_destroy(&ldcp->hss_lock);

	mutex_destroy(&ldcp->lane_in.seq_lock);
	mutex_destroy(&ldcp->lane_out.seq_lock);
	kmem_free(ldcp, sizeof (vsw_ldc_t));

	return (1);
}

/*
 * Detach a logical domain channel (ldc) belonging to a
 * particular port.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
{
	vsw_t		*vswp = port->p_vswp;
	vsw_ldc_t	*ldcp, *prev_ldcp;
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	int		rv;

	prev_ldcp = ldcl->head;
	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
		if (ldcp->ldc_id == ldc_id) {
			break;
		}
	}

	/* specified ldc id not found */
	if (ldcp == NULL) {
		DERR(vswp, "%s: ldcp = NULL", __func__);
		return (1);
	}

	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);

	/*
	 * Before we can close the channel we must release any mapped
	 * resources (e.g. drings).
	 */
	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);

	/*
	 * If the close fails we are in serious trouble, as we won't
	 * be able to delete the parent port.
	 */
	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
		DERR(vswp, "%s: error %d closing channel %lld",
		    __func__, rv, ldcp->ldc_id);
		return (1);
	}

	(void) ldc_fini(ldcp->ldc_handle);

	ldcp->ldc_status = LDC_INIT;
	ldcp->ldc_handle = NULL;
	ldcp->ldc_vswp = NULL;

	if (ldcp->rxh != NULL) {
		if (vio_destroy_mblks(ldcp->rxh)) {
			/*
			 * Most likely some mblks are still in use and
			 * have not been returned to the pool. Add the pool
			 * to the list maintained in the device instance.
			 * Another attempt will be made to destroy the pool
			 * when the device detaches.
			 */
			ldcp->rxh->nextp = vswp->rxh;
			vswp->rxh = ldcp->rxh;
		}
	}

	/*
	 * Unlink it from the list. (The original assignment of
	 * ldcp->ldc_next to prev_ldcp left the list untouched, so the
	 * head could still reference the freed channel; relink properly.)
	 */
	if (ldcl->head == ldcp) {
		ldcl->head = ldcp->ldc_next;
	} else {
		for (prev_ldcp = ldcl->head; prev_ldcp->ldc_next != ldcp;
		    prev_ldcp = prev_ldcp->ldc_next)
			;
		prev_ldcp->ldc_next = ldcp->ldc_next;
	}
	ldcl->num_ldcs--;

	mutex_destroy(&ldcp->ldc_txlock);
	mutex_destroy(&ldcp->ldc_cblock);
	cv_destroy(&ldcp->drain_cv);
	mutex_destroy(&ldcp->drain_cv_lock);
	mutex_destroy(&ldcp->hss_lock);
	mutex_destroy(&ldcp->lane_in.seq_lock);
	mutex_destroy(&ldcp->lane_out.seq_lock);
	mutex_destroy(&ldcp->status_lock);

	kmem_free(ldcp, sizeof (vsw_ldc_t));

	return (0);
}
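/*
 * Illustrative sketch (not part of the driver): vsw_ldc_attach() above
 * uses a 'progress' bitmask so its single failure path only unwinds the
 * steps that actually completed. The same idiom in miniature; step_a,
 * step_b and the undo helpers are hypothetical.
 */
#if 0
static int
example_setup(void)
{
	enum { P_none = 0x0, P_a = 0x1, P_b = 0x2 } progress = P_none;

	if (step_a() != 0)
		goto fail;
	progress |= P_a;

	if (step_b() != 0)
		goto fail;
	progress |= P_b;

	return (0);

fail:
	/* unwind only what was set up, in reverse order */
	if (progress & P_b)
		undo_b();
	if (progress & P_a)
		undo_a();
	return (1);
}
#endif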
/*
 * Open and attempt to bring up the channel. Note that the channel
 * can only be brought up if the peer has also opened the channel.
 *
 * Returns 0 if it can open and bring up the channel, otherwise
 * returns 1.
 */
static int
vsw_ldc_init(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	ldc_status_t	istatus = 0;
	int		rv;

	D1(vswp, "%s: enter", __func__);

	LDC_ENTER_LOCK(ldcp);

	/* don't start at 0 in case clients don't like that */
	ldcp->next_ident = 1;

	rv = ldc_open(ldcp->ldc_handle);
	if (rv != 0) {
		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
		    __func__, ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
		DERR(vswp, "%s: unable to get status", __func__);
		LDC_EXIT_LOCK(ldcp);
		return (1);

	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
		    __func__, ldcp->ldc_id, istatus);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	mutex_enter(&ldcp->status_lock);
	ldcp->ldc_status = istatus;
	mutex_exit(&ldcp->status_lock);

	rv = ldc_up(ldcp->ldc_handle);
	if (rv != 0) {
		/*
		 * Not a fatal error for ldc_up() to fail, as the peer
		 * end point may simply not be ready yet.
		 */
		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
		    ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	/*
	 * The ldc_up() call is non-blocking so we need to explicitly
	 * check the channel status to see if the channel is in fact UP.
	 */
	mutex_enter(&ldcp->status_lock);
	istatus = ldcp->ldc_status;
	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
		DERR(vswp, "%s: unable to get status", __func__);
		mutex_exit(&ldcp->status_lock);
		LDC_EXIT_LOCK(ldcp);
		return (1);

	}
	mutex_exit(&ldcp->status_lock);
	LDC_EXIT_LOCK(ldcp);

	if ((istatus != LDC_UP) && (ldcp->ldc_status == LDC_UP)) {
		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
		    ldcp->ldc_id, istatus);
		vsw_restart_handshake(ldcp);
	}

	D1(vswp, "%s: exit", __func__);
	return (0);
}

/* disable callbacks on the channel */
static int
vsw_ldc_uninit(vsw_ldc_t *ldcp)
{
	vsw_t	*vswp = ldcp->ldc_vswp;
	int	rv;

	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);

	LDC_ENTER_LOCK(ldcp);

	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
	if (rv != 0) {
		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	mutex_enter(&ldcp->status_lock);
	ldcp->ldc_status = LDC_INIT;
	mutex_exit(&ldcp->status_lock);

	LDC_EXIT_LOCK(ldcp);

	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);

	return (0);
}

static int
vsw_init_ldcs(vsw_port_t *port)
{
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	vsw_ldc_t	*ldcp;

	READ_ENTER(&ldcl->lockrw);
	ldcp = ldcl->head;
	for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
		(void) vsw_ldc_init(ldcp);
	}
	RW_EXIT(&ldcl->lockrw);

	return (0);
}
"vsw_uninit_ldcs: exit\n"); 3171 3172 return (0); 3173 } 3174 3175 /* 3176 * Wait until the callback(s) associated with the ldcs under the specified 3177 * port have completed. 3178 * 3179 * Prior to this function being invoked each channel under this port 3180 * should have been quiesced via ldc_set_cb_mode(DISABLE). 3181 * 3182 * A short explaination of what we are doing below.. 3183 * 3184 * The simplest approach would be to have a reference counter in 3185 * the ldc structure which is increment/decremented by the callbacks as 3186 * they use the channel. The drain function could then simply disable any 3187 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately 3188 * there is a tiny window here - before the callback is able to get the lock 3189 * on the channel it is interrupted and this function gets to execute. It 3190 * sees that the ref count is zero and believes its free to delete the 3191 * associated data structures. 3192 * 3193 * We get around this by taking advantage of the fact that before the ldc 3194 * framework invokes a callback it sets a flag to indicate that there is a 3195 * callback active (or about to become active). If when we attempt to 3196 * unregister a callback when this active flag is set then the unregister 3197 * will fail with EWOULDBLOCK. 3198 * 3199 * If the unregister fails we do a cv_timedwait. We will either be signaled 3200 * by the callback as it is exiting (note we have to wait a short period to 3201 * allow the callback to return fully to the ldc framework and it to clear 3202 * the active flag), or by the timer expiring. In either case we again attempt 3203 * the unregister. We repeat this until we can succesfully unregister the 3204 * callback. 3205 * 3206 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch 3207 * the case where the callback has finished but the ldc framework has not yet 3208 * cleared the active flag. In this case we would never get a cv_signal. 3209 */ 3210 static int 3211 vsw_drain_ldcs(vsw_port_t *port) 3212 { 3213 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3214 vsw_ldc_t *ldcp; 3215 vsw_t *vswp = port->p_vswp; 3216 3217 D1(vswp, "%s: enter", __func__); 3218 3219 READ_ENTER(&ldcl->lockrw); 3220 3221 ldcp = ldcl->head; 3222 3223 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 3224 /* 3225 * If we can unregister the channel callback then we 3226 * know that there is no callback either running or 3227 * scheduled to run for this channel so move on to next 3228 * channel in the list. 3229 */ 3230 mutex_enter(&ldcp->drain_cv_lock); 3231 3232 /* prompt active callbacks to quit */ 3233 ldcp->drain_state = VSW_LDC_DRAINING; 3234 3235 if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) { 3236 D2(vswp, "%s: unreg callback for chan %ld", __func__, 3237 ldcp->ldc_id); 3238 mutex_exit(&ldcp->drain_cv_lock); 3239 continue; 3240 } else { 3241 /* 3242 * If we end up here we know that either 1) a callback 3243 * is currently executing, 2) is about to start (i.e. 3244 * the ldc framework has set the active flag but 3245 * has not actually invoked the callback yet, or 3) 3246 * has finished and has returned to the ldc framework 3247 * but the ldc framework has not yet cleared the 3248 * active bit. 3249 * 3250 * Wait for it to finish. 
/*
 * Wait until all tasks which reference this port have completed.
 *
 * Prior to this function being invoked each channel under this port
 * should have been quiesced via ldc_set_cb_mode(DISABLE).
 */
static int
vsw_drain_port_taskq(vsw_port_t *port)
{
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Mark the port as in the process of being detached, and
	 * dispatch a marker task to the queue so we know when all
	 * relevant tasks have completed.
	 */
	mutex_enter(&port->state_lock);
	port->state = VSW_PORT_DETACHING;

	if ((vswp->taskq_p == NULL) ||
	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
		DERR(vswp, "%s: unable to dispatch marker task",
		    __func__);
		mutex_exit(&port->state_lock);
		return (1);
	}

	/*
	 * Wait for the marker task to finish.
	 */
	while (port->state != VSW_PORT_DETACHABLE)
		cv_wait(&port->state_cv, &port->state_lock);

	mutex_exit(&port->state_lock);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

static void
vsw_marker_task(void *arg)
{
	vsw_port_t	*port = arg;
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	mutex_enter(&port->state_lock);

	/*
	 * No further tasks should be dispatched which reference
	 * this port so it is ok to mark it as safe to detach.
	 */
	port->state = VSW_PORT_DETACHABLE;

	cv_signal(&port->state_cv);

	mutex_exit(&port->state_lock);

	D1(vswp, "%s: exit", __func__);
}

static vsw_port_t *
vsw_lookup_port(vsw_t *vswp, int p_instance)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*port;

	for (port = plist->head; port != NULL; port = port->p_next) {
		if (port->p_instance == p_instance) {
			D2(vswp, "vsw_lookup_port: found p_instance\n");
			return (port);
		}
	}

	return (NULL);
}

/*
 * Search for and remove the specified port from the port
 * list. Returns 0 if able to locate and remove port, otherwise
 * returns 1.
 */
static int
vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*curr_p, *prev_p;

	if (plist->head == NULL)
		return (1);

	curr_p = prev_p = plist->head;

	while (curr_p != NULL) {
		if (curr_p == port) {
			if (prev_p == curr_p) {
				plist->head = curr_p->p_next;
			} else {
				prev_p->p_next = curr_p->p_next;
			}
			plist->num_ports--;
			break;
		} else {
			prev_p = curr_p;
			curr_p = curr_p->p_next;
		}
	}

	/* return 1 if the port was not found, per the contract above */
	if (curr_p == NULL)
		return (1);

	return (0);
}

/*
 * Interrupt handler for ldc messages.
 */
static uint_t
vsw_ldc_cb(uint64_t event, caddr_t arg)
{
	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
	vsw_t		*vswp = ldcp->ldc_vswp;
	ldc_status_t	lstatus;
	int		rv;

	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);

	mutex_enter(&ldcp->ldc_cblock);

	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
		mutex_exit(&ldcp->ldc_cblock);
		return (LDC_SUCCESS);
	}

	mutex_enter(&ldcp->status_lock);
	lstatus = ldcp->ldc_status;
	rv = ldc_status(ldcp->ldc_handle, &ldcp->ldc_status);
	mutex_exit(&ldcp->status_lock);
	if (rv != 0) {
		cmn_err(CE_WARN, "Unable to read channel state");
		goto vsw_cb_exit;
	}

	if (event & LDC_EVT_UP) {
		/*
		 * Channel has come up, get the state and then start
		 * the handshake.
		 */
		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
		    __func__, ldcp->ldc_id, event, lstatus);
		D2(vswp, "%s: UP: old status %ld : cur status %ld",
		    __func__, lstatus, ldcp->ldc_status);
		if ((ldcp->ldc_status != lstatus) &&
		    (ldcp->ldc_status == LDC_UP)) {
			vsw_restart_handshake(ldcp);
		}

		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
	}

	if (event & LDC_EVT_READ) {
		/*
		 * Data available for reading.
		 */
		D2(vswp, "%s: id(%ld) event(%llx) data READ",
		    __func__, ldcp->ldc_id, event);

		vsw_process_pkt(ldcp);

		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);

		goto vsw_cb_exit;
	}

	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
		D2(vswp, "%s: id(%ld) event(%llx) DOWN/RESET",
		    __func__, ldcp->ldc_id, event);

		/* attempt to restart the connection */
		vsw_restart_ldc(ldcp);

		/*
		 * vsw_restart_ldc() will attempt to bring the channel
		 * back up. Check here to see if that succeeded.
		 */
		mutex_enter(&ldcp->status_lock);
		lstatus = ldcp->ldc_status;
		rv = ldc_status(ldcp->ldc_handle, &ldcp->ldc_status);
		mutex_exit(&ldcp->status_lock);
		if (rv != 0) {
			DERR(vswp, "%s: unable to read status for channel %ld",
			    __func__, ldcp->ldc_id);
			goto vsw_cb_exit;
		}

		D2(vswp, "%s: id(%ld) event(%llx) DOWN/RESET event:"
		    " old status %ld : cur status %ld", __func__,
		    ldcp->ldc_id, event, lstatus, ldcp->ldc_status);

		/*
		 * If the channel was not previously UP then (re)start the
		 * handshake.
		 */
		if ((ldcp->ldc_status == LDC_UP) && (lstatus != LDC_UP)) {
			D2(vswp, "%s: channel %ld now UP, restarting "
			    "handshake", __func__, ldcp->ldc_id);
			vsw_restart_handshake(ldcp);
		}
	}

	/*
	 * Catch either LDC_EVT_WRITE which we don't support or any
	 * unknown event.
	 */
	if (event &
	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {

		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
	}

vsw_cb_exit:
	mutex_exit(&ldcp->ldc_cblock);

	/*
	 * Let the drain function know we are finishing if it
	 * is waiting.
	 */
	mutex_enter(&ldcp->drain_cv_lock);
	if (ldcp->drain_state == VSW_LDC_DRAINING)
		cv_signal(&ldcp->drain_cv);
	mutex_exit(&ldcp->drain_cv_lock);

	return (LDC_SUCCESS);
}
/*
 * Restart the connection with our peer. Free any existing
 * data structures and then attempt to bring the channel back
 * up.
 */
static void
vsw_restart_ldc(vsw_ldc_t *ldcp)
{
	int		rv;
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_port_t	*port;
	vsw_ldc_list_t	*ldcl;

	D1(vswp, "%s: enter", __func__);

	port = ldcp->ldc_port;
	ldcl = &port->p_ldclist;

	READ_ENTER(&ldcl->lockrw);

	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);

	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);
	RW_EXIT(&ldcl->lockrw);

	ldcp->lane_in.lstate = 0;
	ldcp->lane_out.lstate = 0;

	/*
	 * Remove the parent port from any multicast groups
	 * it may have registered with. The client must resend
	 * the multicast add command after the handshake completes.
	 */
	(void) vsw_del_fdb(vswp, port);

	vsw_del_mcst_port(port);

	ldcp->peer_session = 0;
	ldcp->session_status = 0;
	ldcp->hcnt = 0;
	ldcp->hphase = VSW_MILESTONE0;

	rv = ldc_up(ldcp->ldc_handle);
	if (rv != 0) {
		/*
		 * Not a fatal error for ldc_up() to fail, as the peer
		 * end point may simply not be ready yet.
		 */
		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
		    ldcp->ldc_id, rv);
	}

	D1(vswp, "%s: exit", __func__);
}

/*
 * (Re)start a handshake with our peer by sending them
 * our version info.
 */
static void
vsw_restart_handshake(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;

	D1(vswp, "vsw_restart_handshake: enter");

	if (ldcp->hphase != VSW_MILESTONE0) {
		vsw_restart_ldc(ldcp);
	}

	/*
	 * We now increment the transaction group id. This allows
	 * us to identify and discard any tasks which are still pending
	 * on the taskq and refer to the handshake session we are about
	 * to restart. These stale messages no longer have any real
	 * meaning.
	 */
	mutex_enter(&ldcp->hss_lock);
	ldcp->hss_id++;
	mutex_exit(&ldcp->hss_lock);

	if (ldcp->hcnt++ > vsw_num_handshakes) {
		cmn_err(CE_WARN, "exceeded number of permitted "
		    "handshake attempts (%d) on channel %ld",
		    ldcp->hcnt, ldcp->ldc_id);
		return;
	}

	if ((vswp->taskq_p == NULL) ||
	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
	    DDI_NOSLEEP) != DDI_SUCCESS)) {
		cmn_err(CE_WARN, "Can't dispatch version handshake task");
	}

	D1(vswp, "vsw_restart_handshake: exit");
}

/*
 * Deal appropriately with an ECONNRESET event encountered in an ldc_*
 * call.
 */
static void
vsw_handle_reset(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	ldc_status_t	lstatus;

	D1(vswp, "%s: enter", __func__);

	mutex_enter(&ldcp->status_lock);
	lstatus = ldcp->ldc_status;
	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
		DERR(vswp, "%s: unable to read status for channel %ld",
		    __func__, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}
	mutex_exit(&ldcp->status_lock);

	/*
	 * Check the channel's previous recorded state to
	 * determine if this is the first ECONNRESET event
	 * we've gotten for this particular channel (i.e. was
	 * previously up but is no longer). If so, terminate
	 * the channel.
	 */
	if ((ldcp->ldc_status != LDC_UP) && (lstatus == LDC_UP)) {
		vsw_restart_ldc(ldcp);
	}

	/*
	 * vsw_restart_ldc() will also attempt to bring the channel
	 * back up. Check here if that succeeds.
	 */
	mutex_enter(&ldcp->status_lock);
	lstatus = ldcp->ldc_status;
	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
		DERR(vswp, "%s: unable to read status for channel %ld",
		    __func__, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}
	mutex_exit(&ldcp->status_lock);

	/*
	 * If the channel is now up and no one else (i.e. the callback
	 * routine) has dealt with it then we restart the handshake here.
	 */
	if ((lstatus != LDC_UP) && (ldcp->ldc_status == LDC_UP)) {
		vsw_restart_handshake(ldcp);
	}

	D1(vswp, "%s: exit", __func__);
}

/*
 * Returns 0 if it was legal for the event signified by flag to have
 * occurred at the time it did. Otherwise returns 1.
 */
int
vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	uint64_t	state;
	uint64_t	phase;

	if (dir == INBOUND)
		state = ldcp->lane_in.lstate;
	else
		state = ldcp->lane_out.lstate;

	phase = ldcp->hphase;

	switch (flag) {
	case VSW_VER_INFO_RECV:
		if (phase > VSW_MILESTONE0) {
			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_restart_handshake(ldcp);
			return (1);
		}
		break;

	case VSW_VER_ACK_RECV:
	case VSW_VER_NACK_RECV:
		if (!(state & VSW_VER_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK"
			    " or VER_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			vsw_restart_handshake(ldcp);
			return (1);
		} else
			state &= ~VSW_VER_INFO_SENT;
		break;

	case VSW_ATTR_INFO_RECV:
		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_restart_handshake(ldcp);
			return (1);
		}
		break;

	case VSW_ATTR_ACK_RECV:
	case VSW_ATTR_NACK_RECV:
		if (!(state & VSW_ATTR_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
			    " or ATTR_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			vsw_restart_handshake(ldcp);
			return (1);
		} else
			state &= ~VSW_ATTR_INFO_SENT;
		break;

	case VSW_DRING_INFO_RECV:
		if (phase < VSW_MILESTONE1) {
			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_restart_handshake(ldcp);
			return (1);
		}
		break;

	case VSW_DRING_ACK_RECV:
	case VSW_DRING_NACK_RECV:
		if (!(state & VSW_DRING_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK"
			    " or DRING_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			vsw_restart_handshake(ldcp);
			return (1);
		} else
			state &= ~VSW_DRING_INFO_SENT;
		break;

	case VSW_RDX_INFO_RECV:
		if (phase < VSW_MILESTONE3) {
			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_restart_handshake(ldcp);
			return (1);
		}
		break;

	case VSW_RDX_ACK_RECV:
	case VSW_RDX_NACK_RECV:
		if (!(state & VSW_RDX_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK"
			    " or RDX_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			vsw_restart_handshake(ldcp);
			return (1);
		} else
			state &= ~VSW_RDX_INFO_SENT;
		break;

	case VSW_MCST_INFO_RECV:
		if (phase < VSW_MILESTONE3) {
			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_restart_handshake(ldcp);
			return (1);
		}
		break;

	default:
		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
		    ldcp->ldc_id, flag);
		return (1);
	}

	if (dir == INBOUND)
		ldcp->lane_in.lstate = state;
	else
		ldcp->lane_out.lstate = state;

	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);

	return (0);
}
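/*
 * Illustrative sketch (not part of the driver): the packet handlers
 * below gate every ACK/NACK on vsw_check_flag(), which both validates
 * the event against the current handshake phase and clears the
 * corresponding _SENT bit from the lane state. The calling pattern is:
 */
#if 0
	/* inside an ACK handler for the outbound lane */
	if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
		return;		/* illegal now; handshake was restarted */

	ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
	vsw_next_milestone(ldcp);
#endif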
void
vsw_next_milestone(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
	    ldcp->ldc_id, ldcp->hphase);

	DUMP_FLAGS(ldcp->lane_in.lstate);
	DUMP_FLAGS(ldcp->lane_out.lstate);

	switch (ldcp->hphase) {

	case VSW_MILESTONE0:
		/*
		 * If we haven't started to handshake with our peer,
		 * start to do so now.
		 */
		if (ldcp->lane_out.lstate == 0) {
			D2(vswp, "%s: (chan %lld) starting handshake "
			    "with peer", __func__, ldcp->ldc_id);
			vsw_restart_handshake(ldcp);
		}

		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated version info.
		 */
		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 0",
			    __func__, ldcp->ldc_id);

			/*
			 * Next milestone is passed when attribute
			 * information has been successfully exchanged.
			 */
			ldcp->hphase = VSW_MILESTONE1;
			vsw_send_attr(ldcp);

		}
		break;

	case VSW_MILESTONE1:
		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated attribute information.
		 */
		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {

			ldcp->hphase = VSW_MILESTONE2;

			/*
			 * If the peer device has said it wishes to
			 * use descriptor rings then we send it our ring
			 * info, otherwise we just set up a private ring
			 * which we use as an internal buffer.
			 */
			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
				vsw_send_dring_info(ldcp);
		}
		break;


	case VSW_MILESTONE2:
		/*
		 * If the peer has indicated in its attribute message that
		 * it wishes to use descriptor rings then the only way
		 * to pass this milestone is for us to have received
		 * valid dring info.
		 *
		 * If the peer is not using descriptor rings then just fall
		 * through.
		 */
		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
		    (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
			break;

		D2(vswp, "%s: (chan %lld) leaving milestone 2",
		    __func__, ldcp->ldc_id);

		ldcp->hphase = VSW_MILESTONE3;
		vsw_send_rdx(ldcp);
		break;

	case VSW_MILESTONE3:
		/*
		 * Pass this milestone when all parameters have been
		 * successfully exchanged and RDX sent in both directions.
		 *
		 * Mark the outbound lane as available to transmit data.
		 */
		if ((ldcp->lane_in.lstate & VSW_RDX_ACK_SENT) &&
		    (ldcp->lane_out.lstate & VSW_RDX_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 3",
			    __func__, ldcp->ldc_id);
			D2(vswp, "%s: ** handshake complete (0x%llx : "
			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
			    ldcp->lane_out.lstate);
			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
			ldcp->hphase = VSW_MILESTONE4;
			ldcp->hcnt = 0;
			DISPLAY_STATE();
		} else {
			D2(vswp, "%s: still in milestone 3 (0x%llx :"
			    " 0x%llx)", __func__, ldcp->lane_in.lstate,
			    ldcp->lane_out.lstate);
		}
		break;

	case VSW_MILESTONE4:
		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
		    ldcp->ldc_id);
		break;

	default:
		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
		    ldcp->ldc_id, ldcp->hphase);
	}

	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
	    ldcp->hphase);
}

/*
 * Check if major version is supported.
 *
 * Returns 0 if it finds a supported major number, and if necessary
 * adjusts the minor field.
 *
 * Returns 1 if it can't match the major number exactly. Sets major/minor
 * to the next lowest supported values, or to zero if no other values are
 * possible.
 */
static int
vsw_supported_version(vio_ver_msg_t *vp)
{
	int	i;

	D1(NULL, "vsw_supported_version: enter");

	for (i = 0; i < VSW_NUM_VER; i++) {
		if (vsw_versions[i].ver_major == vp->ver_major) {
			/*
			 * Matching major version found. Update
			 * the minor number if necessary.
			 */
			if (vp->ver_minor > vsw_versions[i].ver_minor) {
				D2(NULL, "%s: adjusting minor value"
				    " from %d to %d", __func__,
				    vp->ver_minor,
				    vsw_versions[i].ver_minor);
				vp->ver_minor = vsw_versions[i].ver_minor;
			}

			return (0);
		}

		if (vsw_versions[i].ver_major < vp->ver_major) {
			/*
			 * Set the major to the next lowest value we
			 * support, per the contract above. (The original
			 * code adjusted only the minor field here.)
			 */
			vp->ver_major = vsw_versions[i].ver_major;
			if (vp->ver_minor > vsw_versions[i].ver_minor) {
				D2(NULL, "%s: adjusting minor value"
				    " from %d to %d", __func__,
				    vp->ver_minor,
				    vsw_versions[i].ver_minor);
				vp->ver_minor = vsw_versions[i].ver_minor;
			}
			return (1);
		}
	}

	/* No match was possible, zero out fields */
	vp->ver_major = 0;
	vp->ver_minor = 0;

	D1(NULL, "vsw_supported_version: exit");

	return (1);
}
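/*
 * Illustrative sketch (not from the source): assuming a descending
 * vsw_versions[] table of ver_major/ver_minor pairs such as the
 * hypothetical one below, a peer proposing 2.5 would be ACKed with the
 * minor clamped to 2.1, while a peer proposing 3.6 would be NACKed back
 * with 2.1, the next lowest supported pairing.
 */
#if 0
static struct {
	uint32_t	ver_major;
	uint32_t	ver_minor;
} example_versions[] = {
	{ 2, 1 },	/* highest supported: 2.1 */
	{ 1, 3 }	/* lowest supported:  1.3 */
};
#endif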
/*
 * Main routine for processing messages received over LDC.
 */
static void
vsw_process_pkt(void *arg)
{
	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
	vsw_t		*vswp = ldcp->ldc_vswp;
	size_t		msglen;
	vio_msg_tag_t	tag;
	def_msg_t	dmsg;
	int		rv = 0;


	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);

	/*
	 * If the channel is up, read messages until the channel is empty.
	 */
	do {
		msglen = sizeof (dmsg);
		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);

		if (rv != 0) {
			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) "
			    "len(%d)\n", __func__, ldcp->ldc_id,
			    rv, msglen);
		}

		/* channel has been reset */
		if (rv == ECONNRESET) {
			vsw_handle_reset(ldcp);
			break;
		}

		if (msglen == 0) {
			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
			    ldcp->ldc_id);
			break;
		}

		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
		    ldcp->ldc_id, msglen);

		/*
		 * Figure out what sort of packet we have gotten by
		 * examining the msg tag, and then switch it appropriately.
		 */
		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));

		switch (tag.vio_msgtype) {
		case VIO_TYPE_CTRL:
			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
			break;
		case VIO_TYPE_DATA:
			vsw_process_data_pkt(ldcp, &dmsg, tag);
			break;
		case VIO_TYPE_ERR:
			vsw_process_err_pkt(ldcp, &dmsg, tag);
			break;
		default:
			DERR(vswp, "%s: Unknown tag(%lx) id(%lx)\n",
			    __func__, tag.vio_msgtype, ldcp->ldc_id);
			break;
		}
	} while (msglen);

	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
}
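/*
 * Illustrative sketch (an assumption based on the VIO mailbox
 * interface, not a copy of the real definition): every message starts
 * with a vio_msg_tag_t, which is why vsw_process_pkt() above can bcopy
 * the head of any packet into a tag and demultiplex on it:
 *
 *	type (CTRL/DATA/ERR) -> subtype (INFO/ACK/NACK)
 *	    -> subtype_env (VIO_VER_INFO, VIO_ATTR_INFO, VIO_RDX, ...)
 */
#if 0
typedef struct example_vio_msg_tag {
	uint8_t		vio_msgtype;		/* VIO_TYPE_CTRL/DATA/ERR */
	uint8_t		vio_subtype;		/* VIO_SUBTYPE_INFO/ACK/NACK */
	uint16_t	vio_subtype_env;	/* e.g. VIO_VER_INFO */
	uint32_t	vio_sid;		/* session id */
} example_vio_msg_tag_t;
#endif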
/*
 * Dispatch a task to process a VIO control message.
 */
static void
vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
{
	vsw_ctrl_task_t		*ctaskp = NULL;
	vsw_port_t		*port = ldcp->ldc_port;
	vsw_t			*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	/*
	 * We need to handle RDX ACK messages in-band as once they
	 * are exchanged it is possible that we will get an
	 * immediate (legitimate) data packet.
	 */
	if ((tag.vio_subtype_env == VIO_RDX) &&
	    (tag.vio_subtype == VIO_SUBTYPE_ACK)) {

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_ACK_RECV))
			return;

		ldcp->lane_out.lstate |= VSW_RDX_ACK_RECV;
		D2(vswp, "%s (%ld) handling RDX_ACK in place "
		    "(ostate 0x%llx : hphase %d)", __func__,
		    ldcp->ldc_id, ldcp->lane_out.lstate, ldcp->hphase);
		vsw_next_milestone(ldcp);
		return;
	}

	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);

	if (ctaskp == NULL) {
		DERR(vswp, "%s: unable to alloc space for ctrl"
		    " msg", __func__);
		vsw_restart_handshake(ldcp);
		return;
	}

	ctaskp->ldcp = ldcp;
	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
	mutex_enter(&ldcp->hss_lock);
	ctaskp->hss_id = ldcp->hss_id;
	mutex_exit(&ldcp->hss_lock);

	/*
	 * Dispatch the task to the processing taskq if the port is not in
	 * the process of being detached.
	 */
	mutex_enter(&port->state_lock);
	if (port->state == VSW_PORT_INIT) {
		if ((vswp->taskq_p == NULL) ||
		    (ddi_taskq_dispatch(vswp->taskq_p,
		    vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP)
		    != DDI_SUCCESS)) {
			DERR(vswp, "%s: unable to dispatch task to taskq",
			    __func__);
			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
			mutex_exit(&port->state_lock);
			vsw_restart_handshake(ldcp);
			return;
		}
	} else {
		DWARN(vswp, "%s: port %d detaching, not dispatching "
		    "task", __func__, port->p_instance);
		/* free the task; the original code leaked it here */
		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
	}

	mutex_exit(&port->state_lock);

	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
	    ldcp->ldc_id);
	D1(vswp, "%s: exit", __func__);
}

/*
 * Process a VIO ctrl message. Invoked from taskq.
 */
static void
vsw_process_ctrl_pkt(void *arg)
{
	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
	vsw_ldc_t	*ldcp = ctaskp->ldcp;
	vsw_t		*vswp = ldcp->ldc_vswp;
	vio_msg_tag_t	tag;
	uint16_t	env;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
	env = tag.vio_subtype_env;

	/* stale pkt check */
	mutex_enter(&ldcp->hss_lock);
	if (ctaskp->hss_id < ldcp->hss_id) {
		DWARN(vswp, "%s: discarding stale packet belonging to"
		    " earlier (%ld) handshake session", __func__,
		    ctaskp->hss_id);
		mutex_exit(&ldcp->hss_lock);
		/* free the task; the original code leaked it here */
		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
		return;
	}
	mutex_exit(&ldcp->hss_lock);

	/* session id check */
	if (ldcp->session_status & VSW_PEER_SESSION) {
		if (ldcp->peer_session != tag.vio_sid) {
			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
			    __func__, ldcp->ldc_id, tag.vio_sid);
			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
			vsw_restart_handshake(ldcp);
			return;
		}
	}

	/*
	 * Switch on the vio_subtype envelope, then let the lower routines
	 * decide if it is an INFO, ACK or NACK packet.
	 */
	switch (env) {
	case VIO_VER_INFO:
		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
		break;
	case VIO_DRING_REG:
		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
		break;
	case VIO_DRING_UNREG:
		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
		break;
	case VIO_ATTR_INFO:
		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
		break;
	case VNET_MCAST_INFO:
		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
		break;
	case VIO_RDX:
		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
		break;
	default:
		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
		    __func__, env);
	}

	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
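/*
 * Illustrative sketch (not part of the driver): why the hss_id stamp
 * matters. Each queued control task carries the handshake session id
 * that was current at dispatch time; vsw_restart_handshake() bumps
 * ldcp->hss_id, so any task still sitting on the taskq from the old
 * session compares stale and is discarded rather than being replayed
 * into the new handshake.
 */
#if 0
	mutex_enter(&ldcp->hss_lock);
	stale = (ctaskp->hss_id < ldcp->hss_id);
	mutex_exit(&ldcp->hss_lock);
	if (stale) {
		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
		return;		/* belongs to a superseded handshake */
	}
#endif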
/*
 * Version negotiation. We can end up here either because our peer
 * has responded to a handshake message we have sent it, or our peer
 * has initiated a handshake with us. If it's the former then it can
 * only be an ACK or NACK, if it's the latter it can only be an INFO.
 *
 * If it's an ACK we move to the next stage of the handshake, namely
 * attribute exchange. If it's a NACK we see if we can specify another
 * version, if we can't we stop.
 *
 * If it is an INFO we reset all params associated with communication
 * in that direction over this channel (remember the connection is
 * essentially 2 independent simplex channels).
 */
void
vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vio_ver_msg_t	*ver_pkt;
	vsw_t		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a ctrl/version packet so
	 * cast it into the correct structure.
	 */
	ver_pkt = (vio_ver_msg_t *)pkt;

	switch (ver_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");

		/*
		 * Record the session id, which we will use from now
		 * until we see another VER_INFO msg. Even then the
		 * session id in most cases will be unchanged, except
		 * if the channel was reset.
		 */
		if ((ldcp->session_status & VSW_PEER_SESSION) &&
		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
			DERR(vswp, "%s: updating session id for chan %lld "
			    "from %llx to %llx", __func__, ldcp->ldc_id,
			    ldcp->peer_session, ver_pkt->tag.vio_sid);
		}

		ldcp->peer_session = ver_pkt->tag.vio_sid;
		ldcp->session_status |= VSW_PEER_SESSION;

		/* Legal message at this time ? */
		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
			return;

		/*
		 * First check the device class. Currently we only expect
		 * to be talking to a network device. In the future we may
		 * also talk to another switch.
		 */
		if (ver_pkt->dev_class != VDEV_NETWORK) {
			DERR(vswp, "%s: illegal device class %d", __func__,
			    ver_pkt->dev_class);

			ver_pkt->tag.vio_sid = ldcp->local_session;
			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);

			vsw_send_msg(ldcp, (void *)ver_pkt,
			    sizeof (vio_ver_msg_t));

			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
			vsw_next_milestone(ldcp);
			return;
		} else {
			ldcp->dev_class = ver_pkt->dev_class;
		}

		/*
		 * Now check the version.
		 */
		if (vsw_supported_version(ver_pkt) == 0) {
			/*
			 * We support this major version and possibly an
			 * adjusted minor version.
			 */

			D2(vswp, "%s: accepted ver %d:%d", __func__,
			    ver_pkt->ver_major, ver_pkt->ver_minor);

			/* Store accepted values */
			ldcp->lane_in.ver_major = ver_pkt->ver_major;
			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;

			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;

			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
		} else {
			/*
			 * NACK back with the next lower major/minor
			 * pairing we support (if we don't support any more
			 * versions then they will have been set to zero).
4286 */
4287 
4288 			D2(vswp, "%s: replying with ver %d:%d", __func__,
4289 			    ver_pkt->ver_major, ver_pkt->ver_minor);
4290 
4291 			/* Store updated values */
4292 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
4293 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4294 
4295 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4296 
4297 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
4298 		}
4299 
4300 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4301 		ver_pkt->tag.vio_sid = ldcp->local_session;
4302 		vsw_send_msg(ldcp, (void *)ver_pkt, sizeof (vio_ver_msg_t));
4303 
4304 		vsw_next_milestone(ldcp);
4305 		break;
4306 
4307 	case VIO_SUBTYPE_ACK:
4308 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
4309 
4310 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
4311 			return;
4312 
4313 		/* Store the accepted values for the outbound lane */
4314 		ldcp->lane_out.ver_major = ver_pkt->ver_major;
4315 		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
4316 
4317 
4318 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
4319 		vsw_next_milestone(ldcp);
4320 
4321 		break;
4322 
4323 	case VIO_SUBTYPE_NACK:
4324 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
4325 
4326 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
4327 			return;
4328 
4329 		/*
4330 		 * If our peer sent us a NACK with the ver fields set to
4331 		 * zero then there is nothing more we can do. Otherwise see
4332 		 * if we support either the version suggested, or a lesser
4333 		 * one.
4334 		 */
4335 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
4336 			DERR(vswp, "%s: peer unable to negotiate any "
4337 			    "further.", __func__);
4338 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
4339 			vsw_next_milestone(ldcp);
4340 			return;
4341 		}
4342 
4343 		/*
4344 		 * Check to see if we support this major version or
4345 		 * a lower one. If we don't then maj/min will be set
4346 		 * to zero.
4347 		 */
4348 		(void) vsw_supported_version(ver_pkt);
4349 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
4350 			/* Nothing more we can do */
4351 			DERR(vswp, "%s: version negotiation failed.\n",
4352 			    __func__);
4353 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
4354 			vsw_next_milestone(ldcp);
4355 		} else {
4356 			/* found a supported major version */
4357 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
4358 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
4359 
4360 			D2(vswp, "%s: resending with updated values (%x, %x)",
4361 			    __func__, ver_pkt->ver_major,
4362 			    ver_pkt->ver_minor);
4363 
4364 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
4365 			ver_pkt->tag.vio_sid = ldcp->local_session;
4366 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4367 
4368 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4369 
4370 			vsw_send_msg(ldcp, (void *)ver_pkt,
4371 			    sizeof (vio_ver_msg_t));
4372 
4373 			vsw_next_milestone(ldcp);
4374 
4375 		}
4376 		break;
4377 
4378 	default:
4379 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
4380 		    ver_pkt->tag.vio_subtype);
4381 	}
4382 
4383 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
4384 }
4385 
4386 /*
4387  * Process an attribute packet. We can end up here either because our peer
4388  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or because
4389  * our peer has sent us an attribute INFO message.
4390  *
4391  * If it is an ACK we then move to the next stage of the handshake, which
4392  * is to send our descriptor ring info to our peer. If it is a NACK then
4393  * there is nothing more we can (currently) do.
4394  *
4395  * If we get a valid/acceptable INFO packet (and we have already negotiated
4396  * a version) we ACK back and set the channel state to ATTR_RECV, otherwise
4397  * we NACK back and reset the channel state to INACTIV.
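 *
 * The attributes exchanged are those carried in a vnet_attr_msg_t:
 * mtu, addr, addr_type, xfer_mode and ack_freq (see the INFO case
 * below, where they are copied into the inbound lane).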
4398  *
4399  * FUTURE: in time we will probably negotiate over attributes, but for
4400  * the moment unacceptable attributes are regarded as a fatal error.
4401  *
4402  */
4403 void
4404 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
4405 {
4406 	vnet_attr_msg_t	*attr_pkt;
4407 	vsw_t		*vswp = ldcp->ldc_vswp;
4408 	vsw_port_t	*port = ldcp->ldc_port;
4409 	uint64_t	macaddr = 0;
4410 	int		i;
4411 
4412 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
4413 
4414 	/*
4415 	 * We know this is a ctrl/attr packet so
4416 	 * cast it into the correct structure.
4417 	 */
4418 	attr_pkt = (vnet_attr_msg_t *)pkt;
4419 
4420 	switch (attr_pkt->tag.vio_subtype) {
4421 	case VIO_SUBTYPE_INFO:
4422 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4423 
4424 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
4425 			return;
4426 
4427 		/*
4428 		 * If the attributes are unacceptable then we NACK back.
4429 		 */
4430 		if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) {
4431 
4432 			DERR(vswp, "%s (chan %d): invalid attributes",
4433 			    __func__, ldcp->ldc_id);
4434 
4435 			vsw_free_lane_resources(ldcp, INBOUND);
4436 
4437 			attr_pkt->tag.vio_sid = ldcp->local_session;
4438 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4439 
4440 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
4441 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
4442 			vsw_send_msg(ldcp, (void *)attr_pkt,
4443 			    sizeof (vnet_attr_msg_t));
4444 
4445 			vsw_next_milestone(ldcp);
4446 			return;
4447 		}
4448 
4449 		/*
4450 		 * Otherwise store the attributes for this lane and update
4451 		 * the lane state.
4452 		 */
4453 		ldcp->lane_in.mtu = attr_pkt->mtu;
4454 		ldcp->lane_in.addr = attr_pkt->addr;
4455 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
4456 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
4457 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
4458 
4459 		macaddr = ldcp->lane_in.addr;
4460 		for (i = ETHERADDRL - 1; i >= 0; i--) {
4461 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
4462 			macaddr >>= 8;
4463 		}
4464 
4465 		/* create the fdb entry for this port/mac address */
4466 		(void) vsw_add_fdb(vswp, port);
4467 
4468 		/* setup device specific xmit routines */
4469 		mutex_enter(&port->tx_lock);
4470 		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
4471 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
4472 			port->transmit = vsw_dringsend;
4473 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
4474 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
4475 			vsw_create_privring(ldcp);
4476 			port->transmit = vsw_descrsend;
4477 		}
4478 		mutex_exit(&port->tx_lock);
4479 
4480 		attr_pkt->tag.vio_sid = ldcp->local_session;
4481 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4482 
4483 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
4484 
4485 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
4486 
4487 		vsw_send_msg(ldcp, (void *)attr_pkt,
4488 		    sizeof (vnet_attr_msg_t));
4489 
4490 		vsw_next_milestone(ldcp);
4491 		break;
4492 
4493 	case VIO_SUBTYPE_ACK:
4494 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4495 
4496 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
4497 			return;
4498 
4499 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
4500 		vsw_next_milestone(ldcp);
4501 		break;
4502 
4503 	case VIO_SUBTYPE_NACK:
4504 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4505 
4506 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
4507 			return;
4508 
4509 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
4510 		vsw_next_milestone(ldcp);
4511 		break;
4512 
4513 	default:
4514 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
4515 		    attr_pkt->tag.vio_subtype);
4516 	}
4517 
4518 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4519 }
4520 
4521 /*
4522  * Process a dring info packet.
We can end up here either because our peer
4523  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
4524  * peer has sent us a dring INFO message.
4525  *
4526  * If we get a valid/acceptable INFO packet (and we have already negotiated
4527  * a version) we ACK back and update the lane state, otherwise we NACK back.
4528  *
4529  * FUTURE: nothing stops a client from sending us info on multiple drings,
4530  * but for the moment we will just use the first one we are given.
4531  *
4532  */
4533 void
4534 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
4535 {
4536 	vio_dring_reg_msg_t	*dring_pkt;
4537 	vsw_t			*vswp = ldcp->ldc_vswp;
4538 	ldc_mem_info_t		minfo;
4539 	dring_info_t		*dp, *dbp;
4540 	int			dring_found = 0;
4541 
4542 	/*
4543 	 * We know this is a ctrl/dring packet so
4544 	 * cast it into the correct structure.
4545 	 */
4546 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
4547 
4548 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
4549 
4550 	switch (dring_pkt->tag.vio_subtype) {
4551 	case VIO_SUBTYPE_INFO:
4552 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4553 
4554 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
4555 			return;
4556 
4557 		/*
4558 		 * If the dring params are unacceptable then we NACK back.
4559 		 */
4560 		if (vsw_check_dring_info(dring_pkt)) {
4561 
4562 			DERR(vswp, "%s (%lld): invalid dring info",
4563 			    __func__, ldcp->ldc_id);
4564 
4565 			vsw_free_lane_resources(ldcp, INBOUND);
4566 
4567 			dring_pkt->tag.vio_sid = ldcp->local_session;
4568 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4569 
4570 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
4571 
4572 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
4573 
4574 			vsw_send_msg(ldcp, (void *)dring_pkt,
4575 			    sizeof (vio_dring_reg_msg_t));
4576 
4577 			vsw_next_milestone(ldcp);
4578 			return;
4579 		}
4580 
4581 		/*
4582 		 * Otherwise, attempt to map in the dring using the
4583 		 * cookie. If that succeeds we send back a unique dring
4584 		 * identifier that the sending side will use in future
4585 		 * to refer to this descriptor ring.
4586 		 */
4587 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4588 
4589 		dp->num_descriptors = dring_pkt->num_descriptors;
4590 		dp->descriptor_size = dring_pkt->descriptor_size;
4591 		dp->options = dring_pkt->options;
4592 		dp->ncookies = dring_pkt->ncookies;
4593 
4594 		/*
4595 		 * Note: we should only get one cookie. This is enforced in
4596 		 * the ldc layer.
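 *
 * The cookie is handed to ldc_mem_dring_map(9F) below with
 * LDC_SHADOW_MAP, i.e. we work on a local shadow copy of the
 * exported ring rather than a direct mapping of the peer's
 * memory, which is why each descriptor access elsewhere is
 * bracketed by ldc_mem_dring_acquire()/release() calls.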
4597 		 */
4598 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
4599 		    sizeof (ldc_mem_cookie_t));
4600 
4601 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
4602 		    dp->num_descriptors, dp->descriptor_size);
4603 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
4604 		    dp->options, dp->ncookies);
4605 
4606 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
4607 		    dp->ncookies, dp->num_descriptors,
4608 		    dp->descriptor_size, LDC_SHADOW_MAP,
4609 		    &(dp->handle))) != 0) {
4610 
4611 			DERR(vswp, "%s: dring_map failed\n", __func__);
4612 
4613 			kmem_free(dp, sizeof (dring_info_t));
4614 			vsw_free_lane_resources(ldcp, INBOUND);
4615 
4616 			dring_pkt->tag.vio_sid = ldcp->local_session;
4617 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4618 
4619 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
4620 
4621 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
4622 			vsw_send_msg(ldcp, (void *)dring_pkt,
4623 			    sizeof (vio_dring_reg_msg_t));
4624 
4625 			vsw_next_milestone(ldcp);
4626 			return;
4627 		}
4628 
4629 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
4630 
4631 			DERR(vswp, "%s: dring_addr failed\n", __func__);
4632 
 			/* undo the mapping established above */
 			(void) ldc_mem_dring_unmap(dp->handle);
4633 			kmem_free(dp, sizeof (dring_info_t));
4634 			vsw_free_lane_resources(ldcp, INBOUND);
4635 
4636 			dring_pkt->tag.vio_sid = ldcp->local_session;
4637 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4638 
4639 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
4640 
4641 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
4642 			vsw_send_msg(ldcp, (void *)dring_pkt,
4643 			    sizeof (vio_dring_reg_msg_t));
4644 
4645 			vsw_next_milestone(ldcp);
4646 			return;
4647 		} else {
4648 			/* store the address of the pub part of ring */
4649 			dp->pub_addr = minfo.vaddr;
4650 		}
4651 
4652 		/* no private section as we are importing */
4653 		dp->priv_addr = NULL;
4654 
4655 		/*
4656 		 * Using a simple monotonically increasing int for the
4657 		 * ident at the moment.
4658 		 */
4659 		dp->ident = ldcp->next_ident;
4660 		ldcp->next_ident++;
4661 
4662 		dp->end_idx = 0;
4663 		dp->next = NULL;
4664 
4665 		/*
4666 		 * Link it onto the end of the list of drings
4667 		 * for this lane.
4668 		 */
4669 		if (ldcp->lane_in.dringp == NULL) {
4670 			D2(vswp, "%s: adding first INBOUND dring", __func__);
4671 			ldcp->lane_in.dringp = dp;
4672 		} else {
4673 			dbp = ldcp->lane_in.dringp;
4674 
4675 			while (dbp->next != NULL)
4676 				dbp = dbp->next;
4677 
4678 			dbp->next = dp;
4679 		}
4680 
4681 		/* acknowledge it */
4682 		dring_pkt->tag.vio_sid = ldcp->local_session;
4683 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4684 		dring_pkt->dring_ident = dp->ident;
4685 
4686 		vsw_send_msg(ldcp, (void *)dring_pkt,
4687 		    sizeof (vio_dring_reg_msg_t));
4688 
4689 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
4690 		vsw_next_milestone(ldcp);
4691 		break;
4692 
4693 	case VIO_SUBTYPE_ACK:
4694 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4695 
4696 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
4697 			return;
4698 
4699 		/*
4700 		 * Peer is acknowledging our dring info and will have
4701 		 * sent us a dring identifier which we will use to
4702 		 * refer to this ring w.r.t. our peer.
4703 		 */
4704 		dp = ldcp->lane_out.dringp;
4705 		if (dp != NULL) {
4706 			/*
4707 			 * Find the ring this ident should be associated
4708 			 * with.
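 *
 * We currently only register a single dring per lane, so the
 * match will normally be the head of the list; the walk below
 * simply guards against a peer which has registered several.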
4709 			 */
4710 			if (vsw_dring_match(dp, dring_pkt)) {
4711 				dring_found = 1;
4712 
4713 			} else while (dp != NULL) {
4714 				if (vsw_dring_match(dp, dring_pkt)) {
4715 					dring_found = 1;
4716 					break;
4717 				}
4718 				dp = dp->next;
4719 			}
4720 
4721 			if (dring_found == 0) {
4722 				DERR(NULL, "%s: unrecognised ring cookie",
4723 				    __func__);
4724 				vsw_restart_handshake(ldcp);
4725 				return;
4726 			}
4727 
4728 		} else {
4729 			DERR(vswp, "%s: DRING ACK received but no drings "
4730 			    "allocated", __func__);
4731 			vsw_restart_handshake(ldcp);
4732 			return;
4733 		}
4734 
4735 		/* store ident */
4736 		dp->ident = dring_pkt->dring_ident;
4737 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
4738 		vsw_next_milestone(ldcp);
4739 		break;
4740 
4741 	case VIO_SUBTYPE_NACK:
4742 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4743 
4744 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
4745 			return;
4746 
4747 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
4748 		vsw_next_milestone(ldcp);
4749 		break;
4750 
4751 	default:
4752 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
4753 		    dring_pkt->tag.vio_subtype);
4754 	}
4755 
4756 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4757 }
4758 
4759 /*
4760  * Process a request from our peer to unregister a dring.
4761  *
4762  * For the moment we just restart the handshake if our
4763  * peer endpoint attempts to unregister a dring.
4764  */
4765 void
4766 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
4767 {
4768 	vsw_t			*vswp = ldcp->ldc_vswp;
4769 	vio_dring_unreg_msg_t	*dring_pkt;
4770 
4771 	/*
4772 	 * We know this is a ctrl/dring packet so
4773 	 * cast it into the correct structure.
4774 	 */
4775 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
4776 
4777 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4778 
4779 	switch (dring_pkt->tag.vio_subtype) {
4780 	case VIO_SUBTYPE_INFO:
4781 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4782 
4783 		DWARN(vswp, "%s: restarting handshake..", __func__);
4784 		vsw_restart_handshake(ldcp);
4785 		break;
4786 
4787 	case VIO_SUBTYPE_ACK:
4788 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4789 
4790 		DWARN(vswp, "%s: restarting handshake..", __func__);
4791 		vsw_restart_handshake(ldcp);
4792 		break;
4793 
4794 	case VIO_SUBTYPE_NACK:
4795 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4796 
4797 		DWARN(vswp, "%s: restarting handshake..", __func__);
4798 		vsw_restart_handshake(ldcp);
4799 		break;
4800 
4801 	default:
4802 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
4803 		    dring_pkt->tag.vio_subtype);
4804 		vsw_restart_handshake(ldcp);
4805 	}
4806 
4807 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4808 }
4809 
4810 #define	SND_MCST_NACK(ldcp, pkt) \
4811 	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
4812 	pkt->tag.vio_sid = ldcp->local_session; \
4813 	vsw_send_msg(ldcp, (void *)pkt, sizeof (vnet_mcast_msg_t));
4814 
4815 /*
4816  * Process a multicast request from a vnet.
4817  *
4818  * Vnets specify a multicast address that they are interested in. This
4819  * address is used as a key into the hash table which forms the multicast
4820  * forwarding database (mFDB).
4821  *
4822  * The table keys are the multicast addresses, while the table entries
4823  * are pointers to lists of ports which wish to receive packets for the
4824  * specified multicast address.
4825  *
4826  * When a multicast packet is being switched we use the address as a key
4827  * into the hash table, and then walk the appropriate port list forwarding
4828  * the pkt to each port in turn.
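 *
 * For example: a vnet wanting traffic for 01:00:5e:00:00:01 sends a
 * VNET_MCAST_INFO message naming that address in mca[0] with count
 * set to 1; vsw_add_rem_mcst() then links its port onto the mFDB
 * list keyed by that address, and vsw_forward_grp() will replicate
 * matching frames to it.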
4829 * 4830 * If a vnet is no longer interested in a particular multicast grouping 4831 * we simply find the correct location in the hash table and then delete 4832 * the relevant port from the port list. 4833 * 4834 * To deal with the case whereby a port is being deleted without first 4835 * removing itself from the lists in the hash table, we maintain a list 4836 * of multicast addresses the port has registered an interest in, within 4837 * the port structure itself. We then simply walk that list of addresses 4838 * using them as keys into the hash table and remove the port from the 4839 * appropriate lists. 4840 */ 4841 static void 4842 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt) 4843 { 4844 vnet_mcast_msg_t *mcst_pkt; 4845 vsw_port_t *port = ldcp->ldc_port; 4846 vsw_t *vswp = ldcp->ldc_vswp; 4847 int i; 4848 4849 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4850 4851 /* 4852 * We know this is a ctrl/mcast packet so 4853 * cast it into the correct structure. 4854 */ 4855 mcst_pkt = (vnet_mcast_msg_t *)pkt; 4856 4857 switch (mcst_pkt->tag.vio_subtype) { 4858 case VIO_SUBTYPE_INFO: 4859 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 4860 4861 /* 4862 * Check if in correct state to receive a multicast 4863 * message (i.e. handshake complete). If not reset 4864 * the handshake. 4865 */ 4866 if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV)) 4867 return; 4868 4869 /* 4870 * Before attempting to add or remove address check 4871 * that they are valid multicast addresses. 4872 * If not, then NACK back. 4873 */ 4874 for (i = 0; i < mcst_pkt->count; i++) { 4875 if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) { 4876 DERR(vswp, "%s: invalid multicast address", 4877 __func__); 4878 SND_MCST_NACK(ldcp, mcst_pkt); 4879 return; 4880 } 4881 } 4882 4883 /* 4884 * Now add/remove the addresses. If this fails we 4885 * NACK back. 4886 */ 4887 if (vsw_add_rem_mcst(mcst_pkt, port) != 0) { 4888 SND_MCST_NACK(ldcp, mcst_pkt); 4889 return; 4890 } 4891 4892 mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4893 mcst_pkt->tag.vio_sid = ldcp->local_session; 4894 4895 DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt); 4896 4897 vsw_send_msg(ldcp, (void *)mcst_pkt, 4898 sizeof (vnet_mcast_msg_t)); 4899 break; 4900 4901 case VIO_SUBTYPE_ACK: 4902 DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 4903 4904 /* 4905 * We shouldn't ever get a multicast ACK message as 4906 * at the moment we never request multicast addresses 4907 * to be set on some other device. This may change in 4908 * the future if we have cascading switches. 4909 */ 4910 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV)) 4911 return; 4912 4913 /* Do nothing */ 4914 break; 4915 4916 case VIO_SUBTYPE_NACK: 4917 DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 4918 4919 /* 4920 * We shouldn't get a multicast NACK packet for the 4921 * same reasons as we shouldn't get a ACK packet. 4922 */ 4923 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV)) 4924 return; 4925 4926 /* Do nothing */ 4927 break; 4928 4929 default: 4930 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 4931 mcst_pkt->tag.vio_subtype); 4932 } 4933 4934 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 4935 } 4936 4937 static void 4938 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt) 4939 { 4940 vio_rdx_msg_t *rdx_pkt; 4941 vsw_t *vswp = ldcp->ldc_vswp; 4942 4943 /* 4944 * We know this is a ctrl/rdx packet so 4945 * cast it into the correct structure. 
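 *
 * (RDX is the final control message of the handshake: once each
 * side has both sent an RDX and had its own RDX acknowledged, the
 * milestone logic considers the channel ready to carry data.)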
4946 */ 4947 rdx_pkt = (vio_rdx_msg_t *)pkt; 4948 4949 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 4950 4951 switch (rdx_pkt->tag.vio_subtype) { 4952 case VIO_SUBTYPE_INFO: 4953 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 4954 4955 if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_INFO_RECV)) 4956 return; 4957 4958 rdx_pkt->tag.vio_sid = ldcp->local_session; 4959 rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4960 4961 DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt); 4962 4963 ldcp->lane_in.lstate |= VSW_RDX_ACK_SENT; 4964 4965 vsw_send_msg(ldcp, (void *)rdx_pkt, 4966 sizeof (vio_rdx_msg_t)); 4967 4968 vsw_next_milestone(ldcp); 4969 break; 4970 4971 case VIO_SUBTYPE_ACK: 4972 /* 4973 * Should be handled in-band by callback handler. 4974 */ 4975 DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__); 4976 vsw_restart_handshake(ldcp); 4977 break; 4978 4979 case VIO_SUBTYPE_NACK: 4980 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 4981 4982 if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_NACK_RECV)) 4983 return; 4984 4985 ldcp->lane_out.lstate |= VSW_RDX_NACK_RECV; 4986 vsw_next_milestone(ldcp); 4987 break; 4988 4989 default: 4990 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 4991 rdx_pkt->tag.vio_subtype); 4992 } 4993 4994 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 4995 } 4996 4997 static void 4998 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag) 4999 { 5000 uint16_t env = tag.vio_subtype_env; 5001 vsw_t *vswp = ldcp->ldc_vswp; 5002 5003 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5004 5005 /* session id check */ 5006 if (ldcp->session_status & VSW_PEER_SESSION) { 5007 if (ldcp->peer_session != tag.vio_sid) { 5008 DERR(vswp, "%s (chan %d): invalid session id (%llx)", 5009 __func__, ldcp->ldc_id, tag.vio_sid); 5010 vsw_restart_handshake(ldcp); 5011 return; 5012 } 5013 } 5014 5015 /* 5016 * It is an error for us to be getting data packets 5017 * before the handshake has completed. 5018 */ 5019 if (ldcp->hphase != VSW_MILESTONE4) { 5020 DERR(vswp, "%s: got data packet before handshake complete " 5021 "hphase %d (%x: %x)", __func__, ldcp->hphase, 5022 ldcp->lane_in.lstate, ldcp->lane_out.lstate); 5023 DUMP_FLAGS(ldcp->lane_in.lstate); 5024 DUMP_FLAGS(ldcp->lane_out.lstate); 5025 vsw_restart_handshake(ldcp); 5026 return; 5027 } 5028 5029 /* 5030 * Switch on vio_subtype envelope, then let lower routines 5031 * decide if its an INFO, ACK or NACK packet. 
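 *
 * Three envelopes are currently defined for data packets:
 * VIO_DRING_DATA (descriptor ring mode), VIO_PKT_DATA (raw mode,
 * currently unsupported) and VIO_DESC_DATA (in-band descriptors).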
5032  */
5033 	if (env == VIO_DRING_DATA) {
5034 		vsw_process_data_dring_pkt(ldcp, dpkt);
5035 	} else if (env == VIO_PKT_DATA) {
5036 		vsw_process_data_raw_pkt(ldcp, dpkt);
5037 	} else if (env == VIO_DESC_DATA) {
5038 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
5039 	} else {
5040 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
5041 		    __func__, env);
5042 	}
5043 
5044 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5045 }
5046 
5047 #define	SND_DRING_NACK(ldcp, pkt) \
5048 	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
5049 	pkt->tag.vio_sid = ldcp->local_session; \
5050 	vsw_send_msg(ldcp, (void *)pkt, sizeof (vio_dring_msg_t));
5051 
5052 static void
5053 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
5054 {
5055 	vio_dring_msg_t		*dring_pkt;
5056 	vnet_public_desc_t	*pub_addr = NULL;
5057 	vsw_private_desc_t	*priv_addr = NULL;
5058 	dring_info_t		*dp = NULL;
5059 	vsw_t			*vswp = ldcp->ldc_vswp;
5060 	mblk_t			*mp = NULL;
5061 	mblk_t			*bp = NULL;
5062 	mblk_t			*bpt = NULL;
5063 	size_t			nbytes = 0;
5064 	size_t			off = 0;
5065 	uint64_t		ncookies = 0;
5066 	uint64_t		chain = 0;
5067 	uint64_t		j, len;
5068 	uint32_t		pos, start, datalen;
5069 	uint32_t		range_start, range_end;
5070 	int32_t			end, num, cnt = 0;
5071 	int			i, rv;
5072 	boolean_t		ack_needed = B_FALSE;
5073 	boolean_t		prev_desc_ack = B_FALSE;
5074 	int			read_attempts = 0;
5075 
5076 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5077 
5078 	/*
5079 	 * We know this is a data/dring packet so
5080 	 * cast it into the correct structure.
5081 	 */
5082 	dring_pkt = (vio_dring_msg_t *)dpkt;
5083 
5084 	/*
5085 	 * Switch on the vio_subtype. If it is INFO then we need to
5086 	 * process the data. If it is an ACK we need to make sure
5087 	 * it makes sense (i.e. did we send an earlier data/info),
5088 	 * and if it is a NACK then we maybe attempt a retry.
5089 	 */
5090 	switch (dring_pkt->tag.vio_subtype) {
5091 	case VIO_SUBTYPE_INFO:
5092 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
5093 
5094 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
5095 		    dring_pkt->dring_ident)) == NULL) {
5096 
5097 			DERR(vswp, "%s(%lld): unable to find dring from "
5098 			    "ident 0x%llx", __func__, ldcp->ldc_id,
5099 			    dring_pkt->dring_ident);
5100 
5101 			SND_DRING_NACK(ldcp, dring_pkt);
5102 			return;
5103 		}
5104 
5105 		start = pos = dring_pkt->start_idx;
5106 		end = dring_pkt->end_idx;
5107 		len = dp->num_descriptors;
5108 
5109 		range_start = range_end = pos;
5110 
5111 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
5112 		    __func__, ldcp->ldc_id, start, end);
5113 
5114 		if (end == -1) {
5115 			num = -1;
5116 		} else if (end >= 0) {
5117 			num = end >= pos ?
5118 			    end - pos + 1 : (len - pos + 1) + end;
5119 
5120 			/* basic sanity check */
5121 			if (end > len) {
5122 				DERR(vswp, "%s(%lld): endpoint %lld outside "
5123 				    "ring length %lld", __func__,
5124 				    ldcp->ldc_id, end, len);
5125 
5126 				SND_DRING_NACK(ldcp, dring_pkt);
5127 				return;
5128 			}
5129 		} else {
5130 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
5131 			    __func__, ldcp->ldc_id, end);
5132 			SND_DRING_NACK(ldcp, dring_pkt);
5133 			return;
5134 		}
5135 
5136 		while (cnt != num) {
5137 vsw_recheck_desc:
5138 			if ((rv = ldc_mem_dring_acquire(dp->handle,
5139 			    pos, pos)) != 0) {
5140 				DERR(vswp, "%s(%lld): unable to acquire "
5141 				    "descriptor at pos %d: err %d",
5142 				    __func__, ldcp->ldc_id, pos, rv);
5143 				SND_DRING_NACK(ldcp, dring_pkt);
5144 				return;
5145 			}
5146 
5147 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
5148 
5149 			/*
5150 			 * When given a bounded range of descriptors
5151 			 * to process, it is an error to hit a descriptor
5152 			 * which is not ready.
In the non-bounded case
5153 			 * (end_idx == -1) this simply indicates we have
5154 			 * reached the end of the current active range.
5155 			 */
5156 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
5157 				/* unbound - no error */
5158 				if (end == -1) {
5159 					if (read_attempts == vsw_read_attempts)
5160 						break;
5161 
5162 					delay(drv_usectohz(vsw_desc_delay));
5163 					read_attempts++;
5164 					goto vsw_recheck_desc;
5165 				}
5166 
5167 				/* bounded - error - so NACK back */
5168 				DERR(vswp, "%s(%lld): descriptor not READY "
5169 				    "(%d)", __func__, ldcp->ldc_id,
5170 				    pub_addr->hdr.dstate);
5171 				SND_DRING_NACK(ldcp, dring_pkt);
5172 				return;
5173 			}
5174 
5175 			DTRACE_PROBE1(read_attempts, int, read_attempts);
5176 
5177 			range_end = pos;
5178 
5179 			/*
5180 			 * If we ACK'd the previous descriptor then now
5181 			 * record the new range start position for later
5182 			 * ACKs.
5183 			 */
5184 			if (prev_desc_ack) {
5185 				range_start = pos;
5186 
5187 				D2(vswp, "%s(%lld): updating range start "
5188 				    "to be %d", __func__, ldcp->ldc_id,
5189 				    range_start);
5190 
5191 				prev_desc_ack = B_FALSE;
5192 			}
5193 
5194 			/*
5195 			 * Data is padded to align on an 8 byte boundary;
5196 			 * datalen is the actual data length, i.e. minus that
5197 			 * padding.
5198 			 */
5199 			datalen = pub_addr->nbytes;
5200 
5201 			/*
5202 			 * Does our peer wish us to ACK when we have finished
5203 			 * with this descriptor?
5204 			 */
5205 			if (pub_addr->hdr.ack)
5206 				ack_needed = B_TRUE;
5207 
5208 			D2(vswp, "%s(%lld): processing desc %lld at pos"
5209 			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
5210 			    __func__, ldcp->ldc_id, pos, pub_addr,
5211 			    pub_addr->hdr.dstate, datalen);
5212 
5213 			/*
5214 			 * Mark that we are starting to process descriptor.
5215 			 */
5216 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
5217 
5218 			mp = vio_allocb(ldcp->rxh);
5219 			if (mp == NULL) {
5220 				/*
5221 				 * No free receive buffers available, so
5222 				 * fall back onto allocb(9F). Make sure that
5223 				 * we get a data buffer which is a multiple
5224 				 * of 8 as this is required by ldc_mem_copy.
5225 				 */
5226 				DTRACE_PROBE(allocb);
5227 				mp = allocb(datalen + VNET_IPALIGN + 8,
5228 				    BPRI_MED);
 			}
 			if (mp == NULL) {
 				/*
 				 * No memory available at all; mark the
 				 * descriptor DONE, release it back to our
 				 * peer and stop processing this range.
 				 */
 				DERR(vswp, "%s(%lld): allocb failed",
 				    __func__, ldcp->ldc_id);
 				pub_addr->hdr.dstate = VIO_DESC_DONE;
 				(void) ldc_mem_dring_release(dp->handle,
 				    pos, pos);
 				break;
5229 			}
5230 
5231 			/*
5232 			 * Ensure that we ask ldc for an aligned
5233 			 * number of bytes.
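 *
 * For example, if datalen + VNET_IPALIGN is 67 then off is
 * 8 - (67 & 0x7) = 5 and we ask ldc_mem_copy(9F) for 72 bytes.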
5234 */ 5235 nbytes = datalen + VNET_IPALIGN; 5236 if (nbytes & 0x7) { 5237 off = 8 - (nbytes & 0x7); 5238 nbytes += off; 5239 } 5240 5241 ncookies = pub_addr->ncookies; 5242 rv = ldc_mem_copy(ldcp->ldc_handle, 5243 (caddr_t)mp->b_rptr, 0, &nbytes, 5244 pub_addr->memcookie, ncookies, 5245 LDC_COPY_IN); 5246 5247 if (rv != 0) { 5248 DERR(vswp, "%s(%d): unable to copy in " 5249 "data from %d cookies in desc %d" 5250 " (rv %d)", __func__, ldcp->ldc_id, 5251 ncookies, pos, rv); 5252 freemsg(mp); 5253 5254 pub_addr->hdr.dstate = VIO_DESC_DONE; 5255 (void) ldc_mem_dring_release(dp->handle, 5256 pos, pos); 5257 break; 5258 } else { 5259 D2(vswp, "%s(%d): copied in %ld bytes" 5260 " using %d cookies", __func__, 5261 ldcp->ldc_id, nbytes, ncookies); 5262 } 5263 5264 /* adjust the read pointer to skip over the padding */ 5265 mp->b_rptr += VNET_IPALIGN; 5266 5267 /* point to the actual end of data */ 5268 mp->b_wptr = mp->b_rptr + datalen; 5269 5270 /* build a chain of received packets */ 5271 if (bp == NULL) { 5272 /* first pkt */ 5273 bp = mp; 5274 bp->b_next = bp->b_prev = NULL; 5275 bpt = bp; 5276 chain = 1; 5277 } else { 5278 mp->b_next = NULL; 5279 mp->b_prev = bpt; 5280 bpt->b_next = mp; 5281 bpt = mp; 5282 chain++; 5283 } 5284 5285 /* mark we are finished with this descriptor */ 5286 pub_addr->hdr.dstate = VIO_DESC_DONE; 5287 5288 (void) ldc_mem_dring_release(dp->handle, pos, pos); 5289 5290 /* 5291 * Send an ACK back to peer if requested. 5292 */ 5293 if (ack_needed) { 5294 ack_needed = B_FALSE; 5295 5296 dring_pkt->start_idx = range_start; 5297 dring_pkt->end_idx = range_end; 5298 5299 DERR(vswp, "%s(%lld): processed %d %d, ACK" 5300 " requested", __func__, ldcp->ldc_id, 5301 dring_pkt->start_idx, 5302 dring_pkt->end_idx); 5303 5304 dring_pkt->dring_process_state = VIO_DP_ACTIVE; 5305 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5306 dring_pkt->tag.vio_sid = ldcp->local_session; 5307 vsw_send_msg(ldcp, (void *)dring_pkt, 5308 sizeof (vio_dring_msg_t)); 5309 5310 prev_desc_ack = B_TRUE; 5311 range_start = pos; 5312 } 5313 5314 /* next descriptor */ 5315 pos = (pos + 1) % len; 5316 cnt++; 5317 5318 /* 5319 * Break out of loop here and stop processing to 5320 * allow some other network device (or disk) to 5321 * get access to the cpu. 5322 */ 5323 /* send the chain of packets to be switched */ 5324 if (chain > vsw_chain_len) { 5325 D3(vswp, "%s(%lld): switching chain of %d " 5326 "msgs", __func__, ldcp->ldc_id, chain); 5327 vsw_switch_frame(vswp, bp, VSW_VNETPORT, 5328 ldcp->ldc_port, NULL); 5329 bp = NULL; 5330 break; 5331 } 5332 } 5333 5334 /* send the chain of packets to be switched */ 5335 if (bp != NULL) { 5336 D3(vswp, "%s(%lld): switching chain of %d msgs", 5337 __func__, ldcp->ldc_id, chain); 5338 vsw_switch_frame(vswp, bp, VSW_VNETPORT, 5339 ldcp->ldc_port, NULL); 5340 } 5341 5342 DTRACE_PROBE1(msg_cnt, int, cnt); 5343 5344 /* 5345 * We are now finished so ACK back with the state 5346 * set to STOPPING so our peer knows we are finished 5347 */ 5348 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5349 dring_pkt->tag.vio_sid = ldcp->local_session; 5350 5351 dring_pkt->dring_process_state = VIO_DP_STOPPED; 5352 5353 DTRACE_PROBE(stop_process_sent); 5354 5355 /* 5356 * We have not processed any more descriptors beyond 5357 * the last one we ACK'd. 
5358 */ 5359 if (prev_desc_ack) 5360 range_start = range_end; 5361 5362 dring_pkt->start_idx = range_start; 5363 dring_pkt->end_idx = range_end; 5364 5365 D2(vswp, "%s(%lld) processed : %d : %d, now stopping", 5366 __func__, ldcp->ldc_id, dring_pkt->start_idx, 5367 dring_pkt->end_idx); 5368 5369 vsw_send_msg(ldcp, (void *)dring_pkt, 5370 sizeof (vio_dring_msg_t)); 5371 break; 5372 5373 case VIO_SUBTYPE_ACK: 5374 D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id); 5375 /* 5376 * Verify that the relevant descriptors are all 5377 * marked as DONE 5378 */ 5379 if ((dp = vsw_ident2dring(&ldcp->lane_out, 5380 dring_pkt->dring_ident)) == NULL) { 5381 DERR(vswp, "%s: unknown ident in ACK", __func__); 5382 return; 5383 } 5384 5385 pub_addr = (vnet_public_desc_t *)dp->pub_addr; 5386 priv_addr = (vsw_private_desc_t *)dp->priv_addr; 5387 5388 start = end = 0; 5389 start = dring_pkt->start_idx; 5390 end = dring_pkt->end_idx; 5391 len = dp->num_descriptors; 5392 5393 j = num = 0; 5394 /* calculate # descriptors taking into a/c wrap around */ 5395 num = end >= start ? end - start + 1: (len - start + 1) + end; 5396 5397 D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n", 5398 __func__, ldcp->ldc_id, start, end, num); 5399 5400 mutex_enter(&dp->dlock); 5401 dp->last_ack_recv = end; 5402 mutex_exit(&dp->dlock); 5403 5404 for (i = start; j < num; i = (i + 1) % len, j++) { 5405 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 5406 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 5407 5408 /* 5409 * If the last descriptor in a range has the ACK 5410 * bit set then we will get two messages from our 5411 * peer relating to it. The normal ACK msg and then 5412 * a subsequent STOP msg. The first message will have 5413 * resulted in the descriptor being reclaimed and 5414 * its state set to FREE so when we encounter a non 5415 * DONE descriptor we need to check to see if its 5416 * because we have just reclaimed it. 5417 */ 5418 mutex_enter(&priv_addr->dstate_lock); 5419 if (pub_addr->hdr.dstate == VIO_DESC_DONE) { 5420 /* clear all the fields */ 5421 bzero(priv_addr->datap, priv_addr->datalen); 5422 priv_addr->datalen = 0; 5423 5424 pub_addr->hdr.dstate = VIO_DESC_FREE; 5425 pub_addr->hdr.ack = 0; 5426 5427 priv_addr->dstate = VIO_DESC_FREE; 5428 mutex_exit(&priv_addr->dstate_lock); 5429 5430 D3(vswp, "clearing descp %d : pub state " 5431 "0x%llx : priv state 0x%llx", i, 5432 pub_addr->hdr.dstate, 5433 priv_addr->dstate); 5434 5435 } else { 5436 mutex_exit(&priv_addr->dstate_lock); 5437 5438 if (dring_pkt->dring_process_state != 5439 VIO_DP_STOPPED) { 5440 DERR(vswp, "%s: descriptor %lld at pos " 5441 " 0x%llx not DONE (0x%lx)\n", 5442 __func__, i, pub_addr, 5443 pub_addr->hdr.dstate); 5444 return; 5445 } 5446 } 5447 } 5448 5449 /* 5450 * If our peer is stopping processing descriptors then 5451 * we check to make sure it has processed all the descriptors 5452 * we have updated. If not then we send it a new message 5453 * to prompt it to restart. 5454 */ 5455 if (dring_pkt->dring_process_state == VIO_DP_STOPPED) { 5456 DTRACE_PROBE(stop_process_recv); 5457 D2(vswp, "%s(%lld): got stopping msg : %d : %d", 5458 __func__, ldcp->ldc_id, dring_pkt->start_idx, 5459 dring_pkt->end_idx); 5460 5461 /* 5462 * Check next descriptor in public section of ring. 5463 * If its marked as READY then we need to prompt our 5464 * peer to start processing the ring again. 
5465 */ 5466 i = (end + 1) % len; 5467 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 5468 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 5469 5470 /* 5471 * Hold the restart lock across all of this to 5472 * make sure that its not possible for us to 5473 * decide that a msg needs to be sent in the future 5474 * but the sending code having already checked is 5475 * about to exit. 5476 */ 5477 mutex_enter(&dp->restart_lock); 5478 mutex_enter(&priv_addr->dstate_lock); 5479 if (pub_addr->hdr.dstate == VIO_DESC_READY) { 5480 5481 mutex_exit(&priv_addr->dstate_lock); 5482 5483 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; 5484 dring_pkt->tag.vio_sid = ldcp->local_session; 5485 5486 mutex_enter(&ldcp->lane_out.seq_lock); 5487 dring_pkt->seq_num = ldcp->lane_out.seq_num++; 5488 mutex_exit(&ldcp->lane_out.seq_lock); 5489 5490 dring_pkt->start_idx = (end + 1) % len; 5491 dring_pkt->end_idx = -1; 5492 5493 D2(vswp, "%s(%lld) : sending restart msg:" 5494 " %d : %d", __func__, ldcp->ldc_id, 5495 dring_pkt->start_idx, 5496 dring_pkt->end_idx); 5497 5498 vsw_send_msg(ldcp, (void *)dring_pkt, 5499 sizeof (vio_dring_msg_t)); 5500 } else { 5501 mutex_exit(&priv_addr->dstate_lock); 5502 dp->restart_reqd = B_TRUE; 5503 } 5504 mutex_exit(&dp->restart_lock); 5505 } 5506 break; 5507 5508 case VIO_SUBTYPE_NACK: 5509 DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK", 5510 __func__, ldcp->ldc_id); 5511 /* 5512 * Something is badly wrong if we are getting NACK's 5513 * for our data pkts. So reset the channel. 5514 */ 5515 vsw_restart_handshake(ldcp); 5516 5517 break; 5518 5519 default: 5520 DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__, 5521 ldcp->ldc_id, dring_pkt->tag.vio_subtype); 5522 } 5523 5524 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 5525 } 5526 5527 /* 5528 * VIO_PKT_DATA (a.k.a raw data mode ) 5529 * 5530 * Note - currently not supported. Do nothing. 5531 */ 5532 static void 5533 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt) 5534 { 5535 _NOTE(ARGUNUSED(dpkt)) 5536 5537 D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id); 5538 5539 DERR(NULL, "%s (%lld): currently not supported", 5540 __func__, ldcp->ldc_id); 5541 5542 D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id); 5543 } 5544 5545 #define SND_IBND_DESC_NACK(ldcp, pkt) \ 5546 pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ 5547 pkt->tag.vio_sid = ldcp->local_session; \ 5548 vsw_send_msg(ldcp, (void *)pkt, sizeof (vio_ibnd_desc_t)); 5549 5550 /* 5551 * Process an in-band descriptor message (most likely from 5552 * OBP). 5553 */ 5554 static void 5555 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt) 5556 { 5557 vio_ibnd_desc_t *ibnd_desc; 5558 dring_info_t *dp = NULL; 5559 vsw_private_desc_t *priv_addr = NULL; 5560 vsw_t *vswp = ldcp->ldc_vswp; 5561 mblk_t *mp = NULL; 5562 size_t nbytes = 0; 5563 size_t off = 0; 5564 uint64_t idx = 0; 5565 uint32_t num = 1, len, datalen = 0; 5566 uint64_t ncookies = 0; 5567 int i, rv; 5568 int j = 0; 5569 5570 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5571 5572 ibnd_desc = (vio_ibnd_desc_t *)pkt; 5573 5574 switch (ibnd_desc->hdr.tag.vio_subtype) { 5575 case VIO_SUBTYPE_INFO: 5576 D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5577 5578 if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV)) 5579 return; 5580 5581 /* 5582 * Data is padded to align on a 8 byte boundary, 5583 * nbytes is actual data length, i.e. minus that 5584 * padding. 
5585 	 */
5586 		datalen = ibnd_desc->nbytes;
5587 
5588 		D2(vswp, "%s(%lld): processing inband desc : "
5589 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
5590 
5591 		ncookies = ibnd_desc->ncookies;
5592 
5593 		/*
5594 		 * allocb(9F) returns an aligned data block. We
5595 		 * need to ensure that we ask ldc for an aligned
5596 		 * number of bytes also.
5597 		 */
5598 		nbytes = datalen;
5599 		if (nbytes & 0x7) {
5600 			off = 8 - (nbytes & 0x7);
5601 			nbytes += off;
5602 		}
5603 
 		/* allocate the rounded-up size, as ldc_mem_copy may fill it */
5604 		mp = allocb(nbytes, BPRI_MED);
5605 		if (mp == NULL) {
5606 			DERR(vswp, "%s(%lld): allocb failed",
5607 			    __func__, ldcp->ldc_id);
5608 			return;
5609 		}
5610 
5611 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
5612 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
5613 		    LDC_COPY_IN);
5614 
5615 		if (rv != 0) {
5616 			DERR(vswp, "%s(%d): unable to copy in data from "
5617 			    "%d cookie(s)", __func__,
5618 			    ldcp->ldc_id, ncookies);
5619 			freemsg(mp);
5620 			return;
5621 		} else {
5622 			D2(vswp, "%s(%d): copied in %ld bytes using %d "
5623 			    "cookies", __func__, ldcp->ldc_id, nbytes,
5624 			    ncookies);
5625 		}
5626 
5627 		/* point to the actual end of data */
5628 		mp->b_wptr = mp->b_rptr + datalen;
5629 
5630 		/*
5631 		 * We ACK back every in-band descriptor message we process
5632 		 */
5633 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
5634 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
5635 		vsw_send_msg(ldcp, (void *)ibnd_desc,
5636 		    sizeof (vio_ibnd_desc_t));
5637 
5638 		/* send the packet to be switched */
5639 		vsw_switch_frame(vswp, mp, VSW_VNETPORT,
5640 		    ldcp->ldc_port, NULL);
5641 
5642 		break;
5643 
5644 	case VIO_SUBTYPE_ACK:
5645 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5646 
5647 		/* Verify the ACK is valid */
5648 		idx = ibnd_desc->hdr.desc_handle;
5649 
5650 		if (idx >= VSW_RING_NUM_EL) {
5651 			cmn_err(CE_WARN, "%s: corrupted ACK received "
5652 			    "(idx %ld)", __func__, idx);
5653 			return;
5654 		}
5655 
5656 		if ((dp = ldcp->lane_out.dringp) == NULL) {
5657 			DERR(vswp, "%s: no dring found", __func__);
5658 			return;
5659 		}
5660 
5661 		len = dp->num_descriptors;
5662 		/*
5663 		 * If the descriptor we are being ACK'ed for is not the
5664 		 * one we expected, then pkts were lost somewhere, either
5665 		 * when we tried to send a msg, or a previous ACK msg from
5666 		 * our peer. In either case we now reclaim the descriptors
5667 		 * in the range from the last ACK we received up to the
5668 		 * current ACK.
5669 		 */
5670 		if (idx != dp->last_ack_recv) {
5671 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
5672 			    __func__, dp->last_ack_recv, idx);
5673 			num = idx >= dp->last_ack_recv ?
5674 			    idx - dp->last_ack_recv + 1 :
5675 			    (len - dp->last_ack_recv + 1) + idx;
5676 		}
5677 
5678 		/*
5679 		 * When we sent the in-band message to our peer we
5680 		 * marked the copy in our private ring as READY. We now
5681 		 * check that the descriptor we are being ACK'ed for is in
5682 		 * fact READY, i.e. it is one we have shared with our peer.
5683 		 *
5684 		 * If it is not we flag an error, but still reset the
5685 		 * descriptor back to FREE.
5686 	 */
5687 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
5688 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5689 			mutex_enter(&priv_addr->dstate_lock);
5690 			if (priv_addr->dstate != VIO_DESC_READY) {
5691 				DERR(vswp, "%s: (%ld) desc at index %ld not "
5692 				    "READY (0x%lx)", __func__,
5693 				    ldcp->ldc_id, idx, priv_addr->dstate);
5694 				DERR(vswp, "%s: bound %d: ncookies %ld : "
5695 				    "datalen %ld", __func__,
5696 				    priv_addr->bound, priv_addr->ncookies,
5697 				    priv_addr->datalen);
5698 			}
5699 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
5700 			    ldcp->ldc_id, idx);
5701 			/* release resources associated with sent msg */
5702 			bzero(priv_addr->datap, priv_addr->datalen);
5703 			priv_addr->datalen = 0;
5704 			priv_addr->dstate = VIO_DESC_FREE;
5705 			mutex_exit(&priv_addr->dstate_lock);
5706 		}
5707 		/* update to next expected value */
5708 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
5709 
5710 		break;
5711 
5712 	case VIO_SUBTYPE_NACK:
5713 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5714 
5715 		/*
5716 		 * We should only get a NACK if our peer doesn't like
5717 		 * something about a message we have sent it. If this
5718 		 * happens we just release the resources associated with
5719 		 * the message. (We are relying on higher layers to decide
5720 		 * whether or not to resend.)
5721 		 */
5722 
5723 		/* limit check */
5724 		idx = ibnd_desc->hdr.desc_handle;
5725 
5726 		if (idx >= VSW_RING_NUM_EL) {
5727 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
5728 			    __func__, idx);
5729 			return;
5730 		}
5731 
5732 		if ((dp = ldcp->lane_out.dringp) == NULL) {
5733 			DERR(vswp, "%s: no dring found", __func__);
5734 			return;
5735 		}
5736 
5737 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
5738 
5739 		/* move to correct location in ring */
5740 		priv_addr += idx;
5741 
5742 		/* release resources associated with sent msg */
5743 		mutex_enter(&priv_addr->dstate_lock);
5744 		bzero(priv_addr->datap, priv_addr->datalen);
5745 		priv_addr->datalen = 0;
5746 		priv_addr->dstate = VIO_DESC_FREE;
5747 		mutex_exit(&priv_addr->dstate_lock);
5748 
5749 		break;
5750 
5751 	default:
5752 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
5753 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
5754 	}
5755 
5756 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5757 }
5758 
5759 static void
5760 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
5761 {
5762 	_NOTE(ARGUNUSED(epkt))
5763 
5764 	vsw_t		*vswp = ldcp->ldc_vswp;
5765 	uint16_t	env = tag.vio_subtype_env;
5766 
5767 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
5768 
5769 	/*
5770 	 * Error vio_subtypes have yet to be defined. So for
5771 	 * the moment we can't do anything.
5772 	 */
5773 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
5774 
5775 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
5776 }
5777 
5778 /*
5779  * Switch the given ethernet frame when operating in layer 2 mode.
5780  *
5781  * vswp: pointer to the vsw instance
5782  * mp: pointer to chain of ethernet frame(s) to be switched
5783  * caller: identifies the source of this frame as:
5784  *		1. VSW_VNETPORT - a vsw port (connected to a vnet).
5785  *		2. VSW_PHYSDEV - the physical ethernet device
5786  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
5787  * arg: argument provided by the caller.
5788  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
5789  *		2. for PHYSDEV - NULL
5790  *		3.
for LOCALDEV - pointer to this vsw_t (self)
5791  */
5792 void
5793 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
5794 	vsw_port_t *arg, mac_resource_handle_t mrh)
5795 {
5796 	struct ether_header	*ehp;
5797 	vsw_port_t		*port = NULL;
5798 	mblk_t			*bp, *ret_m;
5799 	mblk_t			*nmp = NULL;
5800 	vsw_port_list_t		*plist = &vswp->plist;
5801 
5802 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
5803 
5804 	/*
5805 	 * PERF: rather than breaking up the chain here, scan it
5806 	 * to find all mblks heading to the same destination and then
5807 	 * pass that sub-chain to the lower transmit functions.
5808 	 */
5809 
5810 	/* process the chain of packets */
5811 	bp = mp;
5812 	while (bp) {
5813 		mp = bp;
5814 		bp = bp->b_next;
5815 		mp->b_next = mp->b_prev = NULL;
5816 		ehp = (struct ether_header *)mp->b_rptr;
5817 
5818 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
5819 		    __func__, MBLKSIZE(mp), MBLKL(mp));
5820 
5821 		READ_ENTER(&vswp->if_lockrw);
5822 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
5823 			/*
5824 			 * If destination is VSW_LOCALDEV (vsw as an eth
5825 			 * interface) and if the device is up & running,
5826 			 * send the packet up the stack on this host.
5827 			 * If the virtual interface is down, drop the packet.
5828 			 */
5829 			if (caller != VSW_LOCALDEV) {
5830 				if (vswp->if_state & VSW_IF_UP) {
5831 					RW_EXIT(&vswp->if_lockrw);
5832 					mac_rx(vswp->if_mh, mrh, mp);
5833 				} else {
5834 					RW_EXIT(&vswp->if_lockrw);
5835 					/* Interface down, drop pkt */
5836 					freemsg(mp);
5837 				}
5838 			} else {
5839 				RW_EXIT(&vswp->if_lockrw);
5840 				freemsg(mp);
5841 			}
5842 			continue;
5843 		}
5844 		RW_EXIT(&vswp->if_lockrw);
5845 
5846 		READ_ENTER(&plist->lockrw);
5847 		port = vsw_lookup_fdb(vswp, ehp);
5848 		if (port) {
5849 			/*
5850 			 * Mark the port as in-use.
5851 			 */
5852 			mutex_enter(&port->ref_lock);
5853 			port->ref_cnt++;
5854 			mutex_exit(&port->ref_lock);
5855 			RW_EXIT(&plist->lockrw);
5856 
5857 			/*
5858 			 * If plumbed and in promisc mode then copy msg
5859 			 * and send up the stack.
5860 			 */
5861 			READ_ENTER(&vswp->if_lockrw);
5862 			if (VSW_U_P(vswp->if_state)) {
5863 				RW_EXIT(&vswp->if_lockrw);
5864 				nmp = copymsg(mp);
5865 				if (nmp)
5866 					mac_rx(vswp->if_mh, mrh, nmp);
5867 			} else {
5868 				RW_EXIT(&vswp->if_lockrw);
5869 			}
5870 
5871 			/*
5872 			 * If the destination is in the FDB, the packet
5873 			 * should be forwarded to the corresponding
5874 			 * vsw_port (connected to a vnet device -
5875 			 * VSW_VNETPORT)
5876 			 */
5877 			(void) vsw_portsend(port, mp);
5878 
5879 			/*
5880 			 * Decrement the use count in the port and check if
5881 			 * we should wake the delete thread.
5882 			 */
5883 			mutex_enter(&port->ref_lock);
5884 			port->ref_cnt--;
5885 			if (port->ref_cnt == 0)
5886 				cv_signal(&port->ref_cv);
5887 			mutex_exit(&port->ref_lock);
5888 		} else {
5889 			RW_EXIT(&plist->lockrw);
5890 			/*
5891 			 * Destination not in FDB.
5892 			 *
5893 			 * If the destination is broadcast or
5894 			 * multicast forward the packet to all
5895 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
5896 			 * except the caller.
5897 			 */
5898 			if (IS_BROADCAST(ehp)) {
5899 				D3(vswp, "%s: BROADCAST pkt", __func__);
5900 				(void) vsw_forward_all(vswp, mp,
5901 				    caller, arg);
5902 			} else if (IS_MULTICAST(ehp)) {
5903 				D3(vswp, "%s: MULTICAST pkt", __func__);
5904 				(void) vsw_forward_grp(vswp, mp,
5905 				    caller, arg);
5906 			} else {
5907 				/*
5908 				 * If the destination is unicast, and came
5909 				 * from either a logical network device or
5910 				 * the switch itself when it is plumbed, then
5911 				 * send it out on the physical device and also
5912 				 * up the stack if the logical interface is
5913 				 * in promiscuous mode.
5914 				 *
5915 				 * NOTE: The assumption here is that if we
5916 				 * cannot find the destination in our fdb, and
5917 				 * it is a unicast address that came from
5918 				 * either a vnet or down the stack (when
5919 				 * plumbed), it must be destined for an
5920 				 * ethernet device outside our ldoms.
5921 				 */
5922 				if (caller == VSW_VNETPORT) {
5923 					READ_ENTER(&vswp->if_lockrw);
5924 					if (VSW_U_P(vswp->if_state)) {
5925 						RW_EXIT(&vswp->if_lockrw);
5926 						nmp = copymsg(mp);
5927 						if (nmp)
5928 							mac_rx(vswp->if_mh,
5929 							    mrh, nmp);
5930 					} else {
5931 						RW_EXIT(&vswp->if_lockrw);
5932 					}
5933 					if ((ret_m = vsw_tx_msg(vswp, mp))
5934 					    != NULL) {
5935 						DERR(vswp, "%s: drop mblks to "
5936 						    "phys dev", __func__);
5937 						freemsg(ret_m);
5938 					}
5939 
5940 				} else if (caller == VSW_PHYSDEV) {
5941 					/*
5942 					 * Pkt seen because card in promisc
5943 					 * mode. Send up stack if plumbed in
5944 					 * promisc mode, else drop it.
5945 					 */
5946 					READ_ENTER(&vswp->if_lockrw);
5947 					if (VSW_U_P(vswp->if_state)) {
5948 						RW_EXIT(&vswp->if_lockrw);
5949 						mac_rx(vswp->if_mh, mrh, mp);
5950 					} else {
5951 						RW_EXIT(&vswp->if_lockrw);
5952 						freemsg(mp);
5953 					}
5954 
5955 				} else if (caller == VSW_LOCALDEV) {
5956 					/*
5957 					 * Pkt came down the stack, send out
5958 					 * over physical device.
5959 					 */
5960 					if ((ret_m = vsw_tx_msg(vswp, mp))
5961 					    != NULL) {
5962 						DERR(vswp, "%s: drop mblks to "
5963 						    "phys dev", __func__);
5964 						freemsg(ret_m);
5965 					}
5966 				}
5967 			}
5968 		}
5969 	}
5970 	D1(vswp, "%s: exit\n", __func__);
5971 }
5972 
5973 /*
5974  * Switch an ethernet frame when in layer 3 mode (i.e. using the IP
5975  * layer to do the routing).
5976  *
5977  * There is a large amount of overlap between this function and
5978  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
5979  * both these functions.
5980  */
5981 void
5982 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
5983 	vsw_port_t *arg, mac_resource_handle_t mrh)
5984 {
5985 	struct ether_header	*ehp;
5986 	vsw_port_t		*port = NULL;
5987 	mblk_t			*bp = NULL;
5988 	vsw_port_list_t		*plist = &vswp->plist;
5989 
5990 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
5991 
5992 	/*
5993 	 * In layer 3 mode we should only ever be switching packets
5994 	 * between the IP layer and vnet devices. So make sure that's
5995 	 * who is invoking us.
5996 	 */
5997 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
5998 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
5999 		freemsgchain(mp);
6000 		return;
6001 	}
6002 
6003 	/* process the chain of packets */
6004 	bp = mp;
6005 	while (bp) {
6006 		mp = bp;
6007 		bp = bp->b_next;
6008 		mp->b_next = mp->b_prev = NULL;
6009 		ehp = (struct ether_header *)mp->b_rptr;
6010 
6011 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
6012 		    __func__, MBLKSIZE(mp), MBLKL(mp));
6013 
6014 		READ_ENTER(&plist->lockrw);
6015 		port = vsw_lookup_fdb(vswp, ehp);
6016 		if (port) {
6017 			/*
6018 			 * Mark port as in-use.
6019 			 */
6020 			mutex_enter(&port->ref_lock);
6021 			port->ref_cnt++;
6022 			mutex_exit(&port->ref_lock);
6023 			RW_EXIT(&plist->lockrw);
6024 
6025 			D2(vswp, "%s: sending to target port", __func__);
6026 			(void) vsw_portsend(port, mp);
6027 
6028 			/*
6029 			 * Finished with the port so decrement the ref count
6030 			 * and check if we should wake the delete thread.
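 *
 * (The ref count interlocks with port deletion: the delete path
 * waits on ref_cv until ref_cnt drops to zero, so a port cannot
 * be torn down while a frame is still in flight through it.)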
6031 			 */
6032 			mutex_enter(&port->ref_lock);
6033 			port->ref_cnt--;
6034 			if (port->ref_cnt == 0)
6035 				cv_signal(&port->ref_cv);
6036 			mutex_exit(&port->ref_lock);
6037 		} else {
6038 			RW_EXIT(&plist->lockrw);
6039 			/*
6040 			 * Destination not in FDB.
6041 			 *
6042 			 * If the destination is broadcast or
6043 			 * multicast forward the packet to all
6044 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
6045 			 * except the caller.
6046 			 */
6047 			if (IS_BROADCAST(ehp)) {
6048 				D2(vswp, "%s: BROADCAST pkt", __func__);
6049 				(void) vsw_forward_all(vswp, mp,
6050 				    caller, arg);
6051 			} else if (IS_MULTICAST(ehp)) {
6052 				D2(vswp, "%s: MULTICAST pkt", __func__);
6053 				(void) vsw_forward_grp(vswp, mp,
6054 				    caller, arg);
6055 			} else {
6056 				/*
6057 				 * Unicast pkt from a vnet that we don't have
6058 				 * an FDB entry for, so it must be destined
6059 				 * for the outside world. Attempt to send it
6060 				 * up to the IP layer to allow it to deal
6061 				 * with it.
6062 				 */
6063 				if (caller == VSW_VNETPORT) {
6064 					READ_ENTER(&vswp->if_lockrw);
6065 					if (vswp->if_state & VSW_IF_UP) {
6066 						RW_EXIT(&vswp->if_lockrw);
6067 						D2(vswp, "%s: sending up",
6068 						    __func__);
6069 						mac_rx(vswp->if_mh, mrh, mp);
6070 					} else {
6071 						RW_EXIT(&vswp->if_lockrw);
6072 						/* Interface down, drop pkt */
6073 						D2(vswp, "%s I/F down",
6074 						    __func__);
6075 						freemsg(mp);
6076 					}
6077 				}
6078 			}
6079 		}
6080 	}
6081 
6082 	D1(vswp, "%s: exit", __func__);
6083 }
6084 
6085 /*
6086  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
6087  * except the caller (port on which frame arrived).
6088  */
6089 static int
6090 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
6091 {
6092 	vsw_port_list_t	*plist = &vswp->plist;
6093 	vsw_port_t	*portp;
6094 	mblk_t		*nmp = NULL;
6095 	mblk_t		*ret_m = NULL;
6096 	int		skip_port = 0;
6097 
6098 	D1(vswp, "vsw_forward_all: enter\n");
6099 
6100 	/*
6101 	 * Broadcast message from inside ldoms so send to the outside
6102 	 * world if in either of the layer 2 modes.
6103 	 */
6104 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
6105 	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
6106 	    ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
6107 
6108 		nmp = dupmsg(mp);
6109 		if (nmp) {
6110 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
6111 				DERR(vswp, "%s: dropping pkt(s) "
6112 				    "consisting of %ld bytes of data for"
6113 				    " physical device", __func__, MBLKL(ret_m));
6114 				freemsg(ret_m);
6115 			}
6116 		}
6117 	}
6118 
6119 	if (caller == VSW_VNETPORT)
6120 		skip_port = 1;
6121 
6122 	/*
6123 	 * Broadcast message from another vnet (layer 2 or 3) or the outside
6124 	 * world (layer 2 only), send up the stack if plumbed.
6125 	 */
6126 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
6127 		READ_ENTER(&vswp->if_lockrw);
6128 		if (vswp->if_state & VSW_IF_UP) {
6129 			RW_EXIT(&vswp->if_lockrw);
6130 			nmp = copymsg(mp);
6131 			if (nmp)
6132 				mac_rx(vswp->if_mh, NULL, nmp);
6133 		} else {
6134 			RW_EXIT(&vswp->if_lockrw);
6135 		}
6136 	}
6137 
6138 	/* send it to all VNETPORTs */
6139 	READ_ENTER(&plist->lockrw);
6140 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
6141 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
6142 		/*
6143 		 * Caution! - don't reorder these two checks as arg
6144 		 * will be NULL if the caller is PHYSDEV. skip_port is
6145 		 * only set if caller is VNETPORT.
6146 		 */
6147 		if ((skip_port) && (portp == arg))
6148 			continue;
6149 		else {
6150 			nmp = dupmsg(mp);
6151 			if (nmp) {
6152 				(void) vsw_portsend(portp, nmp);
6153 			} else {
6154 				DERR(vswp, "vsw_forward_all: nmp NULL");
6155 			}
6156 		}
6157 	}
6158 	RW_EXIT(&plist->lockrw);
6159 
6160 	freemsg(mp);
6161 
6162 	D1(vswp, "vsw_forward_all: exit\n");
6163 	return (0);
6164 }
6165 
6166 /*
6167  * Forward pkts to any devices or interfaces which have registered
6168  * an interest in them (i.e. multicast groups).
6169  */
6170 static int
6171 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
6172 {
6173 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
6174 	mfdb_ent_t		*entp = NULL;
6175 	mfdb_ent_t		*tpp = NULL;
6176 	vsw_port_t		*port;
6177 	uint64_t		key = 0;
6178 	mblk_t			*nmp = NULL;
6179 	mblk_t			*ret_m = NULL;
6180 	boolean_t		check_if = B_TRUE;
6181 
6182 	/*
6183 	 * Convert the address to a hash table key
6184 	 */
6185 	KEY_HASH(key, ehp->ether_dhost);
6186 
6187 	D1(vswp, "%s: key 0x%llx", __func__, key);
6188 
6189 	/*
6190 	 * If the pkt came from either a vnet or down the stack (if we are
6191 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
6192 	 * over the physical adapter, and then check to see if any other
6193 	 * vnets are interested in it.
6194 	 */
6195 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
6196 	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
6197 	    ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
6198 		nmp = dupmsg(mp);
6199 		if (nmp) {
6200 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
6201 				DERR(vswp, "%s: dropping pkt(s) "
6202 				    "consisting of %ld bytes of "
6203 				    "data for physical device",
6204 				    __func__, MBLKL(ret_m));
6205 				freemsg(ret_m);
6206 			}
6207 		}
6208 	}
6209 
6210 	READ_ENTER(&vswp->mfdbrw);
6211 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
6212 	    (mod_hash_val_t *)&entp) != 0) {
6213 		D3(vswp, "%s: no table entry found for addr 0x%llx",
6214 		    __func__, key);
6215 	} else {
6216 		/*
6217 		 * Send to the list of devices associated with this address...
6218 		 */
6219 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
6220 
6221 			/* don't send to ourselves */
6222 			if ((caller == VSW_VNETPORT) &&
6223 			    (tpp->d_addr == (void *)arg)) {
6224 				port = (vsw_port_t *)tpp->d_addr;
6225 				D3(vswp, "%s: not sending to ourselves"
6226 				    " : port %d", __func__,
6227 				    port->p_instance);
6228 				continue;
6229 
6230 			} else if ((caller == VSW_LOCALDEV) &&
6231 			    (tpp->d_type == VSW_LOCALDEV)) {
6232 				D3(vswp, "%s: not sending back up stack",
6233 				    __func__);
6234 				continue;
6235 			}
6236 
6237 			if (tpp->d_type == VSW_VNETPORT) {
6238 				port = (vsw_port_t *)tpp->d_addr;
6239 				D3(vswp, "%s: sending to port %ld for "
6240 				    " addr 0x%llx", __func__,
6241 				    port->p_instance, key);
6242 
6243 				nmp = dupmsg(mp);
6244 				if (nmp)
6245 					(void) vsw_portsend(port, nmp);
6246 			} else {
6247 				if (vswp->if_state & VSW_IF_UP) {
6248 					nmp = copymsg(mp);
6249 					if (nmp)
6250 						mac_rx(vswp->if_mh, NULL, nmp);
6251 					check_if = B_FALSE;
6252 					D3(vswp, "%s: sending up stack"
6253 					    " for addr 0x%llx", __func__,
6254 					    key);
6255 				}
6256 			}
6257 		}
6258 	}
6259 
6260 	RW_EXIT(&vswp->mfdbrw);
6261 
6262 	/*
6263 	 * If the pkt came from either a vnet or from the physical device,
6264 	 * and if we haven't already sent the pkt up the stack then we
6265 	 * check now if we can/should (i.e. the interface is plumbed
6266 	 * and in promisc mode).
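 *
 * Note that frames headed for the stack are copymsg()'d rather
 * than dupmsg()'d: the stack may modify the data, so it must get
 * its own private copy, whereas ports are handed dupmsg() clones
 * of the frame which they treat as read-only.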
6266 */ 6267 if ((check_if) && 6268 ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) { 6269 READ_ENTER(&vswp->if_lockrw); 6270 if (VSW_U_P(vswp->if_state)) { 6271 RW_EXIT(&vswp->if_lockrw); 6272 D3(vswp, "%s: (caller %d) finally sending up stack" 6273 " for addr 0x%llx", __func__, caller, key); 6274 nmp = copymsg(mp); 6275 if (nmp) 6276 mac_rx(vswp->if_mh, NULL, nmp); 6277 } else { 6278 RW_EXIT(&vswp->if_lockrw); 6279 } 6280 } 6281 6282 freemsg(mp); 6283 6284 D1(vswp, "%s: exit", __func__); 6285 6286 return (0); 6287 } 6288 6289 /* transmit the packet over the given port */ 6290 static int 6291 vsw_portsend(vsw_port_t *port, mblk_t *mp) 6292 { 6293 vsw_ldc_list_t *ldcl = &port->p_ldclist; 6294 vsw_ldc_t *ldcp; 6295 int status = 0; 6296 6297 6298 READ_ENTER(&ldcl->lockrw); 6299 /* 6300 * Note: for now we have a single channel. 6301 */ 6302 ldcp = ldcl->head; 6303 if (ldcp == NULL) { 6304 DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n"); 6305 freemsg(mp); 6306 RW_EXIT(&ldcl->lockrw); 6307 return (1); 6308 } 6309 6310 /* 6311 * Send the message out using the appropriate 6312 * transmit function, which will free the mblk when it 6313 * is finished with it. 6314 */ 6315 mutex_enter(&port->tx_lock); 6316 if (port->transmit != NULL) 6317 status = (*port->transmit)(ldcp, mp); 6318 else { 6319 freemsg(mp); 6320 } 6321 mutex_exit(&port->tx_lock); 6322 6323 RW_EXIT(&ldcl->lockrw); 6324 6325 return (status); 6326 } 6327 6328 /* 6329 * Send packet out via descriptor ring to a logical device. 6330 */ 6331 static int 6332 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp) 6333 { 6334 vio_dring_msg_t dring_pkt; 6335 dring_info_t *dp = NULL; 6336 vsw_private_desc_t *priv_desc = NULL; 6337 vnet_public_desc_t *pub = NULL; 6338 vsw_t *vswp = ldcp->ldc_vswp; 6339 mblk_t *bp; 6340 size_t n, size; 6341 caddr_t bufp; 6342 int idx; 6343 int status = LDC_TX_SUCCESS; 6344 6345 D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id); 6346 6347 /* TODO: make test a macro */ 6348 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 6349 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 6350 DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping " 6351 "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status, 6352 ldcp->lane_out.lstate); 6353 freemsg(mp); 6354 return (LDC_TX_FAILURE); 6355 } 6356 6357 /* 6358 * Note - using first ring only, this may change 6359 * in the future. 6360 */ 6361 if ((dp = ldcp->lane_out.dringp) == NULL) { 6362 DERR(vswp, "%s(%lld): no dring for outbound lane on" 6363 " channel %lld", __func__, ldcp->ldc_id, ldcp->ldc_id); 6364 freemsg(mp); 6365 return (LDC_TX_FAILURE); 6366 } 6367 6368 size = msgsize(mp); 6369 if (size > (size_t)ETHERMAX) { 6370 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 6371 ldcp->ldc_id, size); 6372 freemsg(mp); 6373 return (LDC_TX_FAILURE); 6374 } 6375 6376 /* 6377 * Find a free descriptor 6378 * 6379 * Note: for the moment we are assuming that we will only 6380 * have one dring going from the switch to each of its 6381 * peers. This may change in the future.
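 *
 * For orientation, an illustrative sketch of the private descriptor
 * life cycle this transmit path drives (the reclaim step lives in the
 * dring ACK processing code elsewhere in this file):
 *
 *	VIO_DESC_FREE  --> VIO_DESC_READY
 *		vsw_dring_find_free_desc() claims the slot; the frame
 *		is copied into priv_desc->datap and the public
 *		descriptor is marked READY under dstate_lock.
 *	VIO_DESC_READY --> VIO_DESC_FREE
 *		presumably once the peer has consumed the data and the
 *		corresponding dring ACK has been processed.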
6382 */ 6383 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 6384 D2(vswp, "%s(%lld): no descriptor available for ring " 6385 "at 0x%llx", __func__, ldcp->ldc_id, dp); 6386 6387 /* nothing more we can do */ 6388 status = LDC_TX_NORESOURCES; 6389 goto vsw_dringsend_free_exit; 6390 } else { 6391 D2(vswp, "%s(%lld): free private descriptor found at pos " 6392 "%d addr 0x%llx\n", __func__, ldcp->ldc_id, idx, 6393 priv_desc); 6394 } 6395 6396 /* copy data into the descriptor */ 6397 bufp = priv_desc->datap; 6398 bufp += VNET_IPALIGN; 6399 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 6400 n = MBLKL(bp); 6401 bcopy(bp->b_rptr, bufp, n); 6402 bufp += n; 6403 } 6404 6405 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 6406 6407 pub = priv_desc->descp; 6408 pub->nbytes = priv_desc->datalen; 6409 6410 mutex_enter(&priv_desc->dstate_lock); 6411 pub->hdr.dstate = VIO_DESC_READY; 6412 mutex_exit(&priv_desc->dstate_lock); 6413 6414 /* 6415 * Determine whether or not we need to send a message to our 6416 * peer prompting them to read our newly updated descriptor(s). 6417 */ 6418 mutex_enter(&dp->restart_lock); 6419 if (dp->restart_reqd) { 6420 dp->restart_reqd = B_FALSE; 6421 mutex_exit(&dp->restart_lock); 6422 6423 /* 6424 * Send a vio_dring_msg to peer to prompt them to read 6425 * the updated descriptor ring. 6426 */ 6427 dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA; 6428 dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 6429 dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA; 6430 dring_pkt.tag.vio_sid = ldcp->local_session; 6431 6432 /* Note - for now using first ring */ 6433 dring_pkt.dring_ident = dp->ident; 6434 6435 mutex_enter(&ldcp->lane_out.seq_lock); 6436 dring_pkt.seq_num = ldcp->lane_out.seq_num++; 6437 mutex_exit(&ldcp->lane_out.seq_lock); 6438 6439 /* 6440 * If last_ack_recv is -1 then we know we've not 6441 * received any acks yet, so this must be the first 6442 * msg sent, so set the start to the beginning of the ring. 6443 */ 6444 mutex_enter(&dp->dlock); 6445 if (dp->last_ack_recv == -1) { 6446 dring_pkt.start_idx = 0; 6447 } else { 6448 dring_pkt.start_idx = (dp->last_ack_recv + 1) % 6449 dp->num_descriptors; 6450 } 6451 dring_pkt.end_idx = -1; 6452 mutex_exit(&dp->dlock); 6453 6454 D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__, 6455 ldcp->ldc_id, dp, dring_pkt.dring_ident); 6456 D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n", 6457 __func__, ldcp->ldc_id, dring_pkt.start_idx, 6458 dring_pkt.end_idx, dring_pkt.seq_num); 6459 6460 vsw_send_msg(ldcp, (void *)&dring_pkt, 6461 sizeof (vio_dring_msg_t)); 6462 } else { 6463 mutex_exit(&dp->restart_lock); 6464 D2(vswp, "%s(%lld): updating descp %d", __func__, 6465 ldcp->ldc_id, idx); 6466 } 6467 6468 vsw_dringsend_free_exit: 6469 6470 /* free the message block */ 6471 freemsg(mp); 6472 6473 D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); 6474 return (status); 6475 } 6476 6477 /* 6478 * Send an in-band descriptor message over ldc.
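 *
 * Unlike vsw_dringsend(), no public ring is shared with the peer in
 * this mode; each message carries the LDC memory cookies describing
 * the data region and the peer pulls the data itself. An illustrative
 * sketch of the message assembled below (all field names as used in
 * this function):
 *
 *	vio_ibnd_desc_t msg;
 *
 *	msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
 *	msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
 *	msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
 *	msg.hdr.desc_handle = idx;		(private slot index)
 *	msg.memcookie[0] = priv_desc->memcookie[0];
 *	msg.ncookies = priv_desc->ncookies;
 *	msg.nbytes = size;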
6479 */ 6480 static int 6481 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp) 6482 { 6483 vsw_t *vswp = ldcp->ldc_vswp; 6484 vio_ibnd_desc_t ibnd_msg; 6485 vsw_private_desc_t *priv_desc = NULL; 6486 dring_info_t *dp = NULL; 6487 size_t n, size = 0; 6488 caddr_t bufp; 6489 mblk_t *bp; 6490 int idx, i; 6491 int status = LDC_TX_SUCCESS; 6492 static int warn_msg = 1; 6493 6494 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 6495 6496 ASSERT(mp != NULL); 6497 6498 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 6499 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 6500 DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt", 6501 __func__, ldcp->ldc_id, ldcp->ldc_status, 6502 ldcp->lane_out.lstate); 6503 freemsg(mp); 6504 return (LDC_TX_FAILURE); 6505 } 6506 6507 /* 6508 * We only expect a single dring to exist, which we use 6509 * as an internal buffer rather than a transfer channel. 6510 */ 6511 if ((dp = ldcp->lane_out.dringp) == NULL) { 6512 DERR(vswp, "%s(%lld): no dring for outbound lane", 6513 __func__, ldcp->ldc_id); 6514 DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", 6515 __func__, ldcp->ldc_id, ldcp->ldc_status, 6516 ldcp->lane_out.lstate); 6517 freemsg(mp); 6518 return (LDC_TX_FAILURE); 6519 } 6520 6521 size = msgsize(mp); 6522 if (size > (size_t)ETHERMAX) { 6523 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 6524 ldcp->ldc_id, size); 6525 freemsg(mp); 6526 return (LDC_TX_FAILURE); 6527 } 6528 6529 /* 6530 * Find a free descriptor in our buffer ring 6531 */ 6532 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 6533 if (warn_msg) { 6534 DERR(vswp, "%s(%lld): no descriptor available for ring " 6535 "at 0x%llx", __func__, ldcp->ldc_id, dp); 6536 warn_msg = 0; 6537 } 6538 6539 /* nothing more we can do */ 6540 status = LDC_TX_NORESOURCES; 6541 goto vsw_descrsend_free_exit; 6542 } else { 6543 D2(vswp, "%s(%lld): free private descriptor found at pos " 6544 "%d addr 0x%llx\n", __func__, ldcp->ldc_id, idx, 6545 priv_desc); 6546 warn_msg = 1; 6547 } 6548 6549 /* copy data into the descriptor */ 6550 bufp = priv_desc->datap; 6551 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 6552 n = MBLKL(bp); 6553 bcopy(bp->b_rptr, bufp, n); 6554 bufp += n; 6555 } 6556 6557 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 6558 6559 /* create and send the in-band descp msg */ 6560 ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA; 6561 ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO; 6562 ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA; 6563 ibnd_msg.hdr.tag.vio_sid = ldcp->local_session; 6564 6565 mutex_enter(&ldcp->lane_out.seq_lock); 6566 ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++; 6567 mutex_exit(&ldcp->lane_out.seq_lock); 6568 6569 /* 6570 * Copy the mem cookies describing the data from the 6571 * private region of the descriptor ring into the inband 6572 * descriptor.
6573 */ 6574 for (i = 0; i < priv_desc->ncookies; i++) { 6575 bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i], 6576 sizeof (ldc_mem_cookie_t)); 6577 } 6578 6579 ibnd_msg.hdr.desc_handle = idx; 6580 ibnd_msg.ncookies = priv_desc->ncookies; 6581 ibnd_msg.nbytes = size; 6582 6583 vsw_send_msg(ldcp, (void *)&ibnd_msg, sizeof (vio_ibnd_desc_t)); 6584 6585 vsw_descrsend_free_exit: 6586 6587 /* free the allocated message blocks */ 6588 freemsg(mp); 6589 6590 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 6591 return (status); 6592 } 6593 6594 static void 6595 vsw_send_ver(void *arg) 6596 { 6597 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; 6598 vsw_t *vswp = ldcp->ldc_vswp; 6599 lane_t *lp = &ldcp->lane_out; 6600 vio_ver_msg_t ver_msg; 6601 6602 D1(vswp, "%s enter", __func__); 6603 6604 ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 6605 ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 6606 ver_msg.tag.vio_subtype_env = VIO_VER_INFO; 6607 ver_msg.tag.vio_sid = ldcp->local_session; 6608 6609 ver_msg.ver_major = vsw_versions[0].ver_major; 6610 ver_msg.ver_minor = vsw_versions[0].ver_minor; 6611 ver_msg.dev_class = VDEV_NETWORK_SWITCH; 6612 6613 lp->lstate |= VSW_VER_INFO_SENT; 6614 lp->ver_major = ver_msg.ver_major; 6615 lp->ver_minor = ver_msg.ver_minor; 6616 6617 DUMP_TAG(ver_msg.tag); 6618 6619 vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t)); 6620 6621 D1(vswp, "%s (%lld): exit", __func__, ldcp->ldc_id); 6622 } 6623 6624 static void 6625 vsw_send_attr(vsw_ldc_t *ldcp) 6626 { 6627 vsw_t *vswp = ldcp->ldc_vswp; 6628 lane_t *lp = &ldcp->lane_out; 6629 vnet_attr_msg_t attr_msg; 6630 6631 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); 6632 6633 /* 6634 * Subtype is set to INFO by default 6635 */ 6636 attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 6637 attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 6638 attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO; 6639 attr_msg.tag.vio_sid = ldcp->local_session; 6640 6641 /* payload copied from default settings for lane */ 6642 attr_msg.mtu = lp->mtu; 6643 attr_msg.addr_type = lp->addr_type; 6644 attr_msg.xfer_mode = lp->xfer_mode; 6645 attr_msg.ack_freq = lp->ack_freq; 6646 6647 READ_ENTER(&vswp->if_lockrw); 6648 bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL); 6649 RW_EXIT(&vswp->if_lockrw); 6650 6651 ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT; 6652 6653 DUMP_TAG(attr_msg.tag); 6654 6655 vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t)); 6656 6657 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); 6658 } 6659 6660 /* 6661 * Create dring info msg (which also results in the creation of 6662 * a dring). 6663 */ 6664 static vio_dring_reg_msg_t * 6665 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp) 6666 { 6667 vio_dring_reg_msg_t *mp; 6668 dring_info_t *dp; 6669 vsw_t *vswp = ldcp->ldc_vswp; 6670 6671 D1(vswp, "vsw_create_dring_info_pkt enter\n"); 6672 6673 /* 6674 * If we can't create a dring, obviously no point sending 6675 * a message.
6676 */ 6677 if ((dp = vsw_create_dring(ldcp)) == NULL) 6678 return (NULL); 6679 6680 mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP); 6681 6682 mp->tag.vio_msgtype = VIO_TYPE_CTRL; 6683 mp->tag.vio_subtype = VIO_SUBTYPE_INFO; 6684 mp->tag.vio_subtype_env = VIO_DRING_REG; 6685 mp->tag.vio_sid = ldcp->local_session; 6686 6687 /* payload */ 6688 mp->num_descriptors = dp->num_descriptors; 6689 mp->descriptor_size = dp->descriptor_size; 6690 mp->options = dp->options; 6691 mp->ncookies = dp->ncookies; 6692 bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t)); 6693 6694 mp->dring_ident = 0; 6695 6696 D1(vswp, "vsw_create_dring_info_pkt exit\n"); 6697 6698 return (mp); 6699 } 6700 6701 static void 6702 vsw_send_dring_info(vsw_ldc_t *ldcp) 6703 { 6704 vio_dring_reg_msg_t *dring_msg; 6705 vsw_t *vswp = ldcp->ldc_vswp; 6706 6707 D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id); 6708 6709 dring_msg = vsw_create_dring_info_pkt(ldcp); 6710 if (dring_msg == NULL) { 6711 cmn_err(CE_WARN, "vsw_send_dring_info: error creating msg"); 6712 return; 6713 } 6714 6715 ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT; 6716 6717 DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg); 6718 6719 vsw_send_msg(ldcp, dring_msg, 6720 sizeof (vio_dring_reg_msg_t)); 6721 6722 kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t)); 6723 6724 D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id); 6725 } 6726 6727 static void 6728 vsw_send_rdx(vsw_ldc_t *ldcp) 6729 { 6730 vsw_t *vswp = ldcp->ldc_vswp; 6731 vio_rdx_msg_t rdx_msg; 6732 6733 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); 6734 6735 rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 6736 rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 6737 rdx_msg.tag.vio_subtype_env = VIO_RDX; 6738 rdx_msg.tag.vio_sid = ldcp->local_session; 6739 6740 ldcp->lane_out.lstate |= VSW_RDX_INFO_SENT; 6741 6742 DUMP_TAG(rdx_msg.tag); 6743 6744 vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t)); 6745 6746 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); 6747 } 6748 6749 /* 6750 * Generic routine to send message out over ldc channel. 6751 */ 6752 static void 6753 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size) 6754 { 6755 int rv; int retries = vsw_wretries; /* local copy; don't consume the global tunable */ 6756 size_t msglen = size; 6757 vio_msg_tag_t *tag = (vio_msg_tag_t *)msgp; 6758 vsw_t *vswp = ldcp->ldc_vswp; 6759 6760 D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes", 6761 ldcp->ldc_id, size); 6762 6763 D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype); 6764 D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype); 6765 D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env); 6766 6767 mutex_enter(&ldcp->ldc_txlock); 6768 do { 6769 msglen = size; 6770 rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen); 6771 } while (rv == EWOULDBLOCK && --retries > 0); 6772 6773 if ((rv != 0) || (msglen != size)) { 6774 DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) " 6775 "rv(%d) size (%d) msglen(%ld)\n", ldcp->ldc_id, 6776 rv, size, msglen); 6777 } 6778 mutex_exit(&ldcp->ldc_txlock); 6779 6780 /* channel has been reset */ 6781 if (rv == ECONNRESET) { 6782 vsw_handle_reset(ldcp); 6783 } 6784 6785 D1(vswp, "vsw_send_msg (%lld) exit : sent %ld bytes", 6786 ldcp->ldc_id, msglen); 6787 } 6788 6789 /* 6790 * Add an entry into the FDB for the given MAC address and port_id. 6791 * Returns 0 on success, 1 on failure. 6792 * 6793 * The lock protecting the FDB must be held by the calling process.
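 *
 * Illustrative usage sketch (not driver code): how the switching path
 * ties these routines together. KEY_HASH() (vsw_fdb.h) is assumed
 * here to pack the six bytes of the MAC address into a 64-bit
 * mod_hash key:
 *
 *	struct ether_header *ehp = (struct ether_header *)mp->b_rptr;
 *	vsw_port_t *port;
 *
 *	if ((port = vsw_lookup_fdb(vswp, ehp)) != NULL)
 *		(void) vsw_portsend(port, mp);	     known unicast dest
 *	else
 *		(void) vsw_forward_all(vswp, mp, caller, arg);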
6794 */ 6795 static int 6796 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port) 6797 { 6798 uint64_t addr = 0; 6799 6800 D1(vswp, "%s: enter", __func__); 6801 6802 KEY_HASH(addr, port->p_macaddr); 6803 6804 D2(vswp, "%s: key = 0x%llx", __func__, addr); 6805 6806 /* 6807 * Note: duplicate keys will be rejected by mod_hash. 6808 */ 6809 if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr, 6810 (mod_hash_val_t)port) != 0) { 6811 DERR(vswp, "%s: unable to add entry into fdb.", __func__); 6812 return (1); 6813 } 6814 6815 D1(vswp, "%s: exit", __func__); 6816 return (0); 6817 } 6818 6819 /* 6820 * Remove an entry from FDB. 6821 * Returns 0 on success, 1 on failure. 6822 */ 6823 static int 6824 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port) 6825 { 6826 uint64_t addr = 0; 6827 6828 D1(vswp, "%s: enter", __func__); 6829 6830 KEY_HASH(addr, port->p_macaddr); 6831 6832 D2(vswp, "%s: key = 0x%llx", __func__, addr); 6833 6834 (void) mod_hash_destroy(vswp->fdb, (mod_hash_key_t)addr); 6835 6836 D1(vswp, "%s: exit", __func__); 6837 6838 return (0); 6839 } 6840 6841 /* 6842 * Search fdb for a given mac address. 6843 * Returns pointer to the entry if found, else returns NULL. 6844 */ 6845 static vsw_port_t * 6846 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp) 6847 { 6848 uint64_t key = 0; 6849 vsw_port_t *port = NULL; 6850 6851 D1(vswp, "%s: enter", __func__); 6852 6853 KEY_HASH(key, ehp->ether_dhost); 6854 6855 D2(vswp, "%s: key = 0x%llx", __func__, key); 6856 6857 if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key, 6858 (mod_hash_val_t *)&port) != 0) { 6859 return (NULL); 6860 } 6861 6862 D1(vswp, "%s: exit", __func__); 6863 6864 return (port); 6865 } 6866 6867 /* 6868 * Add or remove multicast address(es). 6869 * 6870 * Returns 0 on success, 1 on failure. 6871 */ 6872 static int 6873 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port) 6874 { 6875 mcst_addr_t *mcst_p = NULL; 6876 vsw_t *vswp = port->p_vswp; 6877 uint64_t addr = 0x0; 6878 int i, ret; 6879 6880 D1(vswp, "%s: enter", __func__); 6881 6882 D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count); 6883 6884 if (vswp->mh == NULL) 6885 return (1); 6886 6887 for (i = 0; i < mcst_pkt->count; i++) { 6888 /* 6889 * Convert address into form that can be used 6890 * as hash table key. 6891 */ 6892 KEY_HASH(addr, mcst_pkt->mca[i]); 6893 6894 /* 6895 * Add or delete the specified address/port combination. 6896 */ 6897 if (mcst_pkt->set == 0x1) { 6898 D3(vswp, "%s: adding multicast address 0x%llx for " 6899 "port %d", __func__, addr, port->p_instance); 6900 if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 6901 /* 6902 * Update the list of multicast 6903 * addresses contained within the 6904 * port structure to include this new 6905 * one. 6906 */ 6907 mcst_p = kmem_alloc(sizeof (mcst_addr_t), 6908 KM_NOSLEEP); 6909 if (mcst_p == NULL) { 6910 DERR(vswp, "%s: unable to alloc mem", 6911 __func__); /* back out the mFDB entry added above */ (void) vsw_del_mcst(vswp, VSW_VNETPORT, addr, port); 6912 return (1); 6913 } 6914 6915 mcst_p->nextp = NULL; 6916 mcst_p->addr = addr; 6917 6918 mutex_enter(&port->mca_lock); 6919 mcst_p->nextp = port->mcap; 6920 port->mcap = mcst_p; 6921 mutex_exit(&port->mca_lock); 6922 6923 /* 6924 * Program the address into HW. If the addr 6925 * has already been programmed then the MAC 6926 * just increments a ref counter (which is 6927 * used when the address is being deleted).
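 *
 * Illustrative sketch of the reference counting described above (the
 * exact behaviour is the MAC layer's, not this driver's):
 *
 *	mac_multicst_add(mh, addr);	first add programs the HW,
 *					refcnt 0 -> 1
 *	mac_multicst_add(mh, addr);	refcnt 1 -> 2, HW unchanged
 *	mac_multicst_remove(mh, addr);	refcnt 2 -> 1, addr still
 *					programmed
 *	mac_multicst_remove(mh, addr);	refcnt 1 -> 0, addr removed
 *					from HW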
6928 */ 6929 ret = mac_multicst_add(vswp->mh, 6930 (uchar_t *)&mcst_pkt->mca[i]); 6931 if (ret) { 6932 cmn_err(CE_WARN, "!unable to add " 6933 "multicast address"); 6934 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 6935 addr, port); 6936 vsw_del_addr(VSW_VNETPORT, port, addr); 6937 return (ret); 6938 } 6939 6940 } else { 6941 DERR(vswp, "%s: error adding multicast " 6942 "address 0x%llx for port %d", 6943 __func__, addr, port->p_instance); 6944 return (1); 6945 } 6946 } else { 6947 /* 6948 * Delete an entry from the multicast hash 6949 * table and update the address list 6950 * appropriately. 6951 */ 6952 if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 6953 D3(vswp, "%s: deleting multicast address " 6954 "0x%llx for port %d", __func__, addr, 6955 port->p_instance); 6956 6957 vsw_del_addr(VSW_VNETPORT, port, addr); 6958 6959 /* 6960 * Remove the address from HW. The address 6961 * will actually only be removed once the ref 6962 * count within the MAC layer has dropped to 6963 * zero. I.e. we can safely call this fn even 6964 * if other ports are interested in this 6965 * address. 6966 */ 6967 (void) mac_multicst_remove(vswp->mh, 6968 (uchar_t *)&mcst_pkt->mca[i]); 6969 6970 } else { 6971 DERR(vswp, "%s: error deleting multicast " 6972 "addr 0x%llx for port %d", 6973 __func__, addr, port->p_instance); 6974 return (1); 6975 } 6976 } 6977 } 6978 D1(vswp, "%s: exit", __func__); 6979 return (0); 6980 } 6981 6982 /* 6983 * Add a new multicast entry. 6984 * 6985 * Search hash table based on address. If match found then 6986 * update associated val (which is chain of ports), otherwise 6987 * create new key/val (addr/port) pair and insert into table. 6988 */ 6989 static int 6990 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 6991 { 6992 int dup = 0; 6993 int rv = 0; 6994 mfdb_ent_t *ment = NULL; 6995 mfdb_ent_t *tmp_ent = NULL; 6996 mfdb_ent_t *new_ent = NULL; 6997 void *tgt = NULL; 6998 6999 if (devtype == VSW_VNETPORT) { 7000 /* 7001 * Being invoked from a vnet. 7002 */ 7003 ASSERT(arg != NULL); 7004 tgt = arg; 7005 D2(NULL, "%s: port %d : address 0x%llx", __func__, 7006 ((vsw_port_t *)arg)->p_instance, addr); 7007 } else { 7008 /* 7009 * We are being invoked via the m_multicst mac entry 7010 * point. 7011 */ 7012 D2(NULL, "%s: address 0x%llx", __func__, addr); 7013 tgt = (void *)vswp; 7014 } 7015 7016 WRITE_ENTER(&vswp->mfdbrw); 7017 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 7018 (mod_hash_val_t *)&ment) != 0) { 7019 7020 /* address not currently in table */ 7021 ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 7022 ment->d_addr = (void *)tgt; 7023 ment->d_type = devtype; 7024 ment->nextp = NULL; 7025 7026 if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr, 7027 (mod_hash_val_t)ment) != 0) { 7028 DERR(vswp, "%s: hash table insertion failed", __func__); 7029 kmem_free(ment, sizeof (mfdb_ent_t)); 7030 rv = 1; 7031 } else { 7032 D2(vswp, "%s: added initial entry for 0x%llx to " 7033 "table", __func__, addr); 7034 } 7035 } else { 7036 /* 7037 * Address in table. Check to see if specified port 7038 * is already associated with the address. If not, add 7039 * it now.
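 *
 * The value stored against each key in the mfdb hash is a
 * NULL-terminated chain of mfdb_ent_t, one entry per interested
 * device; e.g. (illustrative):
 *
 *	key (mcast addr) -> { d_addr = portA, d_type = VSW_VNETPORT }
 *			 -> { d_addr = portB, d_type = VSW_VNETPORT }
 *			 -> { d_addr = vswp,  d_type = VSW_LOCALDEV }
 *			 -> NULL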
7040 */ 7041 tmp_ent = ment; 7042 while (tmp_ent != NULL) { 7043 if (tmp_ent->d_addr == (void *)tgt) { 7044 if (devtype == VSW_VNETPORT) { 7045 DERR(vswp, "%s: duplicate port entry " 7046 "found for portid %d and key " 7047 "0x%llx", __func__, 7048 ((vsw_port_t *)arg)->p_instance, 7049 addr); 7050 } else { 7051 DERR(vswp, "%s: duplicate entry found " 7052 "for key 0x%llx", 7053 __func__, addr); 7054 } 7055 rv = 1; 7056 dup = 1; 7057 break; 7058 } 7059 tmp_ent = tmp_ent->nextp; 7060 } 7061 7062 /* 7063 * Port not on list so add it to end now. 7064 */ 7065 if (dup == 0) { 7066 D2(vswp, "%s: added entry for 0x%llx to table", 7067 __func__, addr); 7068 new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 7069 new_ent->d_addr = (void *)tgt; 7070 new_ent->d_type = devtype; 7071 new_ent->nextp = NULL; 7072 7073 tmp_ent = ment; 7074 while (tmp_ent->nextp != NULL) 7075 tmp_ent = tmp_ent->nextp; 7076 7077 tmp_ent->nextp = new_ent; 7078 } 7079 } 7080 7081 RW_EXIT(&vswp->mfdbrw); 7082 return (rv); 7083 } 7084 7085 /* 7086 * Remove a multicast entry from the hashtable. 7087 * 7088 * Search hash table based on address. If match found, scan 7089 * list of ports associated with address. If specified port 7090 * found remove it from list. 7091 */ 7092 static int 7093 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 7094 { 7095 mfdb_ent_t *ment = NULL; 7096 mfdb_ent_t *curr_p, *prev_p; 7097 void *tgt = NULL; 7098 7099 D1(vswp, "%s: enter", __func__); 7100 7101 if (devtype == VSW_VNETPORT) { 7102 tgt = (vsw_port_t *)arg; 7103 D2(vswp, "%s: removing port %d from mFDB for address" 7104 " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, 7105 addr); 7106 } else { 7107 D2(vswp, "%s: removing entry", __func__); 7108 tgt = (void *)vswp; 7109 } 7110 7111 WRITE_ENTER(&vswp->mfdbrw); 7112 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 7113 (mod_hash_val_t *)&ment) != 0) { 7114 D2(vswp, "%s: address 0x%llx not in table", __func__, addr); 7115 RW_EXIT(&vswp->mfdbrw); 7116 return (1); 7117 } 7118 7119 prev_p = curr_p = ment; 7120 7121 while (curr_p != NULL) { 7122 if (curr_p->d_addr == (void *)tgt) { 7123 if (devtype == VSW_VNETPORT) { 7124 D2(vswp, "%s: port %d found", __func__, 7125 ((vsw_port_t *)tgt)->p_instance); 7126 } else { 7127 D2(vswp, "%s: instance found", __func__); 7128 } 7129 7130 if (prev_p == curr_p) { 7131 /* 7132 * head of list, if no other element is in 7133 * list then destroy this entry, otherwise 7134 * just replace it with updated value. 7135 */ 7136 ment = curr_p->nextp; 7137 kmem_free(curr_p, sizeof (mfdb_ent_t)); 7138 if (ment == NULL) { 7139 (void) mod_hash_destroy(vswp->mfdb, 7140 (mod_hash_key_t)addr); 7141 } else { 7142 (void) mod_hash_replace(vswp->mfdb, 7143 (mod_hash_key_t)addr, 7144 (mod_hash_val_t)ment); 7145 } 7146 } else { 7147 /* 7148 * Not head of list, no need to do 7149 * replacement, just adjust list pointers. 7150 */ 7151 prev_p->nextp = curr_p->nextp; 7152 kmem_free(curr_p, sizeof (mfdb_ent_t)); 7153 } 7154 break; 7155 } 7156 7157 prev_p = curr_p; 7158 curr_p = curr_p->nextp; 7159 } 7160 7161 RW_EXIT(&vswp->mfdbrw); 7162 7163 D1(vswp, "%s: exit", __func__); 7164 7165 return (0); 7166 } 7167 7168 /* 7169 * Port is being deleted, but has registered an interest in one 7170 * or more multicast groups. Using the list of addresses maintained 7171 * within the port structure, find the appropriate entry in the hash 7172 * table and remove this port from the list of interested ports.
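 *
 * Note on lock ordering, as exercised below: mca_lock is held across
 * the calls to vsw_del_mcst(), which itself takes mfdbrw as a writer,
 * giving the implied ordering mca_lock -> mfdbrw. Illustrative shape:
 *
 *	mutex_enter(&port->mca_lock);		per-port mcast list
 *	    WRITE_ENTER(&vswp->mfdbrw);		inside vsw_del_mcst()
 *	    RW_EXIT(&vswp->mfdbrw);
 *	mutex_exit(&port->mca_lock);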
7173 */ 7174 static void 7175 vsw_del_mcst_port(vsw_port_t *port) 7176 { 7177 mcst_addr_t *mcst_p = NULL; 7178 vsw_t *vswp = port->p_vswp; 7179 7180 D1(vswp, "%s: enter", __func__); 7181 7182 mutex_enter(&port->mca_lock); 7183 while (port->mcap != NULL) { 7184 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 7185 port->mcap->addr, port); 7186 7187 mcst_p = port->mcap->nextp; 7188 kmem_free(port->mcap, sizeof (mcst_addr_t)); 7189 port->mcap = mcst_p; 7190 } 7191 mutex_exit(&port->mca_lock); 7192 7193 D1(vswp, "%s: exit", __func__); 7194 } 7195 7196 /* 7197 * This vsw instance is detaching, but has registered an interest in one 7198 * or more multicast groups. Using the list of addresses maintained 7199 * within the vsw structure, find the appropriate entry in the hash 7200 * table and remove this instance from the list of interested ports. 7201 */ 7202 static void 7203 vsw_del_mcst_vsw(vsw_t *vswp) 7204 { 7205 mcst_addr_t *next_p = NULL; 7206 7207 D1(vswp, "%s: enter", __func__); 7208 7209 mutex_enter(&vswp->mca_lock); 7210 7211 while (vswp->mcap != NULL) { 7212 DERR(vswp, "%s: deleting addr 0x%llx", 7213 __func__, vswp->mcap->addr); 7214 (void) vsw_del_mcst(vswp, VSW_LOCALDEV, 7215 vswp->mcap->addr, NULL); 7216 7217 next_p = vswp->mcap->nextp; 7218 kmem_free(vswp->mcap, sizeof (mcst_addr_t)); 7219 vswp->mcap = next_p; 7220 } 7221 7222 vswp->mcap = NULL; 7223 mutex_exit(&vswp->mca_lock); 7224 7225 D1(vswp, "%s: exit", __func__); 7226 } 7227 7228 7229 /* 7230 * Remove the specified address from the list of addresses maintained 7231 * in this port node. 7232 */ 7233 static void 7234 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr) 7235 { 7236 vsw_t *vswp = NULL; 7237 vsw_port_t *port = NULL; 7238 mcst_addr_t *prev_p = NULL; 7239 mcst_addr_t *curr_p = NULL; 7240 7241 D1(NULL, "%s: enter : devtype %d : addr 0x%llx", 7242 __func__, devtype, addr); 7243 7244 if (devtype == VSW_VNETPORT) { 7245 port = (vsw_port_t *)arg; 7246 mutex_enter(&port->mca_lock); 7247 prev_p = curr_p = port->mcap; 7248 } else { 7249 vswp = (vsw_t *)arg; 7250 mutex_enter(&vswp->mca_lock); 7251 prev_p = curr_p = vswp->mcap; 7252 } 7253 7254 while (curr_p != NULL) { 7255 if (curr_p->addr == addr) { 7256 D2(NULL, "%s: address found", __func__); 7257 /* match found */ 7258 if (prev_p == curr_p) { 7259 /* list head */ 7260 if (devtype == VSW_VNETPORT) 7261 port->mcap = curr_p->nextp; 7262 else 7263 vswp->mcap = curr_p->nextp; 7264 } else { 7265 prev_p->nextp = curr_p->nextp; 7266 } 7267 kmem_free(curr_p, sizeof (mcst_addr_t)); 7268 break; 7269 } else { 7270 prev_p = curr_p; 7271 curr_p = curr_p->nextp; 7272 } 7273 } 7274 7275 if (devtype == VSW_VNETPORT) 7276 mutex_exit(&port->mca_lock); 7277 else 7278 mutex_exit(&vswp->mca_lock); 7279 7280 D1(NULL, "%s: exit", __func__); 7281 } 7282 7283 /* 7284 * Creates a descriptor ring (dring) and links it into the 7285 * list of outbound drings for this channel. 7286 * 7287 * Returns NULL if creation failed.
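 *
 * Illustrative summary of the export sequence performed below (the
 * LDC API calls are exactly those used in this function):
 *
 *	ldc_mem_dring_create(nelem, elsize, &handle)  allocate the ring
 *	ldc_mem_dring_info(handle, &minfo)            get base address
 *	vsw_setup_ring(ldcp, dp)                      init descriptors
 *	ldc_mem_dring_bind(ldc_handle, handle, ...)   export the ring,
 *	                                              yielding cookies
 *
 * The resulting cookies are advertised to the peer in the dring
 * register message built by vsw_create_dring_info_pkt().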
7288 */ 7289 static dring_info_t * 7290 vsw_create_dring(vsw_ldc_t *ldcp) 7291 { 7292 vsw_private_desc_t *priv_addr = NULL; 7293 vsw_t *vswp = ldcp->ldc_vswp; 7294 ldc_mem_info_t minfo; 7295 dring_info_t *dp, *tp; 7296 int i; 7297 7298 dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 7299 7300 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 7301 7302 /* create public section of ring */ 7303 if ((ldc_mem_dring_create(VSW_RING_NUM_EL, 7304 VSW_PUB_SIZE, &dp->handle)) != 0) { 7305 7306 DERR(vswp, "vsw_create_dring(%lld): ldc dring create " 7307 "failed", ldcp->ldc_id); 7308 goto create_fail_exit; 7309 } 7310 7311 ASSERT(dp->handle != NULL); 7312 7313 /* 7314 * Get the base address of the public section of the ring. 7315 */ 7316 if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) { 7317 DERR(vswp, "vsw_create_dring(%lld): dring info failed\n", 7318 ldcp->ldc_id); 7319 goto dring_fail_exit; 7320 } else { 7321 ASSERT(minfo.vaddr != 0); 7322 dp->pub_addr = minfo.vaddr; 7323 } 7324 7325 dp->num_descriptors = VSW_RING_NUM_EL; 7326 dp->descriptor_size = VSW_PUB_SIZE; 7327 dp->options = VIO_TX_DRING; 7328 dp->ncookies = 1; /* guaranteed by ldc */ 7329 7330 /* 7331 * create private portion of ring 7332 */ 7333 dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc( 7334 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP); 7335 7336 if (vsw_setup_ring(ldcp, dp)) { 7337 DERR(vswp, "%s: unable to setup ring", __func__); 7338 goto dring_fail_exit; 7339 } 7340 7341 /* haven't used any descriptors yet */ 7342 dp->end_idx = 0; 7343 dp->last_ack_recv = -1; 7344 7345 /* bind dring to the channel */ 7346 if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle, 7347 LDC_SHADOW_MAP, LDC_MEM_RW, 7348 &dp->cookie[0], &dp->ncookies)) != 0) { 7349 DERR(vswp, "vsw_create_dring: unable to bind to channel " 7350 "%lld", ldcp->ldc_id); 7351 goto dring_fail_exit; 7352 } 7353 7354 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 7355 dp->restart_reqd = B_TRUE; 7356 7357 /* 7358 * Only ever create rings for the outgoing lane. Link it onto 7359 * the end of the list. 7360 */ 7361 if (ldcp->lane_out.dringp == NULL) { 7362 D2(vswp, "vsw_create_dring: adding first outbound ring"); 7363 ldcp->lane_out.dringp = dp; 7364 } else { 7365 tp = ldcp->lane_out.dringp; 7366 while (tp->next != NULL) 7367 tp = tp->next; 7368 7369 tp->next = dp; 7370 } 7371 7372 return (dp); 7373 7374 dring_fail_exit: 7375 (void) ldc_mem_dring_destroy(dp->handle); 7376 7377 create_fail_exit: 7378 if (dp->priv_addr != NULL) { 7379 priv_addr = dp->priv_addr; 7380 for (i = 0; i < VSW_RING_NUM_EL; i++) { 7381 if (priv_addr->memhandle != NULL) 7382 (void) ldc_mem_free_handle( 7383 priv_addr->memhandle); 7384 priv_addr++; 7385 } 7386 kmem_free(dp->priv_addr, 7387 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 7388 } 7389 mutex_destroy(&dp->dlock); 7390 7391 kmem_free(dp, sizeof (dring_info_t)); 7392 return (NULL); 7393 } 7394 7395 /* 7396 * Create a ring consisting of just a private portion and link 7397 * it into the list of rings for the outbound lane. 7398 * 7399 * These types of rings are used primarily for temporary data 7400 * storage (i.e. as data buffers).
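 *
 * Unlike vsw_create_dring(), nothing here is exported over the
 * channel: there is no public section, and no ldc_mem_dring_create()
 * or ldc_mem_dring_bind() call is made. Illustrative contrast:
 *
 *	dring:     pub_addr + priv_addr, ring bound to the channel,
 *	           cookies advertised to the peer;
 *	privring:  pub_addr == NULL, only the per-descriptor data
 *	           buffers are bound (in vsw_setup_ring()), and their
 *	           cookies travel in each in-band descriptor message.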
7401 */ 7402 void 7403 vsw_create_privring(vsw_ldc_t *ldcp) 7404 { 7405 dring_info_t *dp, *tp; 7406 vsw_t *vswp = ldcp->ldc_vswp; 7407 7408 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 7409 7410 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 7411 7412 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 7413 7414 /* no public section */ 7415 dp->pub_addr = NULL; 7416 7417 dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) * 7418 VSW_RING_NUM_EL), KM_SLEEP); 7419 7420 dp->num_descriptors = VSW_RING_NUM_EL; 7421 7422 if (vsw_setup_ring(ldcp, dp)) { 7423 DERR(vswp, "%s: setup of ring failed", __func__); 7424 kmem_free(dp->priv_addr, 7425 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 7426 mutex_destroy(&dp->dlock); 7427 kmem_free(dp, sizeof (dring_info_t)); 7428 return; 7429 } 7430 7431 /* haven't used any descriptors yet */ 7432 dp->end_idx = 0; 7433 7434 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 7435 dp->restart_reqd = B_TRUE; 7436 7437 /* 7438 * Only ever create rings for the outgoing lane. Link it onto 7439 * the end of the list. 7440 */ 7441 if (ldcp->lane_out.dringp == NULL) { 7442 D2(vswp, "%s: adding first outbound privring", __func__); 7443 ldcp->lane_out.dringp = dp; 7444 } else { 7445 tp = ldcp->lane_out.dringp; 7446 while (tp->next != NULL) 7447 tp = tp->next; 7448 7449 tp->next = dp; 7450 } 7451 7452 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 7453 } 7454 7455 /* 7456 * Setup the descriptors in the dring. Returns 0 on success, 1 on 7457 * failure. 7458 */ 7459 int 7460 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp) 7461 { 7462 vnet_public_desc_t *pub_addr = NULL; 7463 vsw_private_desc_t *priv_addr = NULL; 7464 vsw_t *vswp = ldcp->ldc_vswp; 7465 uint64_t *tmpp; 7466 uint64_t offset = 0; 7467 uint32_t ncookies = 0; 7468 static char *name = "vsw_setup_ring"; 7469 int i, j, nc, rv; 7470 7471 priv_addr = dp->priv_addr; 7472 pub_addr = dp->pub_addr; 7473 7474 /* public section may be null but private should never be */ 7475 ASSERT(priv_addr != NULL); 7476 7477 /* 7478 * Allocate the region of memory which will be used to hold 7479 * the data the descriptors will refer to. 7480 */ 7481 dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ); 7482 dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP); 7483 7484 D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name, 7485 dp->data_sz, dp->data_addr); 7486 7487 tmpp = (uint64_t *)dp->data_addr; 7488 offset = VSW_RING_EL_DATA_SZ / sizeof (*tmpp); /* stride in uint64_t units */ 7489 7490 /* 7491 * Initialise some of the private and public (if they exist) 7492 * descriptor fields.
7493 */ 7494 for (i = 0; i < VSW_RING_NUM_EL; i++) { 7495 mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL); 7496 7497 if ((ldc_mem_alloc_handle(ldcp->ldc_handle, 7498 &priv_addr->memhandle)) != 0) { 7499 DERR(vswp, "%s: alloc mem handle failed", name); 7500 goto setup_ring_cleanup; 7501 } 7502 7503 priv_addr->datap = (void *)tmpp; 7504 7505 rv = ldc_mem_bind_handle(priv_addr->memhandle, 7506 (caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ, 7507 LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W, 7508 &(priv_addr->memcookie[0]), &ncookies); 7509 if (rv != 0) { 7510 DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed " 7511 "(rv %d)", name, ldcp->ldc_id, rv); 7512 goto setup_ring_cleanup; 7513 } 7514 priv_addr->bound = 1; 7515 7516 D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx", 7517 name, i, priv_addr->memcookie[0].addr, 7518 priv_addr->memcookie[0].size); 7519 7520 if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) { 7521 DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned " 7522 "invalid num of cookies (%d) for size 0x%llx", 7523 name, ldcp->ldc_id, ncookies, 7524 VSW_RING_EL_DATA_SZ); 7525 7526 goto setup_ring_cleanup; 7527 } else { 7528 for (j = 1; j < ncookies; j++) { 7529 rv = ldc_mem_nextcookie(priv_addr->memhandle, 7530 &(priv_addr->memcookie[j])); 7531 if (rv != 0) { 7532 DERR(vswp, "%s: ldc_mem_nextcookie " 7533 "failed rv (%d)", name, rv); 7534 goto setup_ring_cleanup; 7535 } 7536 D3(vswp, "%s: memcookie %d : addr 0x%llx : " 7537 "size 0x%llx", name, j, 7538 priv_addr->memcookie[j].addr, 7539 priv_addr->memcookie[j].size); 7540 } 7541 7542 } 7543 priv_addr->ncookies = ncookies; 7544 priv_addr->dstate = VIO_DESC_FREE; 7545 7546 if (pub_addr != NULL) { 7547 7548 /* link pub and private sides */ 7549 priv_addr->descp = pub_addr; 7550 7551 pub_addr->ncookies = priv_addr->ncookies; 7552 7553 for (nc = 0; nc < pub_addr->ncookies; nc++) { 7554 bcopy(&priv_addr->memcookie[nc], 7555 &pub_addr->memcookie[nc], 7556 sizeof (ldc_mem_cookie_t)); 7557 } 7558 7559 pub_addr->hdr.dstate = VIO_DESC_FREE; 7560 pub_addr++; 7561 } 7562 7563 /* 7564 * move to next element in the dring and the next 7565 * position in the data buffer. 7566 */ 7567 priv_addr++; 7568 tmpp += offset; 7569 } 7570 7571 return (0); 7572 7573 setup_ring_cleanup: 7574 priv_addr = dp->priv_addr; 7575 7576 for (j = 0; j < i; j++) { 7577 (void) ldc_mem_unbind_handle(priv_addr->memhandle); 7578 (void) ldc_mem_free_handle(priv_addr->memhandle); 7579 7580 mutex_destroy(&priv_addr->dstate_lock); 7581 7582 priv_addr++; 7583 } 7584 kmem_free(dp->data_addr, dp->data_sz); 7585 7586 return (1); 7587 } 7588 7589 /* 7590 * Searches the private section of a ring for a free descriptor, 7591 * starting at the location of the last free descriptor found 7592 * previously. 7593 * 7594 * Returns 0 if free descriptor is available, and updates state 7595 * of private descriptor to VIO_DESC_READY, otherwise returns 1. 7596 * 7597 * FUTURE: might need to return contiguous range of descriptors 7598 * as dring info msg assumes all will be contiguous. 
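 *
 * Illustrative usage, as in the transmit paths above:
 *
 *	vsw_private_desc_t *priv;
 *	int idx;
 *
 *	if (vsw_dring_find_free_desc(dp, &priv, &idx) != 0)
 *		return (LDC_TX_NORESOURCES);	(ring full)
 *	(fill priv->datap, then publish the public descriptor state)
 *
 * Because end_idx advances by one on each successful claim, the
 * search is a single-probe circular allocator: only the slot after
 * the last one handed out is examined, so a single in-flight (still
 * READY) descriptor at that position reports the ring as full.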
7599 */ 7600 static int 7601 vsw_dring_find_free_desc(dring_info_t *dringp, 7602 vsw_private_desc_t **priv_p, int *idx) 7603 { 7604 vsw_private_desc_t *addr = NULL; 7605 int num = VSW_RING_NUM_EL; 7606 int ret = 1; 7607 7608 D1(NULL, "%s enter\n", __func__); 7609 7610 ASSERT(dringp->priv_addr != NULL); 7611 7612 D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld", 7613 __func__, dringp, dringp->end_idx); 7614 7615 addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx; 7616 7617 mutex_enter(&addr->dstate_lock); 7618 if (addr->dstate == VIO_DESC_FREE) { 7619 addr->dstate = VIO_DESC_READY; 7620 *priv_p = addr; 7621 *idx = dringp->end_idx; 7622 dringp->end_idx = (dringp->end_idx + 1) % num; 7623 ret = 0; 7624 7625 } 7626 mutex_exit(&addr->dstate_lock); 7627 7628 /* ring full */ 7629 if (ret == 1) { 7630 D2(NULL, "%s: no desp free: started at %d", __func__, 7631 dringp->end_idx); 7632 } 7633 7634 D1(NULL, "%s: exit\n", __func__); 7635 7636 return (ret); 7637 } 7638 7639 /* 7640 * Map from a dring identifier to the ring itself. Returns 7641 * pointer to ring or NULL if no match found. 7642 */ 7643 static dring_info_t * 7644 vsw_ident2dring(lane_t *lane, uint64_t ident) 7645 { 7646 dring_info_t *dp = NULL; 7647 7648 for (dp = lane->dringp; dp != NULL; dp = dp->next) { 7649 if (dp->ident == ident) 7650 break; 7651 } 7660 7661 return (dp); 7662 } 7663 7664 /* 7665 * Set the default lane attributes. These are copied into 7666 * the attr msg we send to our peer. If they are not acceptable 7667 * then (currently) the handshake ends. 7668 */ 7669 static void 7670 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp) 7671 { 7672 bzero(lp, sizeof (lane_t)); 7673 7674 READ_ENTER(&vswp->if_lockrw); 7675 ether_copy(&(vswp->if_addr), &(lp->addr)); 7676 RW_EXIT(&vswp->if_lockrw); 7677 7678 lp->mtu = VSW_MTU; 7679 lp->addr_type = ADDR_TYPE_MAC; 7680 lp->xfer_mode = VIO_DRING_MODE; 7681 lp->ack_freq = 0; /* for shared mode */ 7682 7683 mutex_enter(&lp->seq_lock); 7684 lp->seq_num = VNET_ISS; 7685 mutex_exit(&lp->seq_lock); 7686 } 7687 7688 /* 7689 * Verify that the attributes are acceptable. 7690 * 7691 * FUTURE: If some attributes are not acceptable, change them 7692 * to our desired values. 7693 */ 7694 static int 7695 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port) 7696 { 7697 int ret = 0; 7698 7699 D1(NULL, "vsw_check_attr enter\n"); 7700 7701 /* 7702 * Note: we currently only support in-band descriptors 7703 * and descriptor rings, not packet-based transfer (VIO_PKT_MODE). 7704 */ 7705 if ((pkt->xfer_mode != VIO_DESC_MODE) && 7706 (pkt->xfer_mode != VIO_DRING_MODE)) { 7707 D2(NULL, "vsw_check_attr: unknown mode %x\n", 7708 pkt->xfer_mode); 7709 ret = 1; 7710 } 7711 7712 /* Only support MAC addresses at moment. */ 7713 if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) { 7714 D2(NULL, "vsw_check_attr: invalid addr_type %x, " 7715 "or address 0x%llx\n", pkt->addr_type, 7716 pkt->addr); 7717 ret = 1; 7718 } 7719 7720 /* 7721 * MAC address supplied by device should match that stored 7722 * in the vsw-port OBP node. Need to decide what to do if they 7723 * don't match; for the moment just warn but don't fail.
7724 */ 7725 if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) { 7726 DERR(NULL, "vsw_check_attr: device supplied address " 7727 "0x%llx doesn't match node address 0x%llx\n", 7728 pkt->addr, port->p_macaddr); 7729 } 7730 7731 /* 7732 * Ack freq only makes sense in pkt mode, in shared 7733 * mode the ring descriptors say whether or not to 7734 * send back an ACK. 7735 */ 7736 if ((pkt->xfer_mode == VIO_DRING_MODE) && 7737 (pkt->ack_freq > 0)) { 7738 D2(NULL, "vsw_check_attr: non-zero ack freq " 7739 "in SHM mode\n"); 7740 ret = 1; 7741 } 7742 7743 /* 7744 * Note: for the moment we only support ETHER 7745 * frames. This may change in the future. 7746 */ 7747 if ((pkt->mtu > VSW_MTU) || (pkt->mtu == 0)) { 7748 D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n", 7749 pkt->mtu); 7750 ret = 1; 7751 } 7752 7753 D1(NULL, "vsw_check_attr exit\n"); 7754 7755 return (ret); 7756 } 7757 7758 /* 7759 * Returns 1 if there is a problem, 0 otherwise. 7760 */ 7761 static int 7762 vsw_check_dring_info(vio_dring_reg_msg_t *pkt) 7763 { 7766 int ret = 0; 7767 7768 D1(NULL, "vsw_check_dring_info enter\n"); 7769 7770 if ((pkt->num_descriptors == 0) || 7771 (pkt->descriptor_size == 0) || 7772 (pkt->ncookies != 1)) { 7773 DERR(NULL, "vsw_check_dring_info: invalid dring msg"); 7774 ret = 1; 7775 } 7776 7777 D1(NULL, "vsw_check_dring_info exit\n"); 7778 7779 return (ret); 7780 } 7781 7782 /* 7783 * Returns 1 if two memory cookies match. Otherwise returns 0. 7784 */ 7785 static int 7786 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2) 7787 { 7788 if ((m1->addr != m2->addr) || 7789 (m1->size != m2->size)) { 7790 return (0); 7791 } else { 7792 return (1); 7793 } 7794 } 7795 7796 /* 7797 * Returns 1 if ring described in reg message matches that 7798 * described by dring_info structure. Otherwise returns 0. 7799 */ 7800 static int 7801 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg) 7802 { 7803 if ((msg->descriptor_size != dp->descriptor_size) || 7804 (msg->num_descriptors != dp->num_descriptors) || 7805 (msg->ncookies != dp->ncookies) || 7806 !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) { 7807 return (0); 7808 } else { 7809 return (1); 7810 } 7811 7812 } 7813 7814 static caddr_t 7815 vsw_print_ethaddr(uint8_t *a, char *ebuf) 7816 { 7817 (void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x", 7818 a[0], a[1], a[2], a[3], a[4], a[5]); 7819 return (ebuf); 7820 } 7821 7822 /* 7823 * Reset and free all the resources associated with 7824 * the channel.
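 *
 * Note the asymmetry in the dring teardown below (illustrative):
 *
 *	INBOUND:	rings were imported from the peer (presumably
 *			via ldc_mem_dring_map() during handshake
 *			processing), so each is released with
 *			ldc_mem_dring_unmap(dp->handle);
 *	OUTBOUND:	rings were created and bound locally, so
 *			vsw_free_ring() must unbind each descriptor's
 *			memory handle and then
 *			ldc_mem_dring_unbind(dp->handle) and
 *			ldc_mem_dring_destroy(dp->handle) the ring.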
7825 */ 7826 static void 7827 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir) 7828 { 7829 dring_info_t *dp, *dpp; 7830 lane_t *lp = NULL; 7831 int rv = 0; 7832 7833 ASSERT(ldcp != NULL); 7834 7835 D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id); 7836 7837 if (dir == INBOUND) { 7838 D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane" 7839 " of channel %lld", __func__, ldcp->ldc_id); 7840 lp = &ldcp->lane_in; 7841 } else { 7842 D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane" 7843 " of channel %lld", __func__, ldcp->ldc_id); 7844 lp = &ldcp->lane_out; 7845 } 7846 7847 lp->lstate = VSW_LANE_INACTIV; 7848 mutex_enter(&lp->seq_lock); 7849 lp->seq_num = VNET_ISS; 7850 mutex_exit(&lp->seq_lock); 7851 if (lp->dringp) { 7852 if (dir == INBOUND) { 7853 dp = lp->dringp; 7854 while (dp != NULL) { 7855 dpp = dp->next; 7856 if (dp->handle != NULL) 7857 (void) ldc_mem_dring_unmap(dp->handle); 7858 kmem_free(dp, sizeof (dring_info_t)); 7859 dp = dpp; 7860 } 7861 } else { 7862 /* 7863 * unbind, destroy exported dring, free dring struct 7864 */ 7865 dp = lp->dringp; 7866 rv = vsw_free_ring(dp); 7867 } 7868 if (rv == 0) { 7869 lp->dringp = NULL; 7870 } 7871 } 7872 7873 D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id); 7874 } 7875 7876 /* 7877 * Free ring and all associated resources. 7878 */ 7879 static int 7880 vsw_free_ring(dring_info_t *dp) 7881 { 7882 vsw_private_desc_t *paddr = NULL; 7883 dring_info_t *dpp; 7884 int i, rv = 1; 7885 7886 while (dp != NULL) { 7887 mutex_enter(&dp->dlock); 7888 dpp = dp->next; 7889 if (dp->priv_addr != NULL) { 7890 /* 7891 * First unbind and free the memory handles 7892 * stored in each descriptor within the ring. 7893 */ 7894 for (i = 0; i < VSW_RING_NUM_EL; i++) { 7895 paddr = (vsw_private_desc_t *) 7896 dp->priv_addr + i; 7897 if (paddr->memhandle != NULL) { 7898 if (paddr->bound == 1) { 7899 rv = ldc_mem_unbind_handle( 7900 paddr->memhandle); 7901 7902 if (rv != 0) { 7903 DERR(NULL, "error " 7904 "unbinding handle for " 7905 "ring 0x%llx at pos %d", 7906 dp, i); 7907 mutex_exit(&dp->dlock); 7908 return (rv); 7909 } 7910 paddr->bound = 0; 7911 } 7912 7913 rv = ldc_mem_free_handle( 7914 paddr->memhandle); 7915 if (rv != 0) { 7916 DERR(NULL, "error freeing " 7917 "handle for ring " 7918 "0x%llx at pos %d", 7919 dp, i); 7920 mutex_exit(&dp->dlock); 7921 return (rv); 7922 } 7923 paddr->memhandle = NULL; 7924 } 7925 mutex_destroy(&paddr->dstate_lock); 7926 } 7927 kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t) 7928 * VSW_RING_NUM_EL)); 7929 } 7930 7931 /* 7932 * Now unbind and destroy the ring itself. 
7933 */ 7934 if (dp->handle != NULL) { 7935 (void) ldc_mem_dring_unbind(dp->handle); 7936 (void) ldc_mem_dring_destroy(dp->handle); 7937 } 7938 7939 if (dp->data_addr != NULL) { 7940 kmem_free(dp->data_addr, dp->data_sz); 7941 } 7942 7943 mutex_exit(&dp->dlock); 7944 mutex_destroy(&dp->dlock); 7945 mutex_destroy(&dp->restart_lock); 7946 kmem_free(dp, sizeof (dring_info_t)); 7947 7948 dp = dpp; 7949 } 7950 return (0); 7951 } 7952 7953 /* 7954 * Debugging routines 7955 */ 7956 static void 7957 display_state(void) 7958 { 7959 vsw_t *vswp; 7960 vsw_port_list_t *plist; 7961 vsw_port_t *port; 7962 vsw_ldc_list_t *ldcl; 7963 vsw_ldc_t *ldcp; 7964 7965 cmn_err(CE_NOTE, "***** system state *****"); 7966 7967 for (vswp = vsw_head; vswp; vswp = vswp->next) { 7968 plist = &vswp->plist; 7969 READ_ENTER(&plist->lockrw); 7970 cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n", 7971 vswp->instance, plist->num_ports); 7972 7973 for (port = plist->head; port != NULL; port = port->p_next) { 7974 ldcl = &port->p_ldclist; 7975 cmn_err(CE_CONT, "port %d : %d ldcs attached\n", 7976 port->p_instance, ldcl->num_ldcs); 7977 READ_ENTER(&ldcl->lockrw); 7978 ldcp = ldcl->head; 7979 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 7980 cmn_err(CE_CONT, "chan %lu : dev %d : " 7981 "status %d : phase %u\n", 7982 ldcp->ldc_id, ldcp->dev_class, 7983 ldcp->ldc_status, ldcp->hphase); 7984 cmn_err(CE_CONT, "chan %lu : lsession %lu : " 7985 "psession %lu\n", 7986 ldcp->ldc_id, 7987 ldcp->local_session, 7988 ldcp->peer_session); 7989 7990 cmn_err(CE_CONT, "Inbound lane:\n"); 7991 display_lane(&ldcp->lane_in); 7992 cmn_err(CE_CONT, "Outbound lane:\n"); 7993 display_lane(&ldcp->lane_out); 7994 } 7995 RW_EXIT(&ldcl->lockrw); 7996 } 7997 RW_EXIT(&plist->lockrw); 7998 } 7999 cmn_err(CE_NOTE, "***** system state *****"); 8000 } 8001 8002 static void 8003 display_lane(lane_t *lp) 8004 { 8005 dring_info_t *drp; 8006 8007 cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n", 8008 lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu); 8009 cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n", 8010 lp->addr_type, lp->addr, lp->xfer_mode); 8011 cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp); 8012 8013 cmn_err(CE_CONT, "Dring info:\n"); 8014 for (drp = lp->dringp; drp != NULL; drp = drp->next) { 8015 cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n", 8016 drp->num_descriptors, drp->descriptor_size); 8017 cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle); 8018 cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n", 8019 (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr); 8020 cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n", 8021 drp->ident, drp->end_idx); 8022 display_ring(drp); 8023 } 8024 } 8025 8026 static void 8027 display_ring(dring_info_t *dringp) 8028 { 8029 uint64_t i; 8030 uint64_t priv_count = 0; 8031 uint64_t pub_count = 0; 8032 vnet_public_desc_t *pub_addr = NULL; 8033 vsw_private_desc_t *priv_addr = NULL; 8034 8035 for (i = 0; i < VSW_RING_NUM_EL; i++) { 8036 if (dringp->pub_addr != NULL) { 8037 pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i; 8038 8039 if (pub_addr->hdr.dstate == VIO_DESC_FREE) 8040 pub_count++; 8041 } 8042 8043 if (dringp->priv_addr != NULL) { 8044 priv_addr = 8045 (vsw_private_desc_t *)dringp->priv_addr + i; 8046 8047 if (priv_addr->dstate == VIO_DESC_FREE) 8048 priv_count++; 8049 } 8050 } 8051 cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n", 8052 i, priv_count, pub_count); 8053 } 8054 8055 static void 8056 dump_flags(uint64_t state) 8057 { 8058 int i; 8059 8060 
typedef struct flag_name { 8061 int flag_val; 8062 char *flag_name; 8063 } flag_name_t; 8064 8065 flag_name_t flags[] = { 8066 { VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT" }, 8067 { VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV" }, 8068 { VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV" }, 8069 { VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT" }, 8070 { VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV" }, 8071 { VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT" }, 8072 { VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT" }, 8073 { VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV" }, 8074 { VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT" }, 8075 { VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV" }, 8076 { VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT" }, 8077 { VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV" }, 8078 { VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT" }, 8079 { VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV" }, 8080 { VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT" }, 8081 { VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV" }, 8082 { VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT" }, 8083 { VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV" }, 8084 { VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT" }, 8085 { VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV" }, 8086 { VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT" }, 8087 { VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV" }, 8088 { VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT" }, 8089 { VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV" }, 8090 { VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT" }, 8091 { VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV" }, 8092 { VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT" }, 8093 { VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV" }, 8094 { VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT" }, 8095 { VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV" }, 8096 { VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE" } }; 8097 8098 DERR(NULL, "DUMP_FLAGS: %llx\n", state); 8099 for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) { 8100 if (state & flags[i].flag_val) 8101 DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name); 8102 } 8103 } 8104