/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strlog.h>
#include <sys/strsubr.h>
#include <sys/cmn_err.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/kstat.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <net/if.h>
#include <sys/varargs.h>
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/mac.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
#include <sys/vsw.h>
#include <sys/vio_mailbox.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/vio_util.h>
#include <sys/sdt.h>

/*
 * Function prototypes.
 */
static int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
static int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
static int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static void vsw_get_md_properties(vsw_t *vswp);
static int vsw_get_physaddr(vsw_t *);
static int vsw_setup_layer2(vsw_t *);
static int vsw_setup_layer3(vsw_t *);

/* MAC Ring table functions. */
static void vsw_mac_ring_tbl_init(vsw_t *vswp);
static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
static void vsw_queue_worker(vsw_mac_ring_t *rrp);
static void vsw_queue_stop(vsw_queue_t *vqp);
static vsw_queue_t *vsw_queue_create(void);
static void vsw_queue_destroy(vsw_queue_t *vqp);

/* MAC layer routines */
static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
    mac_resource_t *mrp);
static int vsw_get_hw_maddr(vsw_t *);
static int vsw_set_hw(vsw_t *, vsw_port_t *);
static int vsw_set_hw_promisc(vsw_t *, vsw_port_t *);
static int vsw_unset_hw(vsw_t *, vsw_port_t *);
static int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *);
static int vsw_reconfig_hw(vsw_t *);
static int vsw_mac_attach(vsw_t *vswp);
static void vsw_mac_detach(vsw_t *vswp);

static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
static int vsw_mac_register(vsw_t *);
static int vsw_mac_unregister(vsw_t *);
static int vsw_m_stat(void *, uint_t, uint64_t *);
static void vsw_m_stop(void *arg);
static int vsw_m_start(void *arg);
static int vsw_m_unicst(void *arg, const uint8_t *);
static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
static int vsw_m_promisc(void *arg, boolean_t);
static mblk_t *vsw_m_tx(void *arg, mblk_t *);

/* MDEG routines */
static void vsw_mdeg_register(vsw_t *vswp);
static void vsw_mdeg_unregister(vsw_t *vswp);
static int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);

/* Port add/deletion routines */
static int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
static int vsw_port_attach(vsw_t *vswp, int p_instance,
    uint64_t *ldcids, int nids, struct ether_addr *macaddr);
static int vsw_detach_ports(vsw_t *vswp);
static int vsw_port_detach(vsw_t *vswp, int p_instance);
static int vsw_port_delete(vsw_port_t *port);
static int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
static int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
static int vsw_init_ldcs(vsw_port_t *port);
static int vsw_uninit_ldcs(vsw_port_t *port);
static int vsw_ldc_init(vsw_ldc_t *ldcp);
static int vsw_ldc_uninit(vsw_ldc_t *ldcp);
static int vsw_drain_ldcs(vsw_port_t *port);
static int vsw_drain_port_taskq(vsw_port_t *port);
static void vsw_marker_task(void *);
static vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
static int vsw_plist_del_node(vsw_t *, vsw_port_t *port);

/* Interrupt routines */
static uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);

/* Handshake routines */
static void vsw_restart_ldc(vsw_ldc_t *);
static void vsw_restart_handshake(vsw_ldc_t *);
static void vsw_handle_reset(vsw_ldc_t *);
static int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
static void vsw_next_milestone(vsw_ldc_t *);
static int vsw_supported_version(vio_ver_msg_t *);

/* Data processing routines */
static void vsw_process_pkt(void *);
static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_ctrl_pkt(void *);
static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);

/* Switching/data transmit routines */
static void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port, mac_resource_handle_t);
static void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port, mac_resource_handle_t);
static int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port);
static int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port);
static int vsw_portsend(vsw_port_t *, mblk_t *);
static int vsw_dringsend(vsw_ldc_t *, mblk_t *);
static int vsw_descrsend(vsw_ldc_t *, mblk_t *);

/* Packet creation routines */
static void vsw_send_ver(void *);
static void vsw_send_attr(vsw_ldc_t *);
static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
static void vsw_send_dring_info(vsw_ldc_t *);
static void vsw_send_rdx(vsw_ldc_t *);

static void vsw_send_msg(vsw_ldc_t *, void *, int);

/* Forwarding database (FDB) routines */
static int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
static int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
static vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
static int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
static int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
static int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
static void vsw_del_addr(uint8_t, void *, uint64_t);
static void vsw_del_mcst_port(vsw_port_t *);
static void vsw_del_mcst_vsw(vsw_t *);

/* Dring routines */
static dring_info_t *vsw_create_dring(vsw_ldc_t *);
static void vsw_create_privring(vsw_ldc_t *);
static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
    int *);
static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);

static void vsw_set_lane_attr(vsw_t *, lane_t *);
static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
static int vsw_check_dring_info(vio_dring_reg_msg_t *);

/* Misc support routines */
static caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
static int vsw_free_ring(dring_info_t *);

/* Debugging routines */
static void dump_flags(uint64_t);
static void display_state(void);
static void display_lane(lane_t *);
static void display_ring(dring_info_t *);

int vsw_num_handshakes = 3;    /* # of handshake attempts */
int vsw_wretries = 100;        /* # of write attempts */
int vsw_chain_len = 150;       /* max # of mblks in msg chain */
int vsw_desc_delay = 0;        /* delay in us */
int vsw_read_attempts = 5;     /* # of reads of descriptor */

uint32_t vsw_mblk_size = VSW_MBLK_SIZE;
uint32_t vsw_num_mblks = VSW_NUM_MBLKS;
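
/*
 * Note: like vsw_multi_ring_enable below, the globals above are
 * presumably left non-static so that they can be tuned from
 * /etc/system without rebuilding the driver.
 */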

/*
 * Mode specific frame switching function.
 */
void (*vsw_switch_frame)(vsw_t *, mblk_t *, int, vsw_port_t *,
    mac_resource_handle_t);

static mac_callbacks_t vsw_m_callbacks = {
    0,
    vsw_m_stat,
    vsw_m_start,
    vsw_m_stop,
    vsw_m_promisc,
    vsw_m_multicst,
    vsw_m_unicst,
    vsw_m_tx,
    NULL,
    NULL,
    NULL
};

static struct cb_ops vsw_cb_ops = {
    nulldev,            /* cb_open */
    nulldev,            /* cb_close */
    nodev,              /* cb_strategy */
    nodev,              /* cb_print */
    nodev,              /* cb_dump */
    nodev,              /* cb_read */
    nodev,              /* cb_write */
    nodev,              /* cb_ioctl */
    nodev,              /* cb_devmap */
    nodev,              /* cb_mmap */
    nodev,              /* cb_segmap */
    nochpoll,           /* cb_chpoll */
    ddi_prop_op,        /* cb_prop_op */
    NULL,               /* cb_stream */
    D_MP,               /* cb_flag */
    CB_REV,             /* rev */
    nodev,              /* int (*cb_aread)() */
    nodev               /* int (*cb_awrite)() */
};

static struct dev_ops vsw_ops = {
    DEVO_REV,           /* devo_rev */
    0,                  /* devo_refcnt */
    vsw_getinfo,        /* devo_getinfo */
    nulldev,            /* devo_identify */
    nulldev,            /* devo_probe */
    vsw_attach,         /* devo_attach */
    vsw_detach,         /* devo_detach */
    nodev,              /* devo_reset */
    &vsw_cb_ops,        /* devo_cb_ops */
    (struct bus_ops *)NULL, /* devo_bus_ops */
    ddi_power           /* devo_power */
};

extern struct mod_ops mod_driverops;
static struct modldrv vswmodldrv = {
    &mod_driverops,
    "sun4v Virtual Switch Driver %I%",
    &vsw_ops,
};
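
/*
 * Per-channel lock ordering: LDC_ENTER_LOCK takes the callback lock
 * (ldc_cblock) before the transmit lock (ldc_txlock), and
 * LDC_EXIT_LOCK releases the two in the reverse order.
 */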
#define LDC_ENTER_LOCK(ldcp) \
    mutex_enter(&((ldcp)->ldc_cblock));\
    mutex_enter(&((ldcp)->ldc_txlock));
#define LDC_EXIT_LOCK(ldcp) \
    mutex_exit(&((ldcp)->ldc_txlock));\
    mutex_exit(&((ldcp)->ldc_cblock));

/* Driver soft state ptr */
static void *vsw_state;

/*
 * Linked list of "vsw_t" structures - one per instance.
 */
vsw_t *vsw_head = NULL;
krwlock_t vsw_rw;

/*
 * Property names
 */
static char vdev_propname[] = "virtual-device";
static char vsw_propname[] = "virtual-network-switch";
static char physdev_propname[] = "vsw-phys-dev";
static char smode_propname[] = "vsw-switch-mode";
static char macaddr_propname[] = "local-mac-address";
static char remaddr_propname[] = "remote-mac-address";
static char ldcids_propname[] = "ldc-ids";
static char chan_propname[] = "channel-endpoint";
static char id_propname[] = "id";
static char reg_propname[] = "reg";

/* supported versions */
static ver_sup_t vsw_versions[] = { {1, 0} };

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t vport_prop_match[] = {
    { MDET_PROP_VAL, "id" },
    { MDET_LIST_END, NULL }
};

static mdeg_node_match_t vport_match = { "virtual-device-port",
    vport_prop_match };

/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vsw instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t vsw_prop_template[] = {
    { MDET_PROP_STR, "name", vsw_propname },
    { MDET_PROP_VAL, "cfg-handle", NULL },
    { MDET_LIST_END, NULL, NULL }
};

#define VSW_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val);

/*
 * Enable/disable thread per ring from /etc/system. This is a mode
 * selection that is done at vsw driver attach time.
 */
boolean_t vsw_multi_ring_enable = B_FALSE;
int vsw_mac_rx_rings = VSW_MAC_RX_RINGS;

/*
 * Print debug messages - set to 0x1f to enable all msgs
 * or 0x0 to turn all off.
 */
int vswdbg = 0x0;

/*
 * debug levels:
 * 0x01: Function entry/exit tracing
 * 0x02: Internal function messages
 * 0x04: Verbose internal messages
 * 0x08: Warning messages
 * 0x10: Error messages
 */

static void
vswdebug(vsw_t *vswp, const char *fmt, ...)
{
    char buf[512];
    va_list ap;

    va_start(ap, fmt);
    (void) vsprintf(buf, fmt, ap);
    va_end(ap);

    if (vswp == NULL)
        cmn_err(CE_CONT, "%s\n", buf);
    else
        cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
}

/*
 * For the moment the state dump routines have their own
 * private flag.
 */
#define DUMP_STATE 0

#if DUMP_STATE

#define DUMP_TAG(tag) \
{ \
    D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
    D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype); \
    D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env); \
}

#define DUMP_TAG_PTR(tag) \
{ \
    D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
    D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype); \
    D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env); \
}

#define DUMP_FLAGS(flags) dump_flags(flags);
#define DISPLAY_STATE() display_state()

#else

#define DUMP_TAG(tag)
#define DUMP_TAG_PTR(tag)
#define DUMP_FLAGS(state)
#define DISPLAY_STATE()

#endif /* DUMP_STATE */

#ifdef DEBUG

#define D1 \
    if (vswdbg & 0x01) \
        vswdebug

#define D2 \
    if (vswdbg & 0x02) \
        vswdebug

#define D3 \
    if (vswdbg & 0x04) \
        vswdebug

#define DWARN \
    if (vswdbg & 0x08) \
        vswdebug

#define DERR \
    if (vswdbg & 0x10) \
        vswdebug

#else

#define DERR    if (0) vswdebug
#define DWARN   if (0) vswdebug
#define D1      if (0) vswdebug
#define D2      if (0) vswdebug
#define D3      if (0) vswdebug

#endif /* DEBUG */

static struct modlinkage modlinkage = {
    MODREV_1,
    &vswmodldrv,
    NULL
};

int
_init(void)
{
    int status;

    rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);

    status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
    if (status != 0) {
        return (status);
    }

    mac_init_ops(&vsw_ops, "vsw");
    status = mod_install(&modlinkage);
    if (status != 0) {
        ddi_soft_state_fini(&vsw_state);
    }
    return (status);
}

int
_fini(void)
{
    int status;

    status = mod_remove(&modlinkage);
    if (status != 0)
        return (status);
    mac_fini_ops(&vsw_ops);
    ddi_soft_state_fini(&vsw_state);

    rw_destroy(&vsw_rw);

    return (status);
}

int
_info(struct modinfo *modinfop)
{
    return (mod_info(&modlinkage, modinfop));
}
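
/*
 * vsw_attach() records each completed setup step in the 'progress'
 * bitmask declared below; on any failure, the vsw_attach_fail path
 * consults the mask and undoes only the steps that actually completed.
 */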
static int
vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
    vsw_t *vswp;
    int instance, i;
    char hashname[MAXNAMELEN];
    char qname[TASKQ_NAMELEN];
    int rv = 1;
    enum {
        PROG_init = 0x00,
        PROG_if_lock = 0x01,
        PROG_fdb = 0x02,
        PROG_mfdb = 0x04,
        PROG_report_dev = 0x08,
        PROG_plist = 0x10,
        PROG_taskq = 0x20
    } progress;

    progress = PROG_init;

    switch (cmd) {
    case DDI_ATTACH:
        break;
    case DDI_RESUME:
        /* nothing to do for this non-device */
        return (DDI_SUCCESS);
    case DDI_PM_RESUME:
    default:
        return (DDI_FAILURE);
    }

    instance = ddi_get_instance(dip);
    if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
        DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
        return (DDI_FAILURE);
    }
    vswp = ddi_get_soft_state(vsw_state, instance);

    if (vswp == NULL) {
        DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
        goto vsw_attach_fail;
    }

    vswp->dip = dip;
    vswp->instance = instance;
    ddi_set_driver_private(dip, (caddr_t)vswp);

    rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
    progress |= PROG_if_lock;

    /*
     * Get the various properties such as physical device name
     * (vsw-phys-dev), switch mode etc from the MD.
     */
    vsw_get_md_properties(vswp);

    /* setup the unicast forwarding database */
    (void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
        vswp->instance);
    D2(vswp, "creating unicast hash table (%s)...", hashname);
    vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
        mod_hash_null_valdtor, sizeof (void *));

    progress |= PROG_fdb;

    /* setup the multicast forwarding database */
    (void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
        vswp->instance);
    D2(vswp, "creating multicast hash table (%s)...", hashname);
    rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
    vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
        mod_hash_null_valdtor, sizeof (void *));

    progress |= PROG_mfdb;

    /*
     * create lock protecting list of multicast addresses
     * which could come via m_multicst() entry point when plumbed.
     */
    mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
    vswp->mcap = NULL;

    ddi_report_dev(vswp->dip);

    progress |= PROG_report_dev;

    WRITE_ENTER(&vsw_rw);
    vswp->next = vsw_head;
    vsw_head = vswp;
    RW_EXIT(&vsw_rw);

    /* setup the port list */
    rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
    vswp->plist.head = NULL;

    progress |= PROG_plist;

    /*
     * Create the taskq which will process all the VIO
     * control messages.
     */
    (void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
    if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
        TASKQ_DEFAULTPRI, 0)) == NULL) {
        cmn_err(CE_WARN, "Unable to create task queue");
        goto vsw_attach_fail;
    }

    progress |= PROG_taskq;

    /* select best switching mode */
    for (i = 0; i < vswp->smode_num; i++) {
        vswp->smode_idx = i;
        switch (vswp->smode[i]) {
        case VSW_LAYER2:
        case VSW_LAYER2_PROMISC:
            rv = vsw_setup_layer2(vswp);
            break;

        case VSW_LAYER3:
            rv = vsw_setup_layer3(vswp);
            break;

        default:
            DERR(vswp, "unknown switch mode");
            rv = 1;
            break;
        }

        if (rv == 0)
            break;
    }

    if (rv == 1) {
        cmn_err(CE_WARN, "Unable to setup switching mode");
        goto vsw_attach_fail;
    }

    D2(vswp, "Operating in mode %d", vswp->smode[vswp->smode_idx]);

    /*
     * Register with the MAC layer as a network device so
     * we can be plumbed if desired.
     *
     * Do this in both layer 2 and layer 3 mode.
     */
    vswp->if_state &= ~VSW_IF_UP;
    if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
        if (vsw_mac_register(vswp) != 0) {
            cmn_err(CE_WARN, "Unable to register as provider "
                "with MAC layer, continuing with attach");
        }
    }

    /* prevent auto-detaching */
    if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
        DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
        cmn_err(CE_NOTE, "Unable to set \"%s\" property for "
            "instance %u", DDI_NO_AUTODETACH, instance);
    }

    /*
     * Now we have everything setup, register for MD change
     * events.
     */
    vsw_mdeg_register(vswp);

    return (DDI_SUCCESS);

vsw_attach_fail:
    DERR(NULL, "vsw_attach: failed");

    if (progress & PROG_taskq)
        ddi_taskq_destroy(vswp->taskq_p);

    if (progress & PROG_plist)
        rw_destroy(&vswp->plist.lockrw);

    if (progress & PROG_report_dev) {
        ddi_remove_minor_node(dip, NULL);
        mutex_destroy(&vswp->mca_lock);
    }

    if (progress & PROG_mfdb) {
        mod_hash_destroy_hash(vswp->mfdb);
        vswp->mfdb = NULL;
        rw_destroy(&vswp->mfdbrw);
    }

    if (progress & PROG_fdb) {
        mod_hash_destroy_hash(vswp->fdb);
        vswp->fdb = NULL;
    }

    if (progress & PROG_if_lock)
        rw_destroy(&vswp->if_lockrw);

    ddi_soft_state_free(vsw_state, instance);
    return (DDI_FAILURE);
}
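
/*
 * Detach essentially unwinds attach in reverse: unregister from the
 * MAC layer and the MDEG, detach the ports, stop and close the
 * physical device, free the mblk pools, and finally destroy the
 * taskq and the forwarding hash tables.
 */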
static int
vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
    vio_mblk_pool_t *poolp, *npoolp;
    vsw_t **vswpp, *vswp;
    int instance;

    instance = ddi_get_instance(dip);
    vswp = ddi_get_soft_state(vsw_state, instance);

    if (vswp == NULL) {
        return (DDI_FAILURE);
    }

    switch (cmd) {
    case DDI_DETACH:
        break;
    case DDI_SUSPEND:
    case DDI_PM_SUSPEND:
    default:
        return (DDI_FAILURE);
    }

    D2(vswp, "detaching instance %d", instance);

    if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
        if (vsw_mac_unregister(vswp) != 0) {
            cmn_err(CE_WARN, "Unable to detach from MAC layer");
            return (DDI_FAILURE);
        }
        rw_destroy(&vswp->if_lockrw);
    }

    vsw_mdeg_unregister(vswp);

    /* remove mac layer callback */
    if ((vswp->mh != NULL) && (vswp->mrh != NULL)) {
        mac_rx_remove(vswp->mh, vswp->mrh);
        vswp->mrh = NULL;
    }

    if (vsw_detach_ports(vswp) != 0) {
        cmn_err(CE_WARN, "Unable to detach ports");
        return (DDI_FAILURE);
    }

    /*
     * Now that the ports have been deleted, stop and close
     * the physical device.
     */
    if (vswp->mh != NULL) {
        if (vswp->mstarted)
            mac_stop(vswp->mh);
        if (vswp->mresources)
            mac_resource_set(vswp->mh, NULL, NULL);
        mac_close(vswp->mh);

        vswp->mh = NULL;
        vswp->txinfo = NULL;
    }

    /*
     * Destroy any free pools that may still exist.
     */
    poolp = vswp->rxh;
    while (poolp != NULL) {
        npoolp = vswp->rxh = poolp->nextp;
        if (vio_destroy_mblks(poolp) != 0) {
            vswp->rxh = poolp;
            return (DDI_FAILURE);
        }
        poolp = npoolp;
    }

    /*
     * Remove this instance from any entries it may be on in
     * the hash table by using the list of addresses maintained
     * in the vsw_t structure.
     */
    vsw_del_mcst_vsw(vswp);

    vswp->mcap = NULL;
    mutex_destroy(&vswp->mca_lock);

    /*
     * By now any pending tasks have finished and the underlying
     * ldc's have been destroyed, so it's safe to delete the control
     * message taskq.
     */
    if (vswp->taskq_p != NULL)
        ddi_taskq_destroy(vswp->taskq_p);

    /*
     * At this stage all the data pointers in the hash table
     * should be NULL, as all the ports have been removed and will
     * have deleted themselves from the port lists which the data
     * pointers point to. Hence we can destroy the table using the
     * default destructors.
     */
    D2(vswp, "vsw_detach: destroying hash tables..");
    mod_hash_destroy_hash(vswp->fdb);
    vswp->fdb = NULL;

    WRITE_ENTER(&vswp->mfdbrw);
    mod_hash_destroy_hash(vswp->mfdb);
    vswp->mfdb = NULL;
    RW_EXIT(&vswp->mfdbrw);
    rw_destroy(&vswp->mfdbrw);

    ddi_remove_minor_node(dip, NULL);

    rw_destroy(&vswp->plist.lockrw);
    WRITE_ENTER(&vsw_rw);
    for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
        if (*vswpp == vswp) {
            *vswpp = vswp->next;
            break;
        }
    }
    RW_EXIT(&vsw_rw);
    ddi_soft_state_free(vsw_state, instance);

    return (DDI_SUCCESS);
}

static int
vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
    _NOTE(ARGUNUSED(dip))

    vsw_t *vswp = NULL;
    dev_t dev = (dev_t)arg;
    int instance;

    instance = getminor(dev);

    switch (infocmd) {
    case DDI_INFO_DEVT2DEVINFO:
        if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
            *result = NULL;
            return (DDI_FAILURE);
        }
        *result = vswp->dip;
        return (DDI_SUCCESS);

    case DDI_INFO_DEVT2INSTANCE:
        *result = (void *)(uintptr_t)instance;
        return (DDI_SUCCESS);

    default:
        *result = NULL;
        return (DDI_FAILURE);
    }
}
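
/*
 * The MD properties of interest are the physical device name
 * (vsw-phys-dev), the local MAC address and the switch-mode list;
 * each value successfully read is flagged in vswp->mdprops so that
 * later code can tell which of them are valid.
 */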
/*
 * Get the properties from our MD node.
 */
static void
vsw_get_md_properties(vsw_t *vswp)
{
    md_t *mdp = NULL;
    int num_nodes = 0;
    int len = 0, listsz = 0;
    int num_vdev = 0;
    int i, idx;
    boolean_t found_node = B_FALSE;
    char *smode = NULL;
    char *curr_mode = NULL;
    char *physname = NULL;
    char *node_name = NULL;
    char *dev;
    uint64_t macaddr = 0;
    uint64_t md_inst, obp_inst;
    mde_cookie_t *listp = NULL;
    mde_cookie_t rootnode;

    D1(vswp, "%s: enter", __func__);

    /*
     * Further down we compare the obp 'reg' property to the
     * 'cfg-handle' property in the vsw MD node to determine
     * if the node refers to this particular instance. So if
     * we can't read the obp value then there is no point
     * in proceeding further.
     */
    if (ddi_prop_exists(DDI_DEV_T_ANY, vswp->dip,
        DDI_PROP_DONTPASS, reg_propname) != 1) {
        cmn_err(CE_WARN, "Unable to read %s property "
            "from OBP device node", reg_propname);
        return;
    }

    obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
        DDI_PROP_DONTPASS, reg_propname, 0);

    D2(vswp, "%s: obp_inst 0x%llx", __func__, obp_inst);

    if ((mdp = md_get_handle()) == NULL) {
        DERR(vswp, "%s: unable to init MD", __func__);
        return;
    }

    if ((num_nodes = md_node_count(mdp)) <= 0) {
        DERR(vswp, "%s: invalid number of nodes found %d",
            __func__, num_nodes);
        (void) md_fini_handle(mdp);
        return;
    }

    D2(vswp, "%s: %d nodes in total in MD", __func__, num_nodes);

    /* allocate enough space for node list */
    listsz = num_nodes * sizeof (mde_cookie_t);
    listp = kmem_zalloc(listsz, KM_SLEEP);

    rootnode = md_root_node(mdp);

    /* Get the list of virtual devices */
    num_vdev = md_scan_dag(mdp, rootnode,
        md_find_name(mdp, vdev_propname),
        md_find_name(mdp, "fwd"), listp);

    if (num_vdev <= 0) {
        DERR(vswp, "%s: didn't find any virtual-device nodes in MD",
            __func__);
        goto md_prop_exit;
    }

    D2(vswp, "%s: %d virtual-device nodes found", __func__, num_vdev);

    /* Look for the virtual switch nodes in the list */
    for (idx = 0; idx < num_vdev; idx++) {
        if (md_get_prop_str(mdp, listp[idx],
            "name", &node_name) != 0) {
            DERR(vswp, "%s: unable to get node name", __func__);
            continue;
        }

        if (strcmp(node_name, vsw_propname) == 0) {
            /* Virtual switch node */
            if (md_get_prop_val(mdp, listp[idx],
                "cfg-handle", &md_inst) != 0) {
                DERR(vswp, "%s: unable to get cfg-handle from"
                    " node %d", __func__, idx);
                goto md_prop_exit;
            } else if (md_inst == obp_inst) {
                D2(vswp, "%s: found matching node (%d)"
                    " 0x%llx == 0x%llx", __func__, idx,
                    md_inst, obp_inst);
                found_node = B_TRUE;
                break;
            }
        }
    }

    if (!found_node) {
        DWARN(vswp, "%s: couldn't find correct vsw node", __func__);
        goto md_prop_exit;
    }

    /*
     * Now, having found the correct node, get the various properties.
     */

    if (md_get_prop_data(mdp, listp[idx], physdev_propname,
        (uint8_t **)(&physname), &len) != 0) {
        cmn_err(CE_WARN, "%s: unable to get name(s) of physical "
            "device(s) from MD", __func__);
    } else if ((strlen(physname) + 1) > LIFNAMSIZ) {
        cmn_err(CE_WARN, "%s is too long a device name", physname);
    } else {
        (void) strncpy(vswp->physname, physname, strlen(physname) + 1);
        vswp->mdprops |= VSW_MD_PHYSNAME;
        D2(vswp, "%s: using first device specified (%s)",
            __func__, vswp->physname);
    }

#ifdef DEBUG
    /*
     * As a temporary measure to aid testing we check to see if there
     * is a vsw.conf file present. If there is we use the value of the
     * vsw_physname property in the file as the name of the physical
     * device, overriding the value from the MD.
     *
     * There may be multiple devices listed, but for the moment
     * we just use the first one.
     */
    if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
        "vsw_physname", &dev) == DDI_PROP_SUCCESS) {
        if ((strlen(dev) + 1) > LIFNAMSIZ) {
            cmn_err(CE_WARN, "%s is too long a device name", dev);
        } else {
            cmn_err(CE_NOTE, "%s: using device name (%s) from "
                "config file", __func__, dev);

            (void) strncpy(vswp->physname, dev, strlen(dev) + 1);
            vswp->mdprops |= VSW_MD_PHYSNAME;
        }

        ddi_prop_free(dev);
    }
#endif

    /* mac address for vswitch device itself */
    if (md_get_prop_val(mdp, listp[idx],
        macaddr_propname, &macaddr) != 0) {
        cmn_err(CE_WARN, "!Unable to get MAC address from MD");

        /*
         * Fall back to using the mac address of the physical
         * device.
         */
        if (vsw_get_physaddr(vswp) == 0) {
            cmn_err(CE_NOTE, "!Using MAC address from physical "
                "device (%s)", vswp->physname);
        }
    } else {
        READ_ENTER(&vswp->if_lockrw);
        for (i = ETHERADDRL - 1; i >= 0; i--) {
            vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
            macaddr >>= 8;
        }
        RW_EXIT(&vswp->if_lockrw);
        vswp->mdprops |= VSW_MD_MACADDR;
    }

    /*
     * Get the switch-mode property. The modes are listed in
     * decreasing order of preference, i.e. the preferred mode is
     * the first item in the list.
     */
    len = 0;
    vswp->smode_num = 0;
    if (md_get_prop_data(mdp, listp[idx], smode_propname,
        (uint8_t **)(&smode), &len) != 0) {
        /*
         * Unable to get switch-mode property from MD, nothing
         * more we can do.
         */
        cmn_err(CE_WARN, "!unable to get switch mode property");
        goto md_prop_exit;
    }

    curr_mode = smode;
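    /*
     * Note: the switch-mode property is a packed list of
     * NUL-terminated strings (e.g. "switched\0promiscuous\0"),
     * which is why each pass of the loop below advances curr_mode
     * by strlen(curr_mode) + 1.
     */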
    /*
     * Modes of operation:
     * 'switched'    - layer 2 switching, underlying HW in
     *                 programmed mode.
     * 'promiscuous' - layer 2 switching, underlying HW in
     *                 promiscuous mode.
     * 'routed'      - layer 3 (i.e. IP) routing, underlying HW
     *                 in non-promiscuous mode.
     */
    while ((curr_mode < (smode + len)) && (vswp->smode_num < NUM_SMODES)) {
        D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
        if (strcmp(curr_mode, "switched") == 0) {
            vswp->smode[vswp->smode_num++] = VSW_LAYER2;
        } else if (strcmp(curr_mode, "promiscuous") == 0) {
            vswp->smode[vswp->smode_num++] = VSW_LAYER2_PROMISC;
        } else if (strcmp(curr_mode, "routed") == 0) {
            vswp->smode[vswp->smode_num++] = VSW_LAYER3;
        } else {
            cmn_err(CE_WARN, "Unknown switch mode %s, setting to"
                " default switched mode", curr_mode);
            vswp->smode[vswp->smode_num++] = VSW_LAYER2;
        }
        curr_mode += strlen(curr_mode) + 1;
    }

    D2(vswp, "%d switching modes specified", vswp->smode_num);

    if (vswp->smode_num > 0)
        vswp->mdprops |= VSW_MD_SMODE;

md_prop_exit:
    (void) md_fini_handle(mdp);

    kmem_free(listp, listsz);

    D1(vswp, "%s: exit", __func__);
}

/*
 * Get the mac address of the physical device.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_get_physaddr(vsw_t *vswp)
{
    mac_handle_t mh;
    char drv[LIFNAMSIZ];
    uint_t ddi_instance;

    D1(vswp, "%s: enter", __func__);

    if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS)
        return (1);

    if (mac_open(vswp->physname, ddi_instance, &mh) != 0) {
        cmn_err(CE_WARN, "!mac_open %s failed", vswp->physname);
        return (1);
    }

    READ_ENTER(&vswp->if_lockrw);
    mac_unicst_get(mh, vswp->if_addr.ether_addr_octet);
    RW_EXIT(&vswp->if_lockrw);

    mac_close(mh);

    vswp->mdprops |= VSW_DEV_MACADDR;

    D1(vswp, "%s: exit", __func__);

    return (0);
}

/*
 * Check to see if the card supports the setting of multiple unicst
 * addresses.
 *
 * Returns 0 if card supports the programming of multiple unicast addresses
 * and there are free address slots available, otherwise returns 1.
 */
static int
vsw_get_hw_maddr(vsw_t *vswp)
{
    D1(vswp, "%s: enter", __func__);

    if (vswp->mh == NULL) {
        return (1);
    }

    if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
        DWARN(vswp, "Unable to get capabilities of"
            " underlying device (%s)", vswp->physname);
        return (1);
    }

    if (vswp->maddr.maddr_naddrfree == 0) {
        cmn_err(CE_WARN,
            "!device %s has no free unicast address slots",
            vswp->physname);
        return (1);
    }

    D2(vswp, "%s: %d addrs : %d free", __func__,
        vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);

    D1(vswp, "%s: exit", __func__);

    return (0);
}

/*
 * Setup for layer 2 switching.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_setup_layer2(vsw_t *vswp)
{
    D1(vswp, "%s: enter", __func__);

    vsw_switch_frame = vsw_switch_l2_frame;

    /*
     * Attempt to link into the MAC layer so we can get
     * and send packets out over the physical adapter.
     */
    if (vswp->mdprops & VSW_MD_PHYSNAME) {
        if (vsw_mac_attach(vswp) != 0) {
            /*
             * Registration with the MAC layer has failed,
             * so return 1 so that we can fall back to the
             * next preferred switching method.
             */
            cmn_err(CE_WARN, "!Unable to join as MAC layer "
                "client");
            return (1);
        }

        if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
            /*
             * Verify that underlying device can support multiple
             * unicast mac addresses, and has free capacity.
             */
            if (vsw_get_hw_maddr(vswp) != 0) {
                cmn_err(CE_WARN, "!unable to setup switching");
                vsw_mac_detach(vswp);
                return (1);
            }
        }
    } else {
        /*
         * No physical device name found in MD which is
         * required for layer 2.
         */
        cmn_err(CE_WARN, "!no physical device name specified");
        return (1);
    }

    D1(vswp, "%s: exit", __func__);

    return (0);
}

static int
vsw_setup_layer3(vsw_t *vswp)
{
    D1(vswp, "%s: enter", __func__);

    D2(vswp, "%s: operating in layer 3 mode", __func__);
    vsw_switch_frame = vsw_switch_l3_frame;

    D1(vswp, "%s: exit", __func__);

    return (0);
}
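
/*
 * Note that the two setup routines above select the forwarding path
 * simply by pointing the global vsw_switch_frame at the layer 2 or
 * layer 3 handler; all subsequent receive and transmit traffic is
 * funnelled through that one function pointer.
 */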
/*
 * Link into the MAC layer to gain access to the services provided by
 * the underlying physical device driver (which should also have
 * registered with the MAC layer).
 *
 * Only when in layer 2 mode.
 */
static int
vsw_mac_attach(vsw_t *vswp)
{
    char drv[LIFNAMSIZ];
    uint_t ddi_instance;

    D1(vswp, "%s: enter", __func__);

    vswp->mh = NULL;
    vswp->mrh = NULL;
    vswp->mstarted = B_FALSE;
    vswp->mresources = B_FALSE;

    ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);

    if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
        cmn_err(CE_WARN, "invalid device name: %s", vswp->physname);
        goto mac_fail_exit;
    }
    if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) {
        cmn_err(CE_WARN, "mac_open %s failed", vswp->physname);
        goto mac_fail_exit;
    }

    ASSERT(vswp->mh != NULL);

    D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);

    if (vsw_multi_ring_enable) {
        vsw_mac_ring_tbl_init(vswp);

        /*
         * Register our receive callback.
         */
        vswp->mrh = mac_rx_add(vswp->mh,
            vsw_rx_queue_cb, (void *)vswp);

        /*
         * Register our mac resource callback.
         */
        mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
        vswp->mresources = B_TRUE;

        /*
         * Get the ring resources available to us from
         * the mac below us.
         */
        mac_resources(vswp->mh);
    } else {
        /*
         * Just register our rx callback function
         */
        vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
    }

    ASSERT(vswp->mrh != NULL);

    /* Get the MAC tx fn */
    vswp->txinfo = mac_tx_get(vswp->mh);

    /* start the interface */
    if (mac_start(vswp->mh) != 0) {
        cmn_err(CE_WARN, "could not start mac interface");
        goto mac_fail_exit;
    }

    vswp->mstarted = B_TRUE;

    D1(vswp, "%s: exit", __func__);
    return (0);

mac_fail_exit:
    vsw_mac_detach(vswp);

    D1(vswp, "%s: exit", __func__);
    return (1);
}

static void
vsw_mac_detach(vsw_t *vswp)
{
    D1(vswp, "vsw_mac_detach: enter");

    ASSERT(vswp != NULL);
    ASSERT(vswp->mh != NULL);

    if (vsw_multi_ring_enable) {
        vsw_mac_ring_tbl_destroy(vswp);
    }

    if (vswp->mstarted)
        mac_stop(vswp->mh);
    if (vswp->mrh != NULL)
        mac_rx_remove(vswp->mh, vswp->mrh);
    if (vswp->mresources)
        mac_resource_set(vswp->mh, NULL, NULL);
    mac_close(vswp->mh);

    vswp->mrh = NULL;
    vswp->mh = NULL;
    vswp->txinfo = NULL;
    vswp->mstarted = B_FALSE;

    D1(vswp, "vsw_mac_detach: exit");
}

/*
 * Depending on the mode specified, and the capabilities and capacity
 * of the underlying device, set up the physical device.
 *
 * If in layer 3 mode, then do nothing.
 *
 * If in layer 2 programmed mode attempt to program the unicast address
 * associated with the port into the physical device. If this is not
 * possible due to resource exhaustion or simply because the device does
 * not support multiple unicast addresses then, if required, fall back
 * to putting the card into promisc mode.
 *
 * If in promisc mode then simply set the card into promisc mode.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_set_hw(vsw_t *vswp, vsw_port_t *port)
{
    mac_multi_addr_t mac_addr;
    void *mah;
    int err;

    D1(vswp, "%s: enter", __func__);

    if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
        return (0);

    if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
        return (vsw_set_hw_promisc(vswp, port));
    }

    if (vswp->maddr.maddr_handle == NULL)
        return (1);

    mah = vswp->maddr.maddr_handle;

    /*
     * Attempt to program the unicast address into the HW.
     */
    mac_addr.mma_addrlen = ETHERADDRL;
    ether_copy(&port->p_macaddr, &mac_addr.mma_addr);

    err = vswp->maddr.maddr_add(mah, &mac_addr);
    if (err != 0) {
        cmn_err(CE_WARN, "!failed to program addr "
            "%x:%x:%x:%x:%x:%x for port %d into device %s "
            ": err %d", port->p_macaddr.ether_addr_octet[0],
            port->p_macaddr.ether_addr_octet[1],
            port->p_macaddr.ether_addr_octet[2],
            port->p_macaddr.ether_addr_octet[3],
            port->p_macaddr.ether_addr_octet[4],
            port->p_macaddr.ether_addr_octet[5],
            port->p_instance, vswp->physname, err);

        /*
         * Mark that an attempt should be made to re-config
         * sometime in the future if a port is deleted.
         */
        vswp->recfg_reqd = B_TRUE;

        /*
         * Only 1 mode specified, nothing more to do.
         */
        if (vswp->smode_num == 1)
            return (err);

        /*
         * If promiscuous was the next mode specified try to
         * set the card into that mode.
         */
        if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
            (vswp->smode[vswp->smode_idx + 1]
            == VSW_LAYER2_PROMISC)) {
            vswp->smode_idx += 1;
            return (vsw_set_hw_promisc(vswp, port));
        }
        return (err);
    }

    port->addr_slot = mac_addr.mma_slot;
    port->addr_set = VSW_ADDR_HW;

    D2(vswp, "programmed addr %x:%x:%x:%x:%x:%x for port %d "
        "into slot %d of device %s",
        port->p_macaddr.ether_addr_octet[0],
        port->p_macaddr.ether_addr_octet[1],
        port->p_macaddr.ether_addr_octet[2],
        port->p_macaddr.ether_addr_octet[3],
        port->p_macaddr.ether_addr_octet[4],
        port->p_macaddr.ether_addr_octet[5],
        port->p_instance, port->addr_slot, vswp->physname);

    D1(vswp, "%s: exit", __func__);

    return (0);
}
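
/*
 * port->addr_set records how a port's unicast address is currently
 * plumbed into the physical device: VSW_ADDR_HW (programmed into a
 * unicast address slot), VSW_ADDR_PROMISC (relying on the device
 * being in promiscuous mode) or VSW_ADDR_UNSET (neither).
 */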
/*
 * If in layer 3 mode do nothing.
 *
 * If in layer 2 switched mode remove the address from the physical
 * device.
 *
 * If in layer 2 promiscuous mode disable promisc mode.
 *
 * Returns 0 on success.
 */
static int
vsw_unset_hw(vsw_t *vswp, vsw_port_t *port)
{
    int err;
    void *mah;

    D1(vswp, "%s: enter", __func__);

    if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
        return (0);

    if (port->addr_set == VSW_ADDR_PROMISC) {
        return (vsw_unset_hw_promisc(vswp, port));
    }

    if (port->addr_set == VSW_ADDR_HW) {
        if (vswp->mh == NULL)
            return (1);

        if (vswp->maddr.maddr_handle == NULL)
            return (1);

        mah = vswp->maddr.maddr_handle;

        err = vswp->maddr.maddr_remove(mah, port->addr_slot);
        if (err != 0) {
            cmn_err(CE_WARN, "!Unable to remove addr "
                "%x:%x:%x:%x:%x:%x for port %d from device %s"
                " : (err %d)",
                port->p_macaddr.ether_addr_octet[0],
                port->p_macaddr.ether_addr_octet[1],
                port->p_macaddr.ether_addr_octet[2],
                port->p_macaddr.ether_addr_octet[3],
                port->p_macaddr.ether_addr_octet[4],
                port->p_macaddr.ether_addr_octet[5],
                port->p_instance, vswp->physname, err);
            return (err);
        }

        port->addr_set = VSW_ADDR_UNSET;

        D2(vswp, "removed addr %x:%x:%x:%x:%x:%x for "
            "port %d from device %s",
            port->p_macaddr.ether_addr_octet[0],
            port->p_macaddr.ether_addr_octet[1],
            port->p_macaddr.ether_addr_octet[2],
            port->p_macaddr.ether_addr_octet[3],
            port->p_macaddr.ether_addr_octet[4],
            port->p_macaddr.ether_addr_octet[5],
            port->p_instance, vswp->physname);
    }

    D1(vswp, "%s: exit", __func__);
    return (0);
}

/*
 * Set network card into promisc mode.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port)
{
    D1(vswp, "%s: enter", __func__);

    if (vswp->mh == NULL)
        return (1);

    if (vswp->promisc_cnt++ == 0) {
        if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
            vswp->promisc_cnt--;
            return (1);
        }
        cmn_err(CE_NOTE, "!switching device %s into promiscuous mode",
            vswp->physname);
    }
    port->addr_set = VSW_ADDR_PROMISC;

    D1(vswp, "%s: exit", __func__);

    return (0);
}
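
/*
 * Promiscuous mode is reference counted across ports via promisc_cnt
 * (see above): only the first port to require it actually switches
 * the device into promiscuous mode, and only the last port out
 * switches it back.
 */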
/*
 * Turn off promiscuous mode on network card.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port)
{
    vsw_port_list_t *plist = &vswp->plist;

    D1(vswp, "%s: enter", __func__);

    if (vswp->mh == NULL)
        return (1);

    ASSERT(port->addr_set == VSW_ADDR_PROMISC);

    if (--vswp->promisc_cnt == 0) {
        if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
            vswp->promisc_cnt++;
            return (1);
        }

        /*
         * We are exiting promisc mode either because we were
         * only in promisc mode because we had failed over from
         * switched mode due to HW resource issues, or the user
         * wanted the card in promisc mode for all the ports and
         * the last port is now being deleted. Tweak the message
         * accordingly.
         */
        if (plist->num_ports != 0) {
            cmn_err(CE_NOTE, "!switching device %s back to "
                "programmed mode", vswp->physname);
        } else {
            cmn_err(CE_NOTE, "!switching device %s out of "
                "promiscuous mode", vswp->physname);
        }
    }
    port->addr_set = VSW_ADDR_UNSET;

    D1(vswp, "%s: exit", __func__);
    return (0);
}

/*
 * Determine whether or not we are operating in our preferred
 * mode and, if not, whether the physical resources now allow us
 * to operate in it.
 *
 * Should only be invoked after the port which is being deleted has
 * been removed from the port list.
 */
static int
vsw_reconfig_hw(vsw_t *vswp)
{
    vsw_port_list_t *plist = &vswp->plist;
    mac_multi_addr_t mac_addr;
    vsw_port_t *tp;
    void *mah;
    int rv = 0;
    int s_idx;

    D1(vswp, "%s: enter", __func__);

    if (vswp->maddr.maddr_handle == NULL)
        return (1);

    /*
     * Check if there are now sufficient HW resources to
     * attempt a re-config.
     */
    if (plist->num_ports > vswp->maddr.maddr_naddrfree)
        return (1);

    /*
     * If we are in layer 2 (i.e. switched) or would like to be
     * in layer 2 then check if any ports need to be programmed
     * into the HW.
     *
     * This can happen in two cases - switched was specified as
     * the preferred mode of operation but we exhausted the HW
     * resources and so failed over to the next specified mode,
     * or switched was the only mode specified so after HW
     * resources were exhausted there was nothing more we
     * could do.
     */
    if (vswp->smode_idx > 0)
        s_idx = vswp->smode_idx - 1;
    else
        s_idx = vswp->smode_idx;

    if (vswp->smode[s_idx] == VSW_LAYER2) {
        mah = vswp->maddr.maddr_handle;

        D2(vswp, "%s: attempting reconfig..", __func__);

        /*
         * Scan the port list for any port whose address has not
         * been programmed into HW - there should be a max of one.
         */
        for (tp = plist->head; tp != NULL; tp = tp->p_next) {
            if (tp->addr_set != VSW_ADDR_HW) {
                mac_addr.mma_addrlen = ETHERADDRL;
                ether_copy(&tp->p_macaddr, &mac_addr.mma_addr);

                rv = vswp->maddr.maddr_add(mah, &mac_addr);
                if (rv != 0) {
                    DWARN(vswp, "Error setting addr in "
                        "HW for port %d err %d",
                        tp->p_instance, rv);
                    goto reconfig_err_exit;
                }
                tp->addr_slot = mac_addr.mma_slot;

                D2(vswp, "re-programmed port %d "
                    "addr %x:%x:%x:%x:%x:%x into slot %d"
                    " of device %s", tp->p_instance,
                    tp->p_macaddr.ether_addr_octet[0],
                    tp->p_macaddr.ether_addr_octet[1],
                    tp->p_macaddr.ether_addr_octet[2],
                    tp->p_macaddr.ether_addr_octet[3],
                    tp->p_macaddr.ether_addr_octet[4],
                    tp->p_macaddr.ether_addr_octet[5],
                    tp->addr_slot, vswp->physname);

                /*
                 * If up to now we had to put the card into
                 * promisc mode to see this address, we
                 * can now safely disable promisc mode.
                 */
                if (tp->addr_set == VSW_ADDR_PROMISC)
                    (void) vsw_unset_hw_promisc(vswp, tp);

                tp->addr_set = VSW_ADDR_HW;
            }
        }

        /* no further re-config needed */
        vswp->recfg_reqd = B_FALSE;

        vswp->smode_idx = s_idx;

        return (0);
    }

reconfig_err_exit:
    return (rv);
}

static void
vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
{
    ringp->ring_state = VSW_MAC_RING_FREE;
    ringp->ring_arg = NULL;
    ringp->ring_blank = NULL;
    ringp->ring_vqp = NULL;
    ringp->ring_vswp = vswp;
}

static void
vsw_mac_ring_tbl_init(vsw_t *vswp)
{
    int i;

    mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);

    vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
    vswp->mac_ring_tbl =
        kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t),
        KM_SLEEP);

    for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
        vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]);
}

static void
vsw_mac_ring_tbl_destroy(vsw_t *vswp)
{
    int i;

    mutex_enter(&vswp->mac_ring_lock);
    for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
        if (vswp->mac_ring_tbl[i].ring_state != VSW_MAC_RING_FREE) {
            /*
             * Destroy the queue.
             */
            vsw_queue_stop(vswp->mac_ring_tbl[i].ring_vqp);
            vsw_queue_destroy(vswp->mac_ring_tbl[i].ring_vqp);

            /*
             * Re-initialize the structure.
             */
            vsw_mac_ring_tbl_entry_init(vswp,
                &vswp->mac_ring_tbl[i]);
        }
    }
    mutex_exit(&vswp->mac_ring_lock);

    mutex_destroy(&vswp->mac_ring_lock);
    kmem_free(vswp->mac_ring_tbl,
        vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));
    vswp->mac_ring_tbl_sz = 0;
}
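
/*
 * The ring table is a fixed-size array of vsw_mac_rx_rings entries;
 * each in-use entry owns a packet queue and a dedicated worker
 * thread (created below) that drains it.
 */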
/*
 * Handle resource add callbacks from the driver below.
 */
static mac_resource_handle_t
vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
{
    vsw_t *vswp = (vsw_t *)arg;
    mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp;
    vsw_mac_ring_t *ringp;
    vsw_queue_t *vqp;
    int i;

    ASSERT(vswp != NULL);
    ASSERT(mrp != NULL);
    ASSERT(vswp->mac_ring_tbl != NULL);

    D1(vswp, "%s: enter", __func__);

    /*
     * Check to make sure we have the correct resource type.
     */
    if (mrp->mr_type != MAC_RX_FIFO)
        return (NULL);

    /*
     * Find an open entry in the ring table.
     */
    mutex_enter(&vswp->mac_ring_lock);
    for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
        ringp = &vswp->mac_ring_tbl[i];

        /*
         * Check for an empty slot, if found, then setup queue
         * and thread.
         */
        if (ringp->ring_state == VSW_MAC_RING_FREE) {
            /*
             * Create the queue for this ring.
             */
            vqp = vsw_queue_create();

            /*
             * Initialize the ring data structure.
             */
            ringp->ring_vqp = vqp;
            ringp->ring_arg = mrfp->mrf_arg;
            ringp->ring_blank = mrfp->mrf_blank;
            ringp->ring_state = VSW_MAC_RING_INUSE;

            /*
             * Create the worker thread.
             */
            vqp->vq_worker = thread_create(NULL, 0,
                vsw_queue_worker, ringp, 0, &p0,
                TS_RUN, minclsyspri);
            if (vqp->vq_worker == NULL) {
                vsw_queue_destroy(vqp);
                vsw_mac_ring_tbl_entry_init(vswp, ringp);
                ringp = NULL;
            }

            mutex_exit(&vswp->mac_ring_lock);
            D1(vswp, "%s: exit", __func__);
            return ((mac_resource_handle_t)ringp);
        }
    }
    mutex_exit(&vswp->mac_ring_lock);

    /*
     * No slots in the ring table available.
     */
    D1(vswp, "%s: exit", __func__);
    return (NULL);
}

static void
vsw_queue_stop(vsw_queue_t *vqp)
{
    mutex_enter(&vqp->vq_lock);

    if (vqp->vq_state == VSW_QUEUE_RUNNING) {
        vqp->vq_state = VSW_QUEUE_STOP;
        cv_signal(&vqp->vq_cv);

        while (vqp->vq_state != VSW_QUEUE_DRAINED)
            cv_wait(&vqp->vq_cv, &vqp->vq_lock);
    }

    mutex_exit(&vqp->vq_lock);
}

static vsw_queue_t *
vsw_queue_create(void)
{
    vsw_queue_t *vqp;

    vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP);

    mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL);
    cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL);
    vqp->vq_first = NULL;
    vqp->vq_last = NULL;
    vqp->vq_state = VSW_QUEUE_STOP;

    return (vqp);
}

static void
vsw_queue_destroy(vsw_queue_t *vqp)
{
    cv_destroy(&vqp->vq_cv);
    mutex_destroy(&vqp->vq_lock);
    kmem_free(vqp, sizeof (vsw_queue_t));
}

static void
vsw_queue_worker(vsw_mac_ring_t *rrp)
{
    mblk_t *mp;
    vsw_queue_t *vqp = rrp->ring_vqp;
    vsw_t *vswp = rrp->ring_vswp;

    mutex_enter(&vqp->vq_lock);

    ASSERT(vqp->vq_state == VSW_QUEUE_STOP);

    /*
     * Set the state to running, since the thread is now active.
     */
    vqp->vq_state = VSW_QUEUE_RUNNING;

    while (vqp->vq_state == VSW_QUEUE_RUNNING) {
        /*
         * Wait for work to do, or for the state to change
         * to not running.
         */
        while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
            (vqp->vq_first == NULL)) {
            cv_wait(&vqp->vq_cv, &vqp->vq_lock);
        }

        /*
         * Process packets that we received from the interface.
         */
        if (vqp->vq_first != NULL) {
            mp = vqp->vq_first;

            vqp->vq_first = NULL;
            vqp->vq_last = NULL;

            mutex_exit(&vqp->vq_lock);

            /* switch the chain of packets received */
            vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);

            mutex_enter(&vqp->vq_lock);
        }
    }

    /*
     * We are drained and signal we are done.
     */
    vqp->vq_state = VSW_QUEUE_DRAINED;
    cv_signal(&vqp->vq_cv);

    /*
     * Exit lock and drain the remaining packets.
     */
    mutex_exit(&vqp->vq_lock);

    /*
     * Exit the thread
     */
    thread_exit();
}
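
/*
 * The queue and worker above form a simple producer/consumer pair:
 * vsw_rx_queue_cb() (below) appends incoming mblk chains under
 * vq_lock and signals vq_cv, while vsw_queue_worker() drains the
 * queue and hands each chain to vsw_switch_frame().
 */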
/*
 * static void
 * vsw_rx_queue_cb() - Receive callback routine when
 * vsw_multi_ring_enable is non-zero. Queue the packets
 * to a packet queue for a worker thread to process.
 */
static void
vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
{
    vsw_mac_ring_t *ringp = (vsw_mac_ring_t *)mrh;
    vsw_t *vswp = (vsw_t *)arg;
    vsw_queue_t *vqp;
    mblk_t *bp, *last;

    ASSERT(mrh != NULL);
    ASSERT(vswp != NULL);
    ASSERT(mp != NULL);

    D1(vswp, "%s: enter", __func__);

    /*
     * Find the last element in the mblk chain.
     */
    bp = mp;
    do {
        last = bp;
        bp = bp->b_next;
    } while (bp != NULL);

    /* Get the queue for the packets */
    vqp = ringp->ring_vqp;

    /*
     * Grab the lock so we can queue the packets.
     */
    mutex_enter(&vqp->vq_lock);

    if (vqp->vq_state != VSW_QUEUE_RUNNING) {
        freemsg(mp);
        goto vsw_rx_queue_cb_exit;
    }

    /*
     * Add the mblk chain to the queue. If there
     * are already mblks on the queue, then add the
     * new chain to the end.
     */
    if (vqp->vq_first == NULL)
        vqp->vq_first = mp;
    else
        vqp->vq_last->b_next = mp;

    vqp->vq_last = last;

    /*
     * Signal the worker thread that there is work to
     * do.
     */
    cv_signal(&vqp->vq_cv);

    /*
     * Let go of the lock and exit.
     */
vsw_rx_queue_cb_exit:
    mutex_exit(&vqp->vq_lock);
    D1(vswp, "%s: exit", __func__);
}

/*
 * receive callback routine. Invoked by MAC layer when there
 * are pkts being passed up from physical device.
 *
 * PERF: It may be more efficient when the card is in promisc
 * mode to check the dest address of the pkts here (against
 * the FDB) rather than checking later. Needs to be investigated.
 */
static void
vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
{
    _NOTE(ARGUNUSED(mrh))

    vsw_t *vswp = (vsw_t *)arg;

    ASSERT(vswp != NULL);

    D1(vswp, "vsw_rx_cb: enter");

    /* switch the chain of packets received */
    vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);

    D1(vswp, "vsw_rx_cb: exit");
}
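
/*
 * Note that the transmit loop below sends one mblk at a time; if the
 * MAC tx routine hands an mblk back (i.e. it could not be sent), the
 * unsent remainder of the chain is re-linked onto it and returned to
 * the caller.
 */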
/*
 * Send a message out over the physical device via the MAC layer.
 *
 * Returns any mblks that it was unable to transmit.
 */
static mblk_t *
vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
{
    const mac_txinfo_t *mtp;
    mblk_t *nextp;

    if (vswp->mh == NULL) {
        DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
        return (mp);
    } else {
        for (;;) {
            nextp = mp->b_next;
            mp->b_next = NULL;

            mtp = vswp->txinfo;
            if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
                mp->b_next = nextp;
                break;
            }

            if ((mp = nextp) == NULL)
                break;
        }
    }

    return (mp);
}

/*
 * Register with the MAC layer as a network device, so we
 * can be plumbed if necessary.
 */
static int
vsw_mac_register(vsw_t *vswp)
{
    mac_register_t *macp;
    int rv;

    D1(vswp, "%s: enter", __func__);

    if ((macp = mac_alloc(MAC_VERSION)) == NULL)
        return (EINVAL);
    macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
    macp->m_driver = vswp;
    macp->m_dip = vswp->dip;
    macp->m_src_addr = (uint8_t *)&vswp->if_addr;
    macp->m_callbacks = &vsw_m_callbacks;
    macp->m_min_sdu = 0;
    macp->m_max_sdu = ETHERMTU;
    rv = mac_register(macp, &vswp->if_mh);
    mac_free(macp);
    if (rv == 0)
        vswp->if_state |= VSW_IF_REG;

    D1(vswp, "%s: exit", __func__);

    return (rv);
}

static int
vsw_mac_unregister(vsw_t *vswp)
{
    int rv = 0;

    D1(vswp, "%s: enter", __func__);

    WRITE_ENTER(&vswp->if_lockrw);

    if (vswp->if_state & VSW_IF_REG) {
        rv = mac_unregister(vswp->if_mh);
        if (rv != 0) {
            DWARN(vswp, "%s: unable to unregister from MAC "
                "framework", __func__);

            RW_EXIT(&vswp->if_lockrw);
            D1(vswp, "%s: fail exit", __func__);
            return (rv);
        }

        /* mark i/f as down and unregistered */
        vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG);
    }
    RW_EXIT(&vswp->if_lockrw);

    vswp->mdprops &= ~(VSW_MD_MACADDR | VSW_DEV_MACADDR);

    D1(vswp, "%s: exit", __func__);

    return (rv);
}

static int
vsw_m_stat(void *arg, uint_t stat, uint64_t *val)
{
    vsw_t *vswp = (vsw_t *)arg;

    D1(vswp, "%s: enter", __func__);

    if (vswp->mh == NULL)
        return (EINVAL);

    /* return stats from underlying device */
    *val = mac_stat_get(vswp->mh, stat);
    return (0);
}

static void
vsw_m_stop(void *arg)
{
    vsw_t *vswp = (vsw_t *)arg;

    D1(vswp, "%s: enter", __func__);

    WRITE_ENTER(&vswp->if_lockrw);
    vswp->if_state &= ~VSW_IF_UP;
    RW_EXIT(&vswp->if_lockrw);

    D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
}

static int
vsw_m_start(void *arg)
{
    vsw_t *vswp = (vsw_t *)arg;

    D1(vswp, "%s: enter", __func__);

    WRITE_ENTER(&vswp->if_lockrw);
    vswp->if_state |= VSW_IF_UP;
    RW_EXIT(&vswp->if_lockrw);

    D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
    return (0);
}

/*
 * Change the local interface address.
 */
static int
vsw_m_unicst(void *arg, const uint8_t *macaddr)
{
    vsw_t *vswp = (vsw_t *)arg;

    D1(vswp, "%s: enter", __func__);

    WRITE_ENTER(&vswp->if_lockrw);
    ether_copy(macaddr, &vswp->if_addr);
    RW_EXIT(&vswp->if_lockrw);

    D1(vswp, "%s: exit", __func__);

    return (0);
}
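
/*
 * Add or remove a multicast address for the vsw interface itself.
 * The address is tracked both in the multicast hash table (keyed by
 * the 64-bit value built below) and on the per-instance mcap list,
 * and is also programmed into the underlying device when one is
 * present.
 */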
2203 */ 2204 mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP); 2205 if (mcst_p == NULL) { 2206 DERR(vswp, "%s: unable to alloc mem", __func__); 2207 return (1); 2208 } 2209 mcst_p->addr = addr; 2210 2211 mutex_enter(&vswp->mca_lock); 2212 mcst_p->nextp = vswp->mcap; 2213 vswp->mcap = mcst_p; 2214 mutex_exit(&vswp->mca_lock); 2215 2216 /* 2217 * Call into the underlying driver to program the 2218 * address into HW. 2219 */ 2220 if (vswp->mh != NULL) { 2221 ret = mac_multicst_add(vswp->mh, mca); 2222 if (ret != 0) { 2223 cmn_err(CE_WARN, "!unable to add " 2224 "multicast address"); 2225 goto vsw_remove_addr; 2226 } 2227 } 2228 } else { 2229 cmn_err(CE_WARN, "!unable to add multicast address"); 2230 } 2231 return (ret); 2232 } 2233 2234 vsw_remove_addr: 2235 2236 D2(vswp, "%s: removing multicast", __func__); 2237 /* 2238 * Remove the address from the hash table.. 2239 */ 2240 if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) { 2241 2242 /* 2243 * ..and then from the list maintained in the 2244 * vsw_t structure. 2245 */ 2246 vsw_del_addr(VSW_LOCALDEV, vswp, addr); 2247 2248 if (vswp->mh != NULL) 2249 (void) mac_multicst_remove(vswp->mh, mca); 2250 } 2251 2252 D1(vswp, "%s: exit", __func__); 2253 2254 return (0); 2255 } 2256 2257 static int 2258 vsw_m_promisc(void *arg, boolean_t on) 2259 { 2260 vsw_t *vswp = (vsw_t *)arg; 2261 2262 D1(vswp, "%s: enter", __func__); 2263 2264 WRITE_ENTER(&vswp->if_lockrw); 2265 if (on) 2266 vswp->if_state |= VSW_IF_PROMISC; 2267 else 2268 vswp->if_state &= ~VSW_IF_PROMISC; 2269 RW_EXIT(&vswp->if_lockrw); 2270 2271 D1(vswp, "%s: exit", __func__); 2272 2273 return (0); 2274 } 2275 2276 static mblk_t * 2277 vsw_m_tx(void *arg, mblk_t *mp) 2278 { 2279 vsw_t *vswp = (vsw_t *)arg; 2280 2281 D1(vswp, "%s: enter", __func__); 2282 2283 vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL); 2284 2285 D1(vswp, "%s: exit", __func__); 2286 2287 return (NULL); 2288 } 2289 2290 /* 2291 * Register for machine description (MD) updates. 2292 */ 2293 static void 2294 vsw_mdeg_register(vsw_t *vswp) 2295 { 2296 mdeg_prop_spec_t *pspecp; 2297 mdeg_node_spec_t *inst_specp; 2298 mdeg_handle_t mdeg_hdl; 2299 size_t templatesz; 2300 int inst, rv; 2301 2302 D1(vswp, "%s: enter", __func__); 2303 2304 inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip, 2305 DDI_PROP_DONTPASS, reg_propname, -1); 2306 if (inst == -1) { 2307 DERR(vswp, "%s: unable to get %s property", 2308 __func__, reg_propname); 2309 return; 2310 } 2311 2312 D2(vswp, "%s: instance %d registering with mdeg", __func__, inst); 2313 2314 /* 2315 * Allocate and initialize a per-instance copy 2316 * of the global property spec array that will 2317 * uniquely identify this vsw instance.
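 * (The VSW_SET_MDEG_PROP_INST() call below plugs this instance
 * number into the matching slot of the copied template, so the
 * MDEG framework will only call us back for MD nodes belonging
 * to this vsw instance.)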
2318 */ 2319 templatesz = sizeof (vsw_prop_template); 2320 pspecp = kmem_zalloc(templatesz, KM_SLEEP); 2321 2322 bcopy(vsw_prop_template, pspecp, templatesz); 2323 2324 VSW_SET_MDEG_PROP_INST(pspecp, inst); 2325 2326 /* initialize the complete prop spec structure */ 2327 inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP); 2328 inst_specp->namep = "virtual-device"; 2329 inst_specp->specp = pspecp; 2330 2331 /* perform the registration */ 2332 rv = mdeg_register(inst_specp, &vport_match, vsw_mdeg_cb, 2333 (void *)vswp, &mdeg_hdl); 2334 2335 if (rv != MDEG_SUCCESS) { 2336 DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv); 2337 kmem_free(inst_specp, sizeof (mdeg_node_spec_t)); 2338 kmem_free(pspecp, templatesz); 2339 return; 2340 } 2341 2342 /* save off data that will be needed later */ 2343 vswp->inst_spec = inst_specp; 2344 vswp->mdeg_hdl = mdeg_hdl; 2345 2346 D1(vswp, "%s: exit", __func__); 2347 } 2348 2349 static void 2350 vsw_mdeg_unregister(vsw_t *vswp) 2351 { 2352 D1(vswp, "vsw_mdeg_unregister: enter"); 2353 2354 (void) mdeg_unregister(vswp->mdeg_hdl); 2355 2356 if ((vswp->inst_spec != NULL) && (vswp->inst_spec->specp != NULL)) { 2357 (void) kmem_free(vswp->inst_spec->specp, 2358 sizeof (vsw_prop_template)); 2359 vswp->inst_spec->specp = NULL; 2360 } 2361 2362 if (vswp->inst_spec != NULL) { 2363 (void) kmem_free(vswp->inst_spec, 2364 sizeof (mdeg_node_spec_t)); 2365 vswp->inst_spec = NULL; 2366 } 2367 2368 D1(vswp, "vsw_mdeg_unregister: exit"); 2369 } 2370 2371 static int 2372 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp) 2373 { 2374 vsw_t *vswp; 2375 int idx; 2376 md_t *mdp; 2377 mde_cookie_t node; 2378 uint64_t inst; 2379 2380 if (resp == NULL) 2381 return (MDEG_FAILURE); 2382 2383 vswp = (vsw_t *)cb_argp; 2384 2385 D1(vswp, "%s: added %d : removed %d : matched %d", 2386 __func__, resp->added.nelem, resp->removed.nelem, 2387 resp->match_prev.nelem); 2388 2389 /* process added ports */ 2390 for (idx = 0; idx < resp->added.nelem; idx++) { 2391 mdp = resp->added.mdp; 2392 node = resp->added.mdep[idx]; 2393 2394 D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node); 2395 2396 if (vsw_port_add(vswp, mdp, &node) != 0) { 2397 cmn_err(CE_WARN, "Unable to add new port (0x%lx)", 2398 node); 2399 } 2400 } 2401 2402 /* process removed ports */ 2403 for (idx = 0; idx < resp->removed.nelem; idx++) { 2404 mdp = resp->removed.mdp; 2405 node = resp->removed.mdep[idx]; 2406 2407 if (md_get_prop_val(mdp, node, id_propname, &inst)) { 2408 DERR(vswp, "%s: prop(%s) not found port(%d)", 2409 __func__, id_propname, idx); 2410 continue; 2411 } 2412 2413 D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node); 2414 2415 if (vsw_port_detach(vswp, inst) != 0) { 2416 cmn_err(CE_WARN, "Unable to remove port %ld", inst); 2417 } 2418 } 2419 2420 /* 2421 * Currently no support for updating already active ports. 2422 * So, ignore the match_curr and match_prev arrays for now. 2423 */ 2424 2425 D1(vswp, "%s: exit", __func__); 2426 2427 return (MDEG_SUCCESS); 2428 } 2429 2430 /* 2431 * Add a new port to the system. 2432 * 2433 * Returns 0 on success, 1 on failure.
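 *
 * The MD subtree scanned below is expected to look roughly like
 * this (illustrative layout only):
 *
 *	port node ("id" = port instance,
 *		   "remote-mac-address" = peer MAC)
 *	    |
 *	    +-- channel-endpoint node ("id" = ldc id)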
2434 */ 2435 int 2436 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node) 2437 { 2438 uint64_t ldc_id; 2439 uint8_t *addrp; 2440 int i, addrsz; 2441 int num_nodes = 0, nchan = 0; 2442 int listsz = 0; 2443 mde_cookie_t *listp = NULL; 2444 struct ether_addr ea; 2445 uint64_t macaddr; 2446 uint64_t inst = 0; 2447 vsw_port_t *port; 2448 2449 if (md_get_prop_val(mdp, *node, id_propname, &inst)) { 2450 DWARN(vswp, "%s: prop(%s) not found", __func__, 2451 id_propname); 2452 return (1); 2453 } 2454 2455 /* 2456 * Find the channel endpoint node(s) (which should be under this 2457 * port node) which contain the channel id(s). 2458 */ 2459 if ((num_nodes = md_node_count(mdp)) <= 0) { 2460 DERR(vswp, "%s: invalid number of nodes found (%d)", 2461 __func__, num_nodes); 2462 return (1); 2463 } 2464 2465 /* allocate enough space for node list */ 2466 listsz = num_nodes * sizeof (mde_cookie_t); 2467 listp = kmem_zalloc(listsz, KM_SLEEP); 2468 2469 nchan = md_scan_dag(mdp, *node, 2470 md_find_name(mdp, chan_propname), 2471 md_find_name(mdp, "fwd"), listp); 2472 2473 if (nchan <= 0) { 2474 DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname); 2475 kmem_free(listp, listsz); 2476 return (1); 2477 } 2478 2479 D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname); 2480 2481 /* use property from first node found */ 2482 if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) { 2483 DWARN(vswp, "%s: prop(%s) not found\n", __func__, 2484 id_propname); 2485 kmem_free(listp, listsz); 2486 return (1); 2487 } 2488 2489 /* don't need list any more */ 2490 kmem_free(listp, listsz); 2491 2492 D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id); 2493 2494 /* read mac-address property */ 2495 if (md_get_prop_data(mdp, *node, remaddr_propname, 2496 &addrp, &addrsz)) { 2497 DWARN(vswp, "%s: prop(%s) not found", 2498 __func__, remaddr_propname); 2499 return (1); 2500 } 2501 2502 if (addrsz < ETHERADDRL) { 2503 DWARN(vswp, "%s: invalid address size", __func__); 2504 return (1); 2505 } 2506 2507 macaddr = *((uint64_t *)addrp); 2508 D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr); 2509 2510 for (i = ETHERADDRL - 1; i >= 0; i--) { 2511 ea.ether_addr_octet[i] = macaddr & 0xFF; 2512 macaddr >>= 8; 2513 } 2514 2515 if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) { 2516 DERR(vswp, "%s: failed to attach port", __func__); 2517 return (1); 2518 } 2519 2520 port = vsw_lookup_port(vswp, (int)inst); 2521 2522 /* just successfully created the port, so it should exist */ 2523 ASSERT(port != NULL); 2524 2525 return (0); 2526 } 2527 2528 /* 2529 * Attach the specified port. 2530 * 2531 * Returns 0 on success, 1 on failure. 2532 */ 2533 static int 2534 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids, 2535 struct ether_addr *macaddr) 2536 { 2537 vsw_port_list_t *plist = &vswp->plist; 2538 vsw_port_t *port, **prev_port; 2539 int i; 2540 2541 D1(vswp, "%s: enter : port %d", __func__, p_instance); 2542 2543 /* port already exists?
*/ 2544 READ_ENTER(&plist->lockrw); 2545 for (port = plist->head; port != NULL; port = port->p_next) { 2546 if (port->p_instance == p_instance) { 2547 DWARN(vswp, "%s: port instance %d already attached", 2548 __func__, p_instance); 2549 RW_EXIT(&plist->lockrw); 2550 return (1); 2551 } 2552 } 2553 RW_EXIT(&plist->lockrw); 2554 2555 port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP); 2556 port->p_vswp = vswp; 2557 port->p_instance = p_instance; 2558 port->p_ldclist.num_ldcs = 0; 2559 port->p_ldclist.head = NULL; 2560 port->addr_set = VSW_ADDR_UNSET; 2561 2562 rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL); 2563 2564 mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL); 2565 mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL); 2566 2567 mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL); 2568 cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL); 2569 2570 mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL); 2571 cv_init(&port->state_cv, NULL, CV_DRIVER, NULL); 2572 port->state = VSW_PORT_INIT; 2573 2574 if (nids > VSW_PORT_MAX_LDCS) { 2575 D2(vswp, "%s: using first of %d ldc ids", 2576 __func__, nids); 2577 nids = VSW_PORT_MAX_LDCS; 2578 } 2579 2580 D2(vswp, "%s: %d nids", __func__, nids); 2581 for (i = 0; i < nids; i++) { 2582 D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]); 2583 if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) { 2584 DERR(vswp, "%s: ldc_attach failed", __func__); 2585 2586 rw_destroy(&port->p_ldclist.lockrw); 2587 2588 cv_destroy(&port->ref_cv); 2589 mutex_destroy(&port->ref_lock); 2590 2591 cv_destroy(&port->state_cv); 2592 mutex_destroy(&port->state_lock); 2593 2594 mutex_destroy(&port->tx_lock); 2595 mutex_destroy(&port->mca_lock); 2596 kmem_free(port, sizeof (vsw_port_t)); 2597 return (1); 2598 } 2599 } 2600 2601 ether_copy(macaddr, &port->p_macaddr); 2602 2603 WRITE_ENTER(&plist->lockrw); 2604 2605 /* create the fdb entry for this port/mac address */ 2606 (void) vsw_add_fdb(vswp, port); 2607 2608 (void) vsw_set_hw(vswp, port); 2609 2610 /* link it into the list of ports for this vsw instance */ 2611 prev_port = (vsw_port_t **)(&plist->head); 2612 port->p_next = *prev_port; 2613 *prev_port = port; 2614 plist->num_ports++; 2615 RW_EXIT(&plist->lockrw); 2616 2617 /* 2618 * Initialise the port and any ldc's under it. 2619 */ 2620 (void) vsw_init_ldcs(port); 2621 2622 D1(vswp, "%s: exit", __func__); 2623 return (0); 2624 } 2625 2626 /* 2627 * Detach the specified port. 2628 * 2629 * Returns 0 on success, 1 on failure. 2630 */ 2631 static int 2632 vsw_port_detach(vsw_t *vswp, int p_instance) 2633 { 2634 vsw_port_t *port = NULL; 2635 vsw_port_list_t *plist = &vswp->plist; 2636 2637 D1(vswp, "%s: enter: port id %d", __func__, p_instance); 2638 2639 WRITE_ENTER(&plist->lockrw); 2640 2641 if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) { 2642 RW_EXIT(&plist->lockrw); 2643 return (1); 2644 } 2645 2646 if (vsw_plist_del_node(vswp, port)) { 2647 RW_EXIT(&plist->lockrw); 2648 return (1); 2649 } 2650 2651 /* Remove the address if it was programmed into HW. */ 2652 (void) vsw_unset_hw(vswp, port); 2653 2654 /* Remove the fdb entry for this port/mac address */ 2655 (void) vsw_del_fdb(vswp, port); 2656 2657 /* Remove any multicast addresses.. */ 2658 vsw_del_mcst_port(port); 2659 2660 /* 2661 * No longer need to hold writer lock on port list now 2662 * that we have unlinked the target port from the list.
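 * (The lock is re-taken below, as reader, only for the duration
 * of any hardware reconfiguration.)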
2663 */ 2664 RW_EXIT(&plist->lockrw); 2665 2666 READ_ENTER(&plist->lockrw); 2667 2668 if (vswp->recfg_reqd) 2669 (void) vsw_reconfig_hw(vswp); 2670 2671 RW_EXIT(&plist->lockrw); 2672 2673 if (vsw_port_delete(port)) { 2674 return (1); 2675 } 2676 2677 D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance); 2678 return (0); 2679 } 2680 2681 /* 2682 * Detach all active ports. 2683 * 2684 * Returns 0 on success, 1 on failure. 2685 */ 2686 static int 2687 vsw_detach_ports(vsw_t *vswp) 2688 { 2689 vsw_port_list_t *plist = &vswp->plist; 2690 vsw_port_t *port = NULL; 2691 2692 D1(vswp, "%s: enter", __func__); 2693 2694 WRITE_ENTER(&plist->lockrw); 2695 2696 while ((port = plist->head) != NULL) { 2697 if (vsw_plist_del_node(vswp, port)) { 2698 DERR(vswp, "%s: Error deleting port %d" 2699 " from port list", __func__, 2700 port->p_instance); 2701 RW_EXIT(&plist->lockrw); 2702 return (1); 2703 } 2704 2705 /* Remove the address if it was programmed into HW. */ 2706 (void) vsw_unset_hw(vswp, port); 2707 2708 /* Remove the fdb entry for this port/mac address */ 2709 (void) vsw_del_fdb(vswp, port); 2710 2711 /* Remove any multicast addresses.. */ 2712 vsw_del_mcst_port(port); 2713 2714 /* 2715 * No longer need to hold the lock on the port list 2716 * now that we have unlinked the target port from the 2717 * list. 2718 */ 2719 RW_EXIT(&plist->lockrw); 2720 if (vsw_port_delete(port)) { 2721 DERR(vswp, "%s: Error deleting port %d", 2722 __func__, port->p_instance); 2723 return (1); 2724 } 2725 WRITE_ENTER(&plist->lockrw); 2726 } 2727 RW_EXIT(&plist->lockrw); 2728 2729 D1(vswp, "%s: exit", __func__); 2730 2731 return (0); 2732 } 2733 2734 /* 2735 * Delete the specified port. 2736 * 2737 * Returns 0 on success, 1 on failure. 2738 */ 2739 static int 2740 vsw_port_delete(vsw_port_t *port) 2741 { 2742 vsw_ldc_list_t *ldcl; 2743 vsw_t *vswp = port->p_vswp; 2744 2745 D1(vswp, "%s: enter : port id %d", __func__, port->p_instance); 2746 2747 (void) vsw_uninit_ldcs(port); 2748 2749 /* 2750 * Wait for any pending ctrl msg tasks which reference this 2751 * port to finish. 2752 */ 2753 if (vsw_drain_port_taskq(port)) 2754 return (1); 2755 2756 /* 2757 * Wait for port reference count to hit zero. 2758 */ 2759 mutex_enter(&port->ref_lock); 2760 while (port->ref_cnt != 0) 2761 cv_wait(&port->ref_cv, &port->ref_lock); 2762 mutex_exit(&port->ref_lock); 2763 2764 /* 2765 * Wait for any active callbacks to finish 2766 */ 2767 if (vsw_drain_ldcs(port)) 2768 return (1); 2769 2770 ldcl = &port->p_ldclist; 2771 WRITE_ENTER(&ldcl->lockrw); 2772 while (ldcl->num_ldcs > 0) { 2773 if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) { 2774 cmn_err(CE_WARN, "unable to detach ldc %ld", 2775 ldcl->head->ldc_id); 2776 RW_EXIT(&ldcl->lockrw); 2777 return (1); 2778 } 2779 } 2780 RW_EXIT(&ldcl->lockrw); 2781 2782 rw_destroy(&port->p_ldclist.lockrw); 2783 2784 mutex_destroy(&port->mca_lock); 2785 mutex_destroy(&port->tx_lock); 2786 cv_destroy(&port->ref_cv); 2787 mutex_destroy(&port->ref_lock); 2788 2789 cv_destroy(&port->state_cv); 2790 mutex_destroy(&port->state_lock); 2791 2792 kmem_free(port, sizeof (vsw_port_t)); 2793 2794 D1(vswp, "%s: exit", __func__); 2795 2796 return (0); 2797 } 2798 2799 /* 2800 * Attach a logical domain channel (ldc) under a specified port. 2801 * 2802 * Returns 0 on success, 1 on failure.
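 *
 * Cleanup on failure is driven by a 'progress' bitmask (a sketch of
 * the scheme used below):
 *
 *	progress |= PROG_mblks;		once the rx mblk pool exists
 *	progress |= PROG_callback;	once ldc_reg_callback() succeeds
 *
 * so the error path only undoes the steps that actually completed.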
2803 */ 2804 static int 2805 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id) 2806 { 2807 vsw_t *vswp = port->p_vswp; 2808 vsw_ldc_list_t *ldcl = &port->p_ldclist; 2809 vsw_ldc_t *ldcp = NULL; 2810 ldc_attr_t attr; 2811 ldc_status_t istatus; 2812 int status = DDI_FAILURE; 2813 int rv; 2814 enum { PROG_init = 0x0, PROG_mblks = 0x1, 2815 PROG_callback = 0x2} 2816 progress; 2817 2818 progress = PROG_init; 2819 2820 D1(vswp, "%s: enter", __func__); 2821 2822 ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP); 2823 if (ldcp == NULL) { 2824 DERR(vswp, "%s: kmem_zalloc failed", __func__); 2825 return (1); 2826 } 2827 ldcp->ldc_id = ldc_id; 2828 2829 /* allocate pool of receive mblks */ 2830 rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh)); 2831 if (rv) { 2832 DWARN(vswp, "%s: unable to create free mblk pool for" 2833 " channel %ld (rv %d)", __func__, ldc_id, rv); 2834 kmem_free(ldcp, sizeof (vsw_ldc_t)); 2835 return (1); 2836 } 2837 2838 progress |= PROG_mblks; 2839 2840 mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL); 2841 mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL); 2842 mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL); 2843 cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL); 2844 2845 /* required for handshake with peer */ 2846 ldcp->local_session = (uint64_t)ddi_get_lbolt(); 2847 ldcp->peer_session = 0; 2848 ldcp->session_status = 0; 2849 2850 mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL); 2851 ldcp->hss_id = 1; /* Initial handshake session id */ 2852 2853 /* only set for outbound lane, inbound set by peer */ 2854 mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL); 2855 mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL); 2856 vsw_set_lane_attr(vswp, &ldcp->lane_out); 2857 2858 attr.devclass = LDC_DEV_NT_SVC; 2859 attr.instance = ddi_get_instance(vswp->dip); 2860 attr.mode = LDC_MODE_UNRELIABLE; 2861 attr.mtu = VSW_LDC_MTU; 2862 status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle); 2863 if (status != 0) { 2864 DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)", 2865 __func__, ldc_id, status); 2866 goto ldc_attach_fail; 2867 } 2868 2869 status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp); 2870 if (status != 0) { 2871 DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)", 2872 __func__, ldc_id, status); 2873 (void) ldc_fini(ldcp->ldc_handle); 2874 goto ldc_attach_fail; 2875 } 2876 2877 progress |= PROG_callback; 2878 2879 mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL); 2880 2881 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 2882 DERR(vswp, "%s: ldc_status failed", __func__); 2883 mutex_destroy(&ldcp->status_lock); 2884 goto ldc_attach_fail; 2885 } 2886 2887 ldcp->ldc_status = istatus; 2888 ldcp->ldc_port = port; 2889 ldcp->ldc_vswp = vswp; 2890 2891 /* link it into the list of channels for this port */ 2892 WRITE_ENTER(&ldcl->lockrw); 2893 ldcp->ldc_next = ldcl->head; 2894 ldcl->head = ldcp; 2895 ldcl->num_ldcs++; 2896 RW_EXIT(&ldcl->lockrw); 2897 2898 D1(vswp, "%s: exit", __func__); 2899 return (0); 2900 2901 ldc_attach_fail: 2902 mutex_destroy(&ldcp->ldc_txlock); 2903 mutex_destroy(&ldcp->ldc_cblock); 2904 2905 cv_destroy(&ldcp->drain_cv); 2906 2907 if (progress & PROG_callback) { 2908 (void) ldc_unreg_callback(ldcp->ldc_handle); 2909 } 2910 2911 if ((progress & PROG_mblks) && (ldcp->rxh != NULL)) { 2912 if (vio_destroy_mblks(ldcp->rxh) != 0) { 2913 /* 2914 * Something odd has happened, as the destroy 2915 * will only fail if some mblks have been allocated 2916 * from the pool already 
(which shouldn't happen) 2917 * and have not been returned. 2918 * 2919 * Add the pool pointer to a list maintained in 2920 * the device instance. Another attempt will be made 2921 * to free the pool when the device itself detaches. 2922 */ 2923 cmn_err(CE_WARN, "Creation of ldc channel %ld failed" 2924 " and cannot destroy associated mblk pool", 2925 ldc_id); 2926 ldcp->rxh->nextp = vswp->rxh; 2927 vswp->rxh = ldcp->rxh; 2928 } 2929 } 2930 mutex_destroy(&ldcp->drain_cv_lock); 2931 mutex_destroy(&ldcp->hss_lock); 2932 2933 mutex_destroy(&ldcp->lane_in.seq_lock); 2934 mutex_destroy(&ldcp->lane_out.seq_lock); 2935 kmem_free(ldcp, sizeof (vsw_ldc_t)); 2936 2937 return (1); 2938 } 2939 2940 /* 2941 * Detach a logical domain channel (ldc) belonging to a 2942 * particular port. 2943 * 2944 * Returns 0 on success, 1 on failure. 2945 */ 2946 static int 2947 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id) 2948 { 2949 vsw_t *vswp = port->p_vswp; 2950 vsw_ldc_t *ldcp, *prev_ldcp; 2951 vsw_ldc_list_t *ldcl = &port->p_ldclist; 2952 int rv; 2953 2954 prev_ldcp = NULL; 2955 for (ldcp = ldcl->head; ldcp != NULL; prev_ldcp = ldcp, ldcp = ldcp->ldc_next) { 2956 if (ldcp->ldc_id == ldc_id) { 2957 break; 2958 } 2959 } 2960 2961 /* specified ldc id not found */ 2962 if (ldcp == NULL) { 2963 DERR(vswp, "%s: ldcp = NULL", __func__); 2964 return (1); 2965 } 2966 2967 D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id); 2968 2969 /* 2970 * Before we can close the channel we must release any mapped 2971 * resources (e.g. drings). 2972 */ 2973 vsw_free_lane_resources(ldcp, INBOUND); 2974 vsw_free_lane_resources(ldcp, OUTBOUND); 2975 2976 /* 2977 * If the close fails we are in serious trouble, as we won't 2978 * be able to delete the parent port. 2979 */ 2980 if ((rv = ldc_close(ldcp->ldc_handle)) != 0) { 2981 DERR(vswp, "%s: error %d closing channel %lld", 2982 __func__, rv, ldcp->ldc_id); 2983 return (1); 2984 } 2985 2986 (void) ldc_fini(ldcp->ldc_handle); 2987 2988 ldcp->ldc_status = LDC_INIT; 2989 ldcp->ldc_handle = NULL; 2990 ldcp->ldc_vswp = NULL; 2991 2992 if (ldcp->rxh != NULL) { 2993 if (vio_destroy_mblks(ldcp->rxh)) { 2994 /* 2995 * Most likely some mblks are still in use and 2996 * have not been returned to the pool. Add the pool 2997 * to the list maintained in the device instance. 2998 * Another attempt will be made to destroy the pool 2999 * when the device detaches. 3000 */ 3001 ldcp->rxh->nextp = vswp->rxh; 3002 vswp->rxh = ldcp->rxh; 3003 } 3004 } 3005 3006 /* unlink it from the list */ 3007 if (prev_ldcp != NULL) prev_ldcp->ldc_next = ldcp->ldc_next; else ldcl->head = ldcp->ldc_next; 3008 ldcl->num_ldcs--; 3009 3010 mutex_destroy(&ldcp->ldc_txlock); 3011 mutex_destroy(&ldcp->ldc_cblock); 3012 cv_destroy(&ldcp->drain_cv); 3013 mutex_destroy(&ldcp->drain_cv_lock); 3014 mutex_destroy(&ldcp->hss_lock); 3015 mutex_destroy(&ldcp->lane_in.seq_lock); 3016 mutex_destroy(&ldcp->lane_out.seq_lock); 3017 mutex_destroy(&ldcp->status_lock); 3018 3019 kmem_free(ldcp, sizeof (vsw_ldc_t)); 3020 3021 return (0); 3022 } 3023 3024 /* 3025 * Open and attempt to bring up the channel. Note that the channel 3026 * can only be brought up if the peer has also opened it. 3027 * 3028 * Returns 0 if we can open and bring up the channel, otherwise 3029 * returns 1.
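 *
 * In outline (ldc_up() is only a non-blocking request, so the
 * status must be re-read to see whether the channel really came up):
 *
 *	ldc_open(handle);
 *	ldc_up(handle);
 *	ldc_status(handle, &istatus);
 *	if (istatus == LDC_UP)
 *		vsw_restart_handshake(ldcp);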
3030 */ 3031 static int 3032 vsw_ldc_init(vsw_ldc_t *ldcp) 3033 { 3034 vsw_t *vswp = ldcp->ldc_vswp; 3035 ldc_status_t istatus = 0; 3036 int rv; 3037 3038 D1(vswp, "%s: enter", __func__); 3039 3040 LDC_ENTER_LOCK(ldcp); 3041 3042 /* don't start at 0 in case clients don't like that */ 3043 ldcp->next_ident = 1; 3044 3045 rv = ldc_open(ldcp->ldc_handle); 3046 if (rv != 0) { 3047 DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)", 3048 __func__, ldcp->ldc_id, rv); 3049 LDC_EXIT_LOCK(ldcp); 3050 return (1); 3051 } 3052 3053 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 3054 DERR(vswp, "%s: unable to get status", __func__); 3055 LDC_EXIT_LOCK(ldcp); 3056 return (1); 3057 3058 } else if (istatus != LDC_OPEN && istatus != LDC_READY) { 3059 DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY", 3060 __func__, ldcp->ldc_id, istatus); 3061 LDC_EXIT_LOCK(ldcp); 3062 return (1); 3063 } 3064 3065 mutex_enter(&ldcp->status_lock); 3066 ldcp->ldc_status = istatus; 3067 mutex_exit(&ldcp->status_lock); 3068 3069 rv = ldc_up(ldcp->ldc_handle); 3070 if (rv != 0) { 3071 /* 3072 * Not a fatal error for ldc_up() to fail, as peer 3073 * end point may simply not be ready yet. 3074 */ 3075 D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__, 3076 ldcp->ldc_id, rv); 3077 LDC_EXIT_LOCK(ldcp); 3078 return (1); 3079 } 3080 3081 /* 3082 * ldc_up() call is non-blocking so we need to explicitly 3083 * check channel status to see if in fact the channel 3084 * is UP. 3085 */ 3086 mutex_enter(&ldcp->status_lock); 3087 istatus = ldcp->ldc_status; 3088 if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) { 3089 DERR(vswp, "%s: unable to get status", __func__); 3090 mutex_exit(&ldcp->status_lock); 3091 LDC_EXIT_LOCK(ldcp); 3092 return (1); 3093 3094 } 3095 mutex_exit(&ldcp->status_lock); 3096 LDC_EXIT_LOCK(ldcp); 3097 3098 if ((istatus != LDC_UP) && (ldcp->ldc_status == LDC_UP)) { 3099 D2(vswp, "%s: channel %ld now UP (%ld)", __func__, 3100 ldcp->ldc_id, istatus); 3101 vsw_restart_handshake(ldcp); 3102 } 3103 3104 D1(vswp, "%s: exit", __func__); 3105 return (0); 3106 } 3107 3108 /* disable callbacks on the channel */ 3109 static int 3110 vsw_ldc_uninit(vsw_ldc_t *ldcp) 3111 { 3112 vsw_t *vswp = ldcp->ldc_vswp; 3113 int rv; 3114 3115 D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id); 3116 3117 LDC_ENTER_LOCK(ldcp); 3118 3119 rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE); 3120 if (rv != 0) { 3121 DERR(vswp, "vsw_ldc_uninit(%lld): error disabling " 3122 "interrupts (rv = %d)\n", ldcp->ldc_id, rv); 3123 LDC_EXIT_LOCK(ldcp); 3124 return (1); 3125 } 3126 3127 mutex_enter(&ldcp->status_lock); 3128 ldcp->ldc_status = LDC_INIT; 3129 mutex_exit(&ldcp->status_lock); 3130 3131 LDC_EXIT_LOCK(ldcp); 3132 3133 D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id); 3134 3135 return (0); 3136 } 3137 3138 static int 3139 vsw_init_ldcs(vsw_port_t *port) 3140 { 3141 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3142 vsw_ldc_t *ldcp; 3143 3144 READ_ENTER(&ldcl->lockrw); 3145 ldcp = ldcl->head; 3146 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 3147 (void) vsw_ldc_init(ldcp); 3148 } 3149 RW_EXIT(&ldcl->lockrw); 3150 3151 return (0); 3152 } 3153 3154 static int 3155 vsw_uninit_ldcs(vsw_port_t *port) 3156 { 3157 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3158 vsw_ldc_t *ldcp; 3159 3160 D1(NULL, "vsw_uninit_ldcs: enter\n"); 3161 3162 READ_ENTER(&ldcl->lockrw); 3163 ldcp = ldcl->head; 3164 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 3165 (void) vsw_ldc_uninit(ldcp); 3166 } 3167 RW_EXIT(&ldcl->lockrw); 3168 3169 D1(NULL,
"vsw_uninit_ldcs: exit\n"); 3170 3171 return (0); 3172 } 3173 3174 /* 3175 * Wait until the callback(s) associated with the ldcs under the specified 3176 * port have completed. 3177 * 3178 * Prior to this function being invoked each channel under this port 3179 * should have been quiesced via ldc_set_cb_mode(DISABLE). 3180 * 3181 * A short explaination of what we are doing below.. 3182 * 3183 * The simplest approach would be to have a reference counter in 3184 * the ldc structure which is increment/decremented by the callbacks as 3185 * they use the channel. The drain function could then simply disable any 3186 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately 3187 * there is a tiny window here - before the callback is able to get the lock 3188 * on the channel it is interrupted and this function gets to execute. It 3189 * sees that the ref count is zero and believes its free to delete the 3190 * associated data structures. 3191 * 3192 * We get around this by taking advantage of the fact that before the ldc 3193 * framework invokes a callback it sets a flag to indicate that there is a 3194 * callback active (or about to become active). If when we attempt to 3195 * unregister a callback when this active flag is set then the unregister 3196 * will fail with EWOULDBLOCK. 3197 * 3198 * If the unregister fails we do a cv_timedwait. We will either be signaled 3199 * by the callback as it is exiting (note we have to wait a short period to 3200 * allow the callback to return fully to the ldc framework and it to clear 3201 * the active flag), or by the timer expiring. In either case we again attempt 3202 * the unregister. We repeat this until we can succesfully unregister the 3203 * callback. 3204 * 3205 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch 3206 * the case where the callback has finished but the ldc framework has not yet 3207 * cleared the active flag. In this case we would never get a cv_signal. 3208 */ 3209 static int 3210 vsw_drain_ldcs(vsw_port_t *port) 3211 { 3212 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3213 vsw_ldc_t *ldcp; 3214 vsw_t *vswp = port->p_vswp; 3215 3216 D1(vswp, "%s: enter", __func__); 3217 3218 READ_ENTER(&ldcl->lockrw); 3219 3220 ldcp = ldcl->head; 3221 3222 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 3223 /* 3224 * If we can unregister the channel callback then we 3225 * know that there is no callback either running or 3226 * scheduled to run for this channel so move on to next 3227 * channel in the list. 3228 */ 3229 mutex_enter(&ldcp->drain_cv_lock); 3230 3231 /* prompt active callbacks to quit */ 3232 ldcp->drain_state = VSW_LDC_DRAINING; 3233 3234 if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) { 3235 D2(vswp, "%s: unreg callback for chan %ld", __func__, 3236 ldcp->ldc_id); 3237 mutex_exit(&ldcp->drain_cv_lock); 3238 continue; 3239 } else { 3240 /* 3241 * If we end up here we know that either 1) a callback 3242 * is currently executing, 2) is about to start (i.e. 3243 * the ldc framework has set the active flag but 3244 * has not actually invoked the callback yet, or 3) 3245 * has finished and has returned to the ldc framework 3246 * but the ldc framework has not yet cleared the 3247 * active bit. 3248 * 3249 * Wait for it to finish. 
3250 */ 3251 while (ldc_unreg_callback(ldcp->ldc_handle) 3252 == EWOULDBLOCK) 3253 (void) cv_timedwait(&ldcp->drain_cv, 3254 &ldcp->drain_cv_lock, lbolt + hz); 3255 3256 mutex_exit(&ldcp->drain_cv_lock); 3257 D2(vswp, "%s: unreg callback for chan %ld after " 3258 "timeout", __func__, ldcp->ldc_id); 3259 } 3260 } 3261 RW_EXIT(&ldcl->lockrw); 3262 3263 D1(vswp, "%s: exit", __func__); 3264 return (0); 3265 } 3266 3267 /* 3268 * Wait until all tasks which reference this port have completed. 3269 * 3270 * Prior to this function being invoked each channel under this port 3271 * should have been quiesced via ldc_set_cb_mode(DISABLE). 3272 */ 3273 static int 3274 vsw_drain_port_taskq(vsw_port_t *port) 3275 { 3276 vsw_t *vswp = port->p_vswp; 3277 3278 D1(vswp, "%s: enter", __func__); 3279 3280 /* 3281 * Mark the port as in the process of being detached, and 3282 * dispatch a marker task to the queue so we know when all 3283 * relevant tasks have completed. 3284 */ 3285 mutex_enter(&port->state_lock); 3286 port->state = VSW_PORT_DETACHING; 3287 3288 if ((vswp->taskq_p == NULL) || 3289 (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task, 3290 port, DDI_NOSLEEP) != DDI_SUCCESS)) { 3291 DERR(vswp, "%s: unable to dispatch marker task", 3292 __func__); 3293 mutex_exit(&port->state_lock); 3294 return (1); 3295 } 3296 3297 /* 3298 * Wait for the marker task to finish. 3299 */ 3300 while (port->state != VSW_PORT_DETACHABLE) 3301 cv_wait(&port->state_cv, &port->state_lock); 3302 3303 mutex_exit(&port->state_lock); 3304 3305 D1(vswp, "%s: exit", __func__); 3306 3307 return (0); 3308 } 3309 3310 static void 3311 vsw_marker_task(void *arg) 3312 { 3313 vsw_port_t *port = arg; 3314 vsw_t *vswp = port->p_vswp; 3315 3316 D1(vswp, "%s: enter", __func__); 3317 3318 mutex_enter(&port->state_lock); 3319 3320 /* 3321 * No further tasks should be dispatched which reference 3322 * this port so it is ok to mark it as safe to detach. 3323 */ 3324 port->state = VSW_PORT_DETACHABLE; 3325 3326 cv_signal(&port->state_cv); 3327 3328 mutex_exit(&port->state_lock); 3329 3330 D1(vswp, "%s: exit", __func__); 3331 } 3332 3333 static vsw_port_t * 3334 vsw_lookup_port(vsw_t *vswp, int p_instance) 3335 { 3336 vsw_port_list_t *plist = &vswp->plist; 3337 vsw_port_t *port; 3338 3339 for (port = plist->head; port != NULL; port = port->p_next) { 3340 if (port->p_instance == p_instance) { 3341 D2(vswp, "vsw_lookup_port: found p_instance\n"); 3342 return (port); 3343 } 3344 } 3345 3346 return (NULL); 3347 } 3348 3349 /* 3350 * Search for and remove the specified port from the port 3351 * list. Returns 0 if able to locate and remove port, otherwise 3352 * returns 1. 3353 */ 3354 static int 3355 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port) 3356 { 3357 vsw_port_list_t *plist = &vswp->plist; 3358 vsw_port_t *curr_p, *prev_p; 3359 3360 if (plist->head == NULL) 3361 return (1); 3362 3363 curr_p = prev_p = plist->head; 3364 3365 while (curr_p != NULL) { 3366 if (curr_p == port) { 3367 if (prev_p == curr_p) { 3368 plist->head = curr_p->p_next; 3369 } else { 3370 prev_p->p_next = curr_p->p_next; 3371 } 3372 plist->num_ports--; 3373 return (0); 3374 } else { 3375 prev_p = curr_p; 3376 curr_p = curr_p->p_next; 3377 } 3378 } 3379 return (1); 3380 } 3381 3382 /* 3383 * Interrupt handler for ldc messages.
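 *
 * Events handled below: LDC_EVT_UP (kick off the handshake),
 * LDC_EVT_READ (process incoming packets) and
 * LDC_EVT_DOWN/LDC_EVT_RESET (attempt to restart the connection).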
3384 */ 3385 static uint_t 3386 vsw_ldc_cb(uint64_t event, caddr_t arg) 3387 { 3388 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; 3389 vsw_t *vswp = ldcp->ldc_vswp; 3390 ldc_status_t lstatus; 3391 int rv; 3392 3393 D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); 3394 3395 mutex_enter(&ldcp->ldc_cblock); 3396 3397 if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) { 3398 mutex_exit(&ldcp->ldc_cblock); 3399 return (LDC_SUCCESS); 3400 } 3401 3402 mutex_enter(&ldcp->status_lock); 3403 lstatus = ldcp->ldc_status; 3404 rv = ldc_status(ldcp->ldc_handle, &ldcp->ldc_status); 3405 mutex_exit(&ldcp->status_lock); 3406 if (rv != 0) { 3407 cmn_err(CE_WARN, "Unable to read channel state"); 3408 goto vsw_cb_exit; 3409 } 3410 3411 if (event & LDC_EVT_UP) { 3412 /* 3413 * Channel has come up, get the state and then start 3414 * the handshake. 3415 */ 3416 D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)", 3417 __func__, ldcp->ldc_id, event, lstatus); 3418 D2(vswp, "%s: UP: old status %ld : cur status %ld", 3419 __func__, lstatus, ldcp->ldc_status); 3420 if ((ldcp->ldc_status != lstatus) && 3421 (ldcp->ldc_status == LDC_UP)) { 3422 vsw_restart_handshake(ldcp); 3423 } 3424 3425 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); 3426 } 3427 3428 if (event & LDC_EVT_READ) { 3429 /* 3430 * Data available for reading. 3431 */ 3432 D2(vswp, "%s: id(%ld) event(%llx) data READ", 3433 __func__, ldcp->ldc_id, event); 3434 3435 vsw_process_pkt(ldcp); 3436 3437 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); 3438 3439 goto vsw_cb_exit; 3440 } 3441 3442 if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) { 3443 D2(vswp, "%s: id(%ld) event(%llx) DOWN/RESET", 3444 __func__, ldcp->ldc_id, event); 3445 3446 /* attempt to restart the connection */ 3447 vsw_restart_ldc(ldcp); 3448 3449 /* 3450 * vsw_restart_ldc() will attempt to bring the channel 3451 * back up. Check here to see if that succeeded. 3452 */ 3453 mutex_enter(&ldcp->status_lock); 3454 lstatus = ldcp->ldc_status; 3455 rv = ldc_status(ldcp->ldc_handle, &ldcp->ldc_status); 3456 mutex_exit(&ldcp->status_lock); 3457 if (rv != 0) { 3458 DERR(vswp, "%s: unable to read status for channel %ld", 3459 __func__, ldcp->ldc_id); 3460 goto vsw_cb_exit; 3461 } 3462 3463 D2(vswp, "%s: id(%ld) event(%llx) DOWN/RESET event:" 3464 " old status %ld : cur status %ld", __func__, 3465 ldcp->ldc_id, event, lstatus, ldcp->ldc_status); 3466 3467 /* 3468 * If channel was not previously UP then (re)start the 3469 * handshake. 3470 */ 3471 if ((ldcp->ldc_status == LDC_UP) && (lstatus != LDC_UP)) { 3472 D2(vswp, "%s: channel %ld now UP, restarting " 3473 "handshake", __func__, ldcp->ldc_id); 3474 vsw_restart_handshake(ldcp); 3475 } 3476 } 3477 3478 /* 3479 * Catch either LDC_EVT_WRITE which we don't support or any 3480 * unknown event. 3481 */ 3482 if (event & ~(LDC_EVT_UP | LDC_EVT_RESET 3483 | LDC_EVT_DOWN | LDC_EVT_READ)) { 3484 3485 DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)", 3486 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 3487 } 3488 3489 vsw_cb_exit: 3490 mutex_exit(&ldcp->ldc_cblock); 3491 3492 /* 3493 * Let the drain function know we are finishing if it 3494 * is waiting. 3495 */ 3496 mutex_enter(&ldcp->drain_cv_lock); 3497 if (ldcp->drain_state == VSW_LDC_DRAINING) 3498 cv_signal(&ldcp->drain_cv); 3499 mutex_exit(&ldcp->drain_cv_lock); 3500 3501 return (LDC_SUCCESS); 3502 } 3503 3504 /* 3505 * Restart the connection with our peer. Free any existing 3506 * data structures and then attempt to bring the channel back 3507 * up.
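 *
 * Roughly:
 *
 *	free any mapped lane resources (inbound and outbound)
 *	clear lane state, fdb and multicast entries for the port
 *	reset the handshake back to VSW_MILESTONE0
 *	ldc_up()	(may legitimately fail if the peer is not ready)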
3508 */ 3509 static void 3510 vsw_restart_ldc(vsw_ldc_t *ldcp) 3511 { 3512 int rv; 3513 vsw_t *vswp = ldcp->ldc_vswp; 3514 vsw_port_t *port; 3515 vsw_ldc_list_t *ldcl; 3516 3517 D1(vswp, "%s: enter", __func__); 3518 3519 port = ldcp->ldc_port; 3520 ldcl = &port->p_ldclist; 3521 3522 READ_ENTER(&ldcl->lockrw); 3523 3524 D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__, 3525 ldcp->lane_in.lstate, ldcp->lane_out.lstate); 3526 3527 vsw_free_lane_resources(ldcp, INBOUND); 3528 vsw_free_lane_resources(ldcp, OUTBOUND); 3529 RW_EXIT(&ldcl->lockrw); 3530 3531 ldcp->lane_in.lstate = 0; 3532 ldcp->lane_out.lstate = 0; 3533 3534 /* 3535 * Remove parent port from any multicast groups 3536 * it may have registered with. Client must resend 3537 * multicast add command after handshake completes. 3538 */ 3539 (void) vsw_del_fdb(vswp, port); 3540 3541 vsw_del_mcst_port(port); 3542 3543 ldcp->peer_session = 0; 3544 ldcp->session_status = 0; 3545 ldcp->hcnt = 0; 3546 ldcp->hphase = VSW_MILESTONE0; 3547 3548 rv = ldc_up(ldcp->ldc_handle); 3549 if (rv != 0) { 3550 /* 3551 * Not a fatal error for ldc_up() to fail, as peer 3552 * end point may simply not be ready yet. 3553 */ 3554 D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__, 3555 ldcp->ldc_id, rv); 3556 } 3557 3558 D1(vswp, "%s: exit", __func__); 3559 } 3560 3561 /* 3562 * (Re)start a handshake with our peer by sending them 3563 * our version info. 3564 */ 3565 static void 3566 vsw_restart_handshake(vsw_ldc_t *ldcp) 3567 { 3568 vsw_t *vswp = ldcp->ldc_vswp; 3569 3570 D1(vswp, "vsw_restart_handshake: enter"); 3571 3572 if (ldcp->hphase != VSW_MILESTONE0) { 3573 vsw_restart_ldc(ldcp); 3574 } 3575 3576 /* 3577 * We now increment the transaction group id. This allows 3578 * us to identify and discard any tasks which are still pending 3579 * on the taskq and refer to the handshake session we are about 3580 * to restart. These stale messages no longer have any real 3581 * meaning. 3582 */ 3583 mutex_enter(&ldcp->hss_lock); 3584 ldcp->hss_id++; 3585 mutex_exit(&ldcp->hss_lock); 3586 3587 if (ldcp->hcnt++ > vsw_num_handshakes) { 3588 cmn_err(CE_WARN, "exceeded number of permitted " 3589 "handshake attempts (%d) on channel %ld", 3590 ldcp->hcnt, ldcp->ldc_id); 3591 return; 3592 } 3593 3594 if ((vswp->taskq_p == NULL) || 3595 (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp, 3596 DDI_NOSLEEP) != DDI_SUCCESS)) { 3597 cmn_err(CE_WARN, "Can't dispatch version handshake task"); 3598 } 3599 3600 D1(vswp, "vsw_restart_handshake: exit"); 3601 } 3602 3603 /* 3604 * Deal appropriately with an ECONNRESET event encountered in a ldc_* 3605 * call. 3606 */ 3607 static void 3608 vsw_handle_reset(vsw_ldc_t *ldcp) 3609 { 3610 vsw_t *vswp = ldcp->ldc_vswp; 3611 ldc_status_t lstatus; 3612 3613 D1(vswp, "%s: enter", __func__); 3614 3615 mutex_enter(&ldcp->status_lock); 3616 lstatus = ldcp->ldc_status; 3617 if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) { 3618 DERR(vswp, "%s: unable to read status for channel %ld", 3619 __func__, ldcp->ldc_id); 3620 mutex_exit(&ldcp->status_lock); 3621 return; 3622 } 3623 mutex_exit(&ldcp->status_lock); 3624 3625 /* 3626 * Check the channel's previous recorded state to 3627 * determine if this is the first ECONNRESET event 3628 * we've gotten for this particular channel (i.e. was 3629 * previously up but is no longer). If so, terminate 3630 * the channel.
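 * (ECONNRESET is returned by ldc_read() and other ldc_*() calls
 * when the channel has been reset underneath us; see the callers
 * such as vsw_process_pkt() below.)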
3631 */ 3632 if ((ldcp->ldc_status != LDC_UP) && (lstatus == LDC_UP)) { 3633 vsw_restart_ldc(ldcp); 3634 } 3635 3636 /* 3637 * vsw_restart_ldc() will also attempt to bring channel 3638 * back up. Check here if that succeeds. 3639 */ 3640 mutex_enter(&ldcp->status_lock); 3641 lstatus = ldcp->ldc_status; 3642 if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) { 3643 DERR(vswp, "%s: unable to read status for channel %ld", 3644 __func__, ldcp->ldc_id); 3645 mutex_exit(&ldcp->status_lock); 3646 return; 3647 } 3648 mutex_exit(&ldcp->status_lock); 3649 3650 /* 3651 * If channel is now up and no one else (i.e. the callback routine) 3652 * has dealt with it then we restart the handshake here. 3653 */ 3654 if ((lstatus != LDC_UP) && (ldcp->ldc_status == LDC_UP)) { 3655 vsw_restart_handshake(ldcp); 3656 } 3657 3658 D1(vswp, "%s: exit", __func__); 3659 } 3660 3661 /* 3662 * Returns 0 if it is legal for the event signified by the flag to have 3663 * occurred at the time it did. Otherwise returns 1. 3664 */ 3665 int 3666 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag) 3667 { 3668 vsw_t *vswp = ldcp->ldc_vswp; 3669 uint64_t state; 3670 uint64_t phase; 3671 3672 if (dir == INBOUND) 3673 state = ldcp->lane_in.lstate; 3674 else 3675 state = ldcp->lane_out.lstate; 3676 3677 phase = ldcp->hphase; 3678 3679 switch (flag) { 3680 case VSW_VER_INFO_RECV: 3681 if (phase > VSW_MILESTONE0) { 3682 DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV" 3683 " when in state %d\n", ldcp->ldc_id, phase); 3684 vsw_restart_handshake(ldcp); 3685 return (1); 3686 } 3687 break; 3688 3689 case VSW_VER_ACK_RECV: 3690 case VSW_VER_NACK_RECV: 3691 if (!(state & VSW_VER_INFO_SENT)) { 3692 DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK" 3693 " or VER_NACK when in state %d\n", 3694 ldcp->ldc_id, phase); 3695 vsw_restart_handshake(ldcp); 3696 return (1); 3697 } else 3698 state &= ~VSW_VER_INFO_SENT; 3699 break; 3700 3701 case VSW_ATTR_INFO_RECV: 3702 if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) { 3703 DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV" 3704 " when in state %d\n", ldcp->ldc_id, phase); 3705 vsw_restart_handshake(ldcp); 3706 return (1); 3707 } 3708 break; 3709 3710 case VSW_ATTR_ACK_RECV: 3711 case VSW_ATTR_NACK_RECV: 3712 if (!(state & VSW_ATTR_INFO_SENT)) { 3713 DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK" 3714 " or ATTR_NACK when in state %d\n", 3715 ldcp->ldc_id, phase); 3716 vsw_restart_handshake(ldcp); 3717 return (1); 3718 } else 3719 state &= ~VSW_ATTR_INFO_SENT; 3720 break; 3721 3722 case VSW_DRING_INFO_RECV: 3723 if (phase < VSW_MILESTONE1) { 3724 DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV" 3725 " when in state %d\n", ldcp->ldc_id, phase); 3726 vsw_restart_handshake(ldcp); 3727 return (1); 3728 } 3729 break; 3730 3731 case VSW_DRING_ACK_RECV: 3732 case VSW_DRING_NACK_RECV: 3733 if (!(state & VSW_DRING_INFO_SENT)) { 3734 DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK" 3735 " or DRING_NACK when in state %d\n", 3736 ldcp->ldc_id, phase); 3737 vsw_restart_handshake(ldcp); 3738 return (1); 3739 } else 3740 state &= ~VSW_DRING_INFO_SENT; 3741 break; 3742 3743 case VSW_RDX_INFO_RECV: 3744 if (phase < VSW_MILESTONE3) { 3745 DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV" 3746 " when in state %d\n", ldcp->ldc_id, phase); 3747 vsw_restart_handshake(ldcp); 3748 return (1); 3749 } 3750 break; 3751 3752 case VSW_RDX_ACK_RECV: 3753 case VSW_RDX_NACK_RECV: 3754 if (!(state & VSW_RDX_INFO_SENT)) { 3755 DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK" 3756 " or RDX_NACK when in state %d\n", 3757
ldcp->ldc_id, phase); 3758 vsw_restart_handshake(ldcp); 3759 return (1); 3760 } else 3761 state &= ~VSW_RDX_INFO_SENT; 3762 break; 3763 3764 case VSW_MCST_INFO_RECV: 3765 if (phase < VSW_MILESTONE3) { 3766 DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV" 3767 " when in state %d\n", ldcp->ldc_id, phase); 3768 vsw_restart_handshake(ldcp); 3769 return (1); 3770 } 3771 break; 3772 3773 default: 3774 DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)", 3775 ldcp->ldc_id, flag); 3776 return (1); 3777 } 3778 3779 if (dir == INBOUND) 3780 ldcp->lane_in.lstate = state; 3781 else 3782 ldcp->lane_out.lstate = state; 3783 3784 D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id); 3785 3786 return (0); 3787 } 3788 3789 void 3790 vsw_next_milestone(vsw_ldc_t *ldcp) 3791 { 3792 vsw_t *vswp = ldcp->ldc_vswp; 3793 3794 D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__, 3795 ldcp->ldc_id, ldcp->hphase); 3796 3797 DUMP_FLAGS(ldcp->lane_in.lstate); 3798 DUMP_FLAGS(ldcp->lane_out.lstate); 3799 3800 switch (ldcp->hphase) { 3801 3802 case VSW_MILESTONE0: 3803 /* 3804 * If we haven't started to handshake with our peer, 3805 * start to do so now. 3806 */ 3807 if (ldcp->lane_out.lstate == 0) { 3808 D2(vswp, "%s: (chan %lld) starting handshake " 3809 "with peer", __func__, ldcp->ldc_id); 3810 vsw_restart_handshake(ldcp); 3811 } 3812 3813 /* 3814 * Only way to pass this milestone is to have successfully 3815 * negotiated version info. 3816 */ 3817 if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) && 3818 (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) { 3819 3820 D2(vswp, "%s: (chan %lld) leaving milestone 0", 3821 __func__, ldcp->ldc_id); 3822 3823 /* 3824 * Next milestone is passed when attribute 3825 * information has been successfully exchanged. 3826 */ 3827 ldcp->hphase = VSW_MILESTONE1; 3828 vsw_send_attr(ldcp); 3829 3830 } 3831 break; 3832 3833 case VSW_MILESTONE1: 3834 /* 3835 * Only way to pass this milestone is to have successfully 3836 * negotiated attribute information. 3837 */ 3838 if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) { 3839 3840 ldcp->hphase = VSW_MILESTONE2; 3841 3842 /* 3843 * If the peer device has said it wishes to 3844 * use descriptor rings then we send it our ring 3845 * info, otherwise we just set up a private ring 3846 * which we use an internal buffer 3847 */ 3848 if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) 3849 vsw_send_dring_info(ldcp); 3850 } 3851 break; 3852 3853 3854 case VSW_MILESTONE2: 3855 /* 3856 * If peer has indicated in its attribute message that 3857 * it wishes to use descriptor rings then the only way 3858 * to pass this milestone is for us to have received 3859 * valid dring info. 3860 * 3861 * If peer is not using descriptor rings then just fall 3862 * through. 3863 */ 3864 if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) && 3865 (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))) 3866 break; 3867 3868 D2(vswp, "%s: (chan %lld) leaving milestone 2", 3869 __func__, ldcp->ldc_id); 3870 3871 ldcp->hphase = VSW_MILESTONE3; 3872 vsw_send_rdx(ldcp); 3873 break; 3874 3875 case VSW_MILESTONE3: 3876 /* 3877 * Pass this milestone when all paramaters have been 3878 * successfully exchanged and RDX sent in both directions. 3879 * 3880 * Mark outbound lane as available to transmit data. 
3881 */ 3882 if ((ldcp->lane_in.lstate & VSW_RDX_ACK_SENT) && 3883 (ldcp->lane_out.lstate & VSW_RDX_ACK_RECV)) { 3884 3885 D2(vswp, "%s: (chan %lld) leaving milestone 3", 3886 __func__, ldcp->ldc_id); 3887 D2(vswp, "%s: ** handshake complete (0x%llx : " 3888 "0x%llx) **", __func__, ldcp->lane_in.lstate, 3889 ldcp->lane_out.lstate); 3890 ldcp->lane_out.lstate |= VSW_LANE_ACTIVE; 3891 ldcp->hphase = VSW_MILESTONE4; 3892 ldcp->hcnt = 0; 3893 DISPLAY_STATE(); 3894 } else { 3895 D2(vswp, "%s: still in milestone 3 (0x%llx :" 3896 " 0x%llx", __func__, ldcp->lane_in.lstate, 3897 ldcp->lane_out.lstate); 3898 } 3899 break; 3900 3901 case VSW_MILESTONE4: 3902 D2(vswp, "%s: (chan %lld) in milestone 4", __func__, 3903 ldcp->ldc_id); 3904 break; 3905 3906 default: 3907 DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__, 3908 ldcp->ldc_id, ldcp->hphase); 3909 } 3910 3911 D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id, 3912 ldcp->hphase); 3913 } 3914 3915 /* 3916 * Check if major version is supported. 3917 * 3918 * Returns 0 if it finds a supported major number, and if necessary 3919 * adjusts the minor field. 3920 * 3921 * Returns 1 if it can't match the major number exactly. Sets major/minor 3922 * to the next lowest supported values, or to zero if no other values are 3923 * possible. 3924 */ 3925 static int 3926 vsw_supported_version(vio_ver_msg_t *vp) 3927 { 3928 int i; 3929 3930 D1(NULL, "vsw_supported_version: enter"); 3931 3932 for (i = 0; i < VSW_NUM_VER; i++) { 3933 if (vsw_versions[i].ver_major == vp->ver_major) { 3934 /* 3935 * Matching major version found. Update 3936 * minor number if necessary. 3937 */ 3938 if (vp->ver_minor > vsw_versions[i].ver_minor) { 3939 D2(NULL, "%s: adjusting minor value" 3940 " from %d to %d", __func__, 3941 vp->ver_minor, 3942 vsw_versions[i].ver_minor); 3943 vp->ver_minor = vsw_versions[i].ver_minor; 3944 } 3945 3946 return (0); 3947 } 3948 3949 if (vsw_versions[i].ver_major < vp->ver_major) { 3950 D2(NULL, "%s: adjusting to next lowest supported" 3951 " version %d.%d", __func__, 3952 vsw_versions[i].ver_major, 3953 vsw_versions[i].ver_minor); 3954 vp->ver_major = vsw_versions[i].ver_major; 3955 vp->ver_minor = vsw_versions[i].ver_minor; 3956 return (1); 3957 } 3958 } 3959 3960 /* No match was possible, zero out fields */ 3961 vp->ver_major = 0; 3962 vp->ver_minor = 0; 3963 3964 D1(NULL, "vsw_supported_version: exit"); 3965 3966 return (1); 3967 } 3968 3969 /* 3970 * Main routine for processing messages received over LDC. 3971 */ 3972 static void 3973 vsw_process_pkt(void *arg) 3974 { 3975 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; 3976 vsw_t *vswp = ldcp->ldc_vswp; 3977 size_t msglen; 3978 vio_msg_tag_t tag; 3979 def_msg_t dmsg; 3980 int rv = 0; 3981 3982 3983 D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); 3984 3985 /* 3986 * If channel is up read messages until channel is empty.
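 *
 * Each message starts with a vio_msg_tag_t; we switch on
 * tag.vio_msgtype (VIO_TYPE_CTRL / VIO_TYPE_DATA / VIO_TYPE_ERR)
 * below and hand the message to the matching handler.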
3987 */ 3988 do { 3989 msglen = sizeof (dmsg); 3990 rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen); 3991 3992 if (rv != 0) { 3993 DERR(vswp, "%s: ldc_read err id(%lld) rv(%d) " 3994 "len(%d)\n", __func__, ldcp->ldc_id, 3995 rv, msglen); 3996 } 3997 3998 /* channel has been reset */ 3999 if (rv == ECONNRESET) { 4000 vsw_handle_reset(ldcp); 4001 break; 4002 } 4003 4004 if (msglen == 0) { 4005 D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__, 4006 ldcp->ldc_id); 4007 break; 4008 } 4009 4010 D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__, 4011 ldcp->ldc_id, msglen); 4012 4013 /* 4014 * Figure out what sort of packet we have gotten by 4015 * examining the msg tag, and then switch on it appropriately. 4016 */ 4017 bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t)); 4018 4019 switch (tag.vio_msgtype) { 4020 case VIO_TYPE_CTRL: 4021 vsw_dispatch_ctrl_task(ldcp, &dmsg, tag); 4022 break; 4023 case VIO_TYPE_DATA: 4024 vsw_process_data_pkt(ldcp, &dmsg, tag); 4025 break; 4026 case VIO_TYPE_ERR: 4027 vsw_process_err_pkt(ldcp, &dmsg, tag); 4028 break; 4029 default: 4030 DERR(vswp, "%s: Unknown tag(%lx) " 4031 "id(%lx)\n", __func__, tag.vio_msgtype, ldcp->ldc_id); 4032 break; 4033 } 4034 } while (msglen); 4035 4036 D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id); 4037 } 4038 4039 /* 4040 * Dispatch a task to process a VIO control message. 4041 */ 4042 static void 4043 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag) 4044 { 4045 vsw_ctrl_task_t *ctaskp = NULL; 4046 vsw_port_t *port = ldcp->ldc_port; 4047 vsw_t *vswp = port->p_vswp; 4048 4049 D1(vswp, "%s: enter", __func__); 4050 4051 /* 4052 * We need to handle RDX ACK messages in-band as once they 4053 * are exchanged it is possible that we will get an 4054 * immediate (legitimate) data packet. 4055 */ 4056 if ((tag.vio_subtype_env == VIO_RDX) && 4057 (tag.vio_subtype == VIO_SUBTYPE_ACK)) { 4058 4059 if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_ACK_RECV)) 4060 return; 4061 4062 ldcp->lane_out.lstate |= VSW_RDX_ACK_RECV; 4063 D2(vswp, "%s (%ld) handling RDX_ACK in place " 4064 "(ostate 0x%llx : hphase %d)", __func__, 4065 ldcp->ldc_id, ldcp->lane_out.lstate, ldcp->hphase); 4066 vsw_next_milestone(ldcp); 4067 return; 4068 } 4069 4070 ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP); 4071 4072 if (ctaskp == NULL) { 4073 DERR(vswp, "%s: unable to alloc space for ctrl" 4074 " msg", __func__); 4075 vsw_restart_handshake(ldcp); 4076 return; 4077 } 4078 4079 ctaskp->ldcp = ldcp; 4080 bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t)); 4081 mutex_enter(&ldcp->hss_lock); 4082 ctaskp->hss_id = ldcp->hss_id; 4083 mutex_exit(&ldcp->hss_lock); 4084 4085 /* 4086 * Dispatch task to processing taskq if port is not in 4087 * the process of being detached.
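 * (port->state is sampled under state_lock; tasks are only
 * dispatched while the port is still in the VSW_PORT_INIT state.)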
4088 */ 4089 mutex_enter(&port->state_lock); 4090 if (port->state == VSW_PORT_INIT) { 4091 if ((vswp->taskq_p == NULL) || 4092 (ddi_taskq_dispatch(vswp->taskq_p, 4093 vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP) 4094 != DDI_SUCCESS)) { 4095 DERR(vswp, "%s: unable to dispatch task to taskq", 4096 __func__); 4097 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 4098 mutex_exit(&port->state_lock); 4099 vsw_restart_handshake(ldcp); 4100 return; 4101 } 4102 } else { 4103 DWARN(vswp, "%s: port %d detaching, not dispatching " 4104 "task", __func__, port->p_instance); kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 4105 } 4106 4107 mutex_exit(&port->state_lock); 4108 4109 D2(vswp, "%s: dispatched task to taskq for chan %d", __func__, 4110 ldcp->ldc_id); 4111 D1(vswp, "%s: exit", __func__); 4112 } 4113 4114 /* 4115 * Process a VIO ctrl message. Invoked from taskq. 4116 */ 4117 static void 4118 vsw_process_ctrl_pkt(void *arg) 4119 { 4120 vsw_ctrl_task_t *ctaskp = (vsw_ctrl_task_t *)arg; 4121 vsw_ldc_t *ldcp = ctaskp->ldcp; 4122 vsw_t *vswp = ldcp->ldc_vswp; 4123 vio_msg_tag_t tag; 4124 uint16_t env; 4125 4126 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4127 4128 bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t)); 4129 env = tag.vio_subtype_env; 4130 4131 /* stale pkt check */ 4132 mutex_enter(&ldcp->hss_lock); 4133 if (ctaskp->hss_id < ldcp->hss_id) { 4134 DWARN(vswp, "%s: discarding stale packet belonging to" 4135 " earlier (%ld) handshake session", __func__, 4136 ctaskp->hss_id); 4137 mutex_exit(&ldcp->hss_lock); kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 4138 return; 4139 } 4140 mutex_exit(&ldcp->hss_lock); 4141 4142 /* session id check */ 4143 if (ldcp->session_status & VSW_PEER_SESSION) { 4144 if (ldcp->peer_session != tag.vio_sid) { 4145 DERR(vswp, "%s (chan %d): invalid session id (%llx)", 4146 __func__, ldcp->ldc_id, tag.vio_sid); 4147 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 4148 vsw_restart_handshake(ldcp); 4149 return; 4150 } 4151 } 4152 4153 /* 4154 * Switch on vio_subtype envelope, then let lower routines 4155 * decide if it's an INFO, ACK or NACK packet. 4156 */ 4157 switch (env) { 4158 case VIO_VER_INFO: 4159 vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp); 4160 break; 4161 case VIO_DRING_REG: 4162 vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp); 4163 break; 4164 case VIO_DRING_UNREG: 4165 vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp); 4166 break; 4167 case VIO_ATTR_INFO: 4168 vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp); 4169 break; 4170 case VNET_MCAST_INFO: 4171 vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp); 4172 break; 4173 case VIO_RDX: 4174 vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp); 4175 break; 4176 default: 4177 DERR(vswp, "%s : unknown vio_subtype_env (%x)\n", 4178 __func__, env); 4179 } 4180 4181 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 4182 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 4183 } 4184 4185 /* 4186 * Version negotiation. We can end up here either because our peer 4187 * has responded to a handshake message we have sent it, or our peer 4188 * has initiated a handshake with us. If it's the former then it can only 4189 * be ACK or NACK; if it's the latter it can only be INFO. 4190 * 4191 * If it's an ACK we move to the next stage of the handshake, namely 4192 * attribute exchange. If it's a NACK we see if we can specify another 4193 * version; if we can't we stop. 4194 * 4195 * If it is an INFO we reset all params associated with communication 4196 * in that direction over this channel (remember connection is 4197 * essentially 2 independent simplex channels).
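 *
 * Worked example (illustrative version numbers only): if the peer
 * INFOs version 2.0 but the highest entry in vsw_versions[] is
 * 1.3, we NACK back with 1.3; a peer which supports 1.3 can then
 * re-INFO with 1.3, which we ACK, completing version negotiation.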
4198 */ 4199 void 4200 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt) 4201 { 4202 vio_ver_msg_t *ver_pkt; 4203 vsw_t *vswp = ldcp->ldc_vswp; 4204 4205 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4206 4207 /* 4208 * We know this is a ctrl/version packet so 4209 * cast it into the correct structure. 4210 */ 4211 ver_pkt = (vio_ver_msg_t *)pkt; 4212 4213 switch (ver_pkt->tag.vio_subtype) { 4214 case VIO_SUBTYPE_INFO: 4215 D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n"); 4216 4217 /* 4218 * Record the session id, which we will use from now 4219 * until we see another VER_INFO msg. Even then the 4220 * session id in most cases will be unchanged, except 4221 * if channel was reset. 4222 */ 4223 if ((ldcp->session_status & VSW_PEER_SESSION) && 4224 (ldcp->peer_session != ver_pkt->tag.vio_sid)) { 4225 DERR(vswp, "%s: updating session id for chan %lld " 4226 "from %llx to %llx", __func__, ldcp->ldc_id, 4227 ldcp->peer_session, ver_pkt->tag.vio_sid); 4228 } 4229 4230 ldcp->peer_session = ver_pkt->tag.vio_sid; 4231 ldcp->session_status |= VSW_PEER_SESSION; 4232 4233 /* Legal message at this time ? */ 4234 if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV)) 4235 return; 4236 4237 /* 4238 * First check the device class. Currently we only expect 4239 * to be talking to a network device. In the future we may 4240 * also talk to another switch. 4241 */ 4242 if (ver_pkt->dev_class != VDEV_NETWORK) { 4243 DERR(vswp, "%s: illegal device class %d", __func__, 4244 ver_pkt->dev_class); 4245 4246 ver_pkt->tag.vio_sid = ldcp->local_session; 4247 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4248 4249 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); 4250 4251 vsw_send_msg(ldcp, (void *)ver_pkt, 4252 sizeof (vio_ver_msg_t)); 4253 4254 ldcp->lane_in.lstate |= VSW_VER_NACK_SENT; 4255 vsw_next_milestone(ldcp); 4256 return; 4257 } else { 4258 ldcp->dev_class = ver_pkt->dev_class; 4259 } 4260 4261 /* 4262 * Now check the version. 4263 */ 4264 if (vsw_supported_version(ver_pkt) == 0) { 4265 /* 4266 * Support this major version and a possibly 4267 * adjusted minor version. 4268 */ 4269 4270 D2(vswp, "%s: accepted ver %d:%d", __func__, 4271 ver_pkt->ver_major, ver_pkt->ver_minor); 4272 4273 /* Store accepted values */ 4274 ldcp->lane_in.ver_major = ver_pkt->ver_major; 4275 ldcp->lane_in.ver_minor = ver_pkt->ver_minor; 4276 4277 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4278 4279 ldcp->lane_in.lstate |= VSW_VER_ACK_SENT; 4280 } else { 4281 /* 4282 * NACK back with the next lower major/minor 4283 * pairing we support (if we don't support any more 4284 * versions then they will be set to zero).
	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
			return;

		/* Store the accepted values for the outbound lane */
		ldcp->lane_out.ver_major = ver_pkt->ver_major;
		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;

		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
		vsw_next_milestone(ldcp);

		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
			return;

		/*
		 * If our peer sent us a NACK with the ver fields set to
		 * zero then there is nothing more we can do. Otherwise see
		 * if we support either the version suggested, or a lesser
		 * one.
		 */
		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
			DERR(vswp, "%s: peer unable to negotiate any "
			    "further.", __func__);
			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
			vsw_next_milestone(ldcp);
			return;
		}

		/*
		 * Check to see if we support this major version or
		 * a lower one. If we don't then maj/min will be set
		 * to zero.
		 */
		(void) vsw_supported_version(ver_pkt);
		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
			/* Nothing more we can do */
			DERR(vswp, "%s: version negotiation failed.\n",
			    __func__);
			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
			vsw_next_milestone(ldcp);
		} else {
			/* found a supported major version */
			ldcp->lane_out.ver_major = ver_pkt->ver_major;
			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;

			D2(vswp, "%s: resending with updated values (%x, %x)",
			    __func__, ver_pkt->ver_major,
			    ver_pkt->ver_minor);

			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
			ver_pkt->tag.vio_sid = ldcp->local_session;
			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;

			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);

			vsw_send_msg(ldcp, (void *)ver_pkt,
			    sizeof (vio_ver_msg_t));

			vsw_next_milestone(ldcp);
		}
		break;

	default:
		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
		    ver_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
}

/*
 * Process an attribute packet. We can end up here either because our peer
 * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or because
 * our peer has sent us an attribute INFO message.
 *
 * If it is an ACK we then move to the next stage of the handshake, which
 * is to send our descriptor ring info to our peer. If it is a NACK then
 * there is nothing more we can (currently) do.
 *
 * If we get a valid/acceptable INFO packet (and we have already negotiated
 * a version) we ACK back and set the channel state to ATTR_RECV; otherwise
 * we NACK back and reset the channel state to INACTIV.
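 *
 * (For example: a peer INFO carries mtu, addr, addr_type, xfer_mode
 * and ack_freq. If vsw_check_attr() accepts them they are recorded in
 * lane_in, the port's unicast MAC address is rebuilt from the low six
 * bytes of 'addr', and the same message is echoed back with its
 * subtype switched to ACK.)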
4397 * 4398 * FUTURE: in time we will probably negotiate over attributes, but for 4399 * the moment unacceptable attributes are regarded as a fatal error. 4400 * 4401 */ 4402 void 4403 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt) 4404 { 4405 vnet_attr_msg_t *attr_pkt; 4406 vsw_t *vswp = ldcp->ldc_vswp; 4407 vsw_port_t *port = ldcp->ldc_port; 4408 uint64_t macaddr = 0; 4409 int i; 4410 4411 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 4412 4413 /* 4414 * We know this is a ctrl/attr packet so 4415 * cast it into the correct structure. 4416 */ 4417 attr_pkt = (vnet_attr_msg_t *)pkt; 4418 4419 switch (attr_pkt->tag.vio_subtype) { 4420 case VIO_SUBTYPE_INFO: 4421 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 4422 4423 if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) 4424 return; 4425 4426 /* 4427 * If the attributes are unacceptable then we NACK back. 4428 */ 4429 if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) { 4430 4431 DERR(vswp, "%s (chan %d): invalid attributes", 4432 __func__, ldcp->ldc_id); 4433 4434 vsw_free_lane_resources(ldcp, INBOUND); 4435 4436 attr_pkt->tag.vio_sid = ldcp->local_session; 4437 attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4438 4439 DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt); 4440 ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT; 4441 vsw_send_msg(ldcp, (void *)attr_pkt, 4442 sizeof (vnet_attr_msg_t)); 4443 4444 vsw_next_milestone(ldcp); 4445 return; 4446 } 4447 4448 /* 4449 * Otherwise store attributes for this lane and update 4450 * lane state. 4451 */ 4452 ldcp->lane_in.mtu = attr_pkt->mtu; 4453 ldcp->lane_in.addr = attr_pkt->addr; 4454 ldcp->lane_in.addr_type = attr_pkt->addr_type; 4455 ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode; 4456 ldcp->lane_in.ack_freq = attr_pkt->ack_freq; 4457 4458 macaddr = ldcp->lane_in.addr; 4459 for (i = ETHERADDRL - 1; i >= 0; i--) { 4460 port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF; 4461 macaddr >>= 8; 4462 } 4463 4464 /* create the fdb entry for this port/mac address */ 4465 (void) vsw_add_fdb(vswp, port); 4466 4467 /* setup device specifc xmit routines */ 4468 mutex_enter(&port->tx_lock); 4469 if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) { 4470 D2(vswp, "%s: mode = VIO_DRING_MODE", __func__); 4471 port->transmit = vsw_dringsend; 4472 } else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) { 4473 D2(vswp, "%s: mode = VIO_DESC_MODE", __func__); 4474 vsw_create_privring(ldcp); 4475 port->transmit = vsw_descrsend; 4476 } 4477 mutex_exit(&port->tx_lock); 4478 4479 attr_pkt->tag.vio_sid = ldcp->local_session; 4480 attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4481 4482 DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt); 4483 4484 ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT; 4485 4486 vsw_send_msg(ldcp, (void *)attr_pkt, 4487 sizeof (vnet_attr_msg_t)); 4488 4489 vsw_next_milestone(ldcp); 4490 break; 4491 4492 case VIO_SUBTYPE_ACK: 4493 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 4494 4495 if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV)) 4496 return; 4497 4498 ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV; 4499 vsw_next_milestone(ldcp); 4500 break; 4501 4502 case VIO_SUBTYPE_NACK: 4503 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 4504 4505 if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV)) 4506 return; 4507 4508 ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV; 4509 vsw_next_milestone(ldcp); 4510 break; 4511 4512 default: 4513 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 4514 attr_pkt->tag.vio_subtype); 4515 } 4516 4517 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 4518 } 4519 4520 /* 4521 * Process a dring info packet. 
We can end up here either because our peer 4522 * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our 4523 * peer has sent us a dring INFO message. 4524 * 4525 * If we get a valid/acceptable INFO packet (and we have already negotiated 4526 * a version) we ACK back and update the lane state, otherwise we NACK back. 4527 * 4528 * FUTURE: nothing to stop client from sending us info on multiple dring's 4529 * but for the moment we will just use the first one we are given. 4530 * 4531 */ 4532 void 4533 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt) 4534 { 4535 vio_dring_reg_msg_t *dring_pkt; 4536 vsw_t *vswp = ldcp->ldc_vswp; 4537 ldc_mem_info_t minfo; 4538 dring_info_t *dp, *dbp; 4539 int dring_found = 0; 4540 4541 /* 4542 * We know this is a ctrl/dring packet so 4543 * cast it into the correct structure. 4544 */ 4545 dring_pkt = (vio_dring_reg_msg_t *)pkt; 4546 4547 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 4548 4549 switch (dring_pkt->tag.vio_subtype) { 4550 case VIO_SUBTYPE_INFO: 4551 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 4552 4553 if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV)) 4554 return; 4555 4556 /* 4557 * If the dring params are unacceptable then we NACK back. 4558 */ 4559 if (vsw_check_dring_info(dring_pkt)) { 4560 4561 DERR(vswp, "%s (%lld): invalid dring info", 4562 __func__, ldcp->ldc_id); 4563 4564 vsw_free_lane_resources(ldcp, INBOUND); 4565 4566 dring_pkt->tag.vio_sid = ldcp->local_session; 4567 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4568 4569 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 4570 4571 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 4572 4573 vsw_send_msg(ldcp, (void *)dring_pkt, 4574 sizeof (vio_dring_reg_msg_t)); 4575 4576 vsw_next_milestone(ldcp); 4577 return; 4578 } 4579 4580 /* 4581 * Otherwise, attempt to map in the dring using the 4582 * cookie. If that succeeds we send back a unique dring 4583 * identifier that the sending side will use in future 4584 * to refer to this descriptor ring. 4585 */ 4586 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 4587 4588 dp->num_descriptors = dring_pkt->num_descriptors; 4589 dp->descriptor_size = dring_pkt->descriptor_size; 4590 dp->options = dring_pkt->options; 4591 dp->ncookies = dring_pkt->ncookies; 4592 4593 /* 4594 * Note: should only get one cookie. Enforced in 4595 * the ldc layer. 
4596 */ 4597 bcopy(&dring_pkt->cookie[0], &dp->cookie[0], 4598 sizeof (ldc_mem_cookie_t)); 4599 4600 D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__, 4601 dp->num_descriptors, dp->descriptor_size); 4602 D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__, 4603 dp->options, dp->ncookies); 4604 4605 if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0], 4606 dp->ncookies, dp->num_descriptors, 4607 dp->descriptor_size, LDC_SHADOW_MAP, 4608 &(dp->handle))) != 0) { 4609 4610 DERR(vswp, "%s: dring_map failed\n", __func__); 4611 4612 kmem_free(dp, sizeof (dring_info_t)); 4613 vsw_free_lane_resources(ldcp, INBOUND); 4614 4615 dring_pkt->tag.vio_sid = ldcp->local_session; 4616 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4617 4618 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 4619 4620 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 4621 vsw_send_msg(ldcp, (void *)dring_pkt, 4622 sizeof (vio_dring_reg_msg_t)); 4623 4624 vsw_next_milestone(ldcp); 4625 return; 4626 } 4627 4628 if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) { 4629 4630 DERR(vswp, "%s: dring_addr failed\n", __func__); 4631 4632 kmem_free(dp, sizeof (dring_info_t)); 4633 vsw_free_lane_resources(ldcp, INBOUND); 4634 4635 dring_pkt->tag.vio_sid = ldcp->local_session; 4636 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4637 4638 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 4639 4640 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 4641 vsw_send_msg(ldcp, (void *)dring_pkt, 4642 sizeof (vio_dring_reg_msg_t)); 4643 4644 vsw_next_milestone(ldcp); 4645 return; 4646 } else { 4647 /* store the address of the pub part of ring */ 4648 dp->pub_addr = minfo.vaddr; 4649 } 4650 4651 /* no private section as we are importing */ 4652 dp->priv_addr = NULL; 4653 4654 /* 4655 * Using simple mono increasing int for ident at 4656 * the moment. 4657 */ 4658 dp->ident = ldcp->next_ident; 4659 ldcp->next_ident++; 4660 4661 dp->end_idx = 0; 4662 dp->next = NULL; 4663 4664 /* 4665 * Link it onto the end of the list of drings 4666 * for this lane. 4667 */ 4668 if (ldcp->lane_in.dringp == NULL) { 4669 D2(vswp, "%s: adding first INBOUND dring", __func__); 4670 ldcp->lane_in.dringp = dp; 4671 } else { 4672 dbp = ldcp->lane_in.dringp; 4673 4674 while (dbp->next != NULL) 4675 dbp = dbp->next; 4676 4677 dbp->next = dp; 4678 } 4679 4680 /* acknowledge it */ 4681 dring_pkt->tag.vio_sid = ldcp->local_session; 4682 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4683 dring_pkt->dring_ident = dp->ident; 4684 4685 vsw_send_msg(ldcp, (void *)dring_pkt, 4686 sizeof (vio_dring_reg_msg_t)); 4687 4688 ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT; 4689 vsw_next_milestone(ldcp); 4690 break; 4691 4692 case VIO_SUBTYPE_ACK: 4693 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 4694 4695 if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV)) 4696 return; 4697 4698 /* 4699 * Peer is acknowledging our dring info and will have 4700 * sent us a dring identifier which we will use to 4701 * refer to this ring w.r.t. our peer. 4702 */ 4703 dp = ldcp->lane_out.dringp; 4704 if (dp != NULL) { 4705 /* 4706 * Find the ring this ident should be associated 4707 * with. 
 */
			while (dp != NULL) {
				if (vsw_dring_match(dp, dring_pkt)) {
					dring_found = 1;
					break;
				}
				dp = dp->next;
			}

			if (dring_found == 0) {
				DERR(vswp, "%s: unrecognised ring cookie",
				    __func__);
				vsw_restart_handshake(ldcp);
				return;
			}

		} else {
			DERR(vswp, "%s: DRING ACK received but no drings "
			    "allocated", __func__);
			vsw_restart_handshake(ldcp);
			return;
		}

		/* store ident */
		dp->ident = dring_pkt->dring_ident;
		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
			return;

		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	default:
		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
		    dring_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}

/*
 * Process a request from peer to unregister a dring.
 *
 * For the moment we just restart the handshake if our
 * peer endpoint attempts to unregister a dring.
 */
void
vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vsw_t			*vswp = ldcp->ldc_vswp;
	vio_dring_unreg_msg_t	*dring_pkt;

	/*
	 * We know this is a ctrl/dring packet so
	 * cast it into the correct structure.
	 */
	dring_pkt = (vio_dring_unreg_msg_t *)pkt;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	switch (dring_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		DWARN(vswp, "%s: restarting handshake..", __func__);
		vsw_restart_handshake(ldcp);
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		DWARN(vswp, "%s: restarting handshake..", __func__);
		vsw_restart_handshake(ldcp);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		DWARN(vswp, "%s: restarting handshake..", __func__);
		vsw_restart_handshake(ldcp);
		break;

	default:
		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
		    dring_pkt->tag.vio_subtype);
		vsw_restart_handshake(ldcp);
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

#define	SND_MCST_NACK(ldcp, pkt) \
	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
	pkt->tag.vio_sid = ldcp->local_session; \
	vsw_send_msg(ldcp, (void *)pkt, sizeof (vnet_mcast_msg_t));

/*
 * Process a multicast request from a vnet.
 *
 * Vnets specify a multicast address that they are interested in. This
 * address is used as a key into the hash table which forms the multicast
 * forwarding database (mFDB).
 *
 * The table keys are the multicast addresses, while the table entries
 * are pointers to lists of ports which wish to receive packets for the
 * specified multicast address.
 *
 * When a multicast packet is being switched we use the address as a key
 * into the hash table, and then walk the appropriate port list forwarding
 * the pkt to each port in turn.
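 *
 * (For example, purely illustratively: if vnet ports A and B have both
 * registered interest in 01:00:5E:01:02:03, the mFDB entry keyed on
 * that address holds the port list {A, B} and a frame switched to that
 * address is duplicated to both ports; once B rescinds its interest,
 * or is deleted, only A remains on the list.)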
4828 * 4829 * If a vnet is no longer interested in a particular multicast grouping 4830 * we simply find the correct location in the hash table and then delete 4831 * the relevant port from the port list. 4832 * 4833 * To deal with the case whereby a port is being deleted without first 4834 * removing itself from the lists in the hash table, we maintain a list 4835 * of multicast addresses the port has registered an interest in, within 4836 * the port structure itself. We then simply walk that list of addresses 4837 * using them as keys into the hash table and remove the port from the 4838 * appropriate lists. 4839 */ 4840 static void 4841 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt) 4842 { 4843 vnet_mcast_msg_t *mcst_pkt; 4844 vsw_port_t *port = ldcp->ldc_port; 4845 vsw_t *vswp = ldcp->ldc_vswp; 4846 int i; 4847 4848 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4849 4850 /* 4851 * We know this is a ctrl/mcast packet so 4852 * cast it into the correct structure. 4853 */ 4854 mcst_pkt = (vnet_mcast_msg_t *)pkt; 4855 4856 switch (mcst_pkt->tag.vio_subtype) { 4857 case VIO_SUBTYPE_INFO: 4858 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 4859 4860 /* 4861 * Check if in correct state to receive a multicast 4862 * message (i.e. handshake complete). If not reset 4863 * the handshake. 4864 */ 4865 if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV)) 4866 return; 4867 4868 /* 4869 * Before attempting to add or remove address check 4870 * that they are valid multicast addresses. 4871 * If not, then NACK back. 4872 */ 4873 for (i = 0; i < mcst_pkt->count; i++) { 4874 if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) { 4875 DERR(vswp, "%s: invalid multicast address", 4876 __func__); 4877 SND_MCST_NACK(ldcp, mcst_pkt); 4878 return; 4879 } 4880 } 4881 4882 /* 4883 * Now add/remove the addresses. If this fails we 4884 * NACK back. 4885 */ 4886 if (vsw_add_rem_mcst(mcst_pkt, port) != 0) { 4887 SND_MCST_NACK(ldcp, mcst_pkt); 4888 return; 4889 } 4890 4891 mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4892 mcst_pkt->tag.vio_sid = ldcp->local_session; 4893 4894 DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt); 4895 4896 vsw_send_msg(ldcp, (void *)mcst_pkt, 4897 sizeof (vnet_mcast_msg_t)); 4898 break; 4899 4900 case VIO_SUBTYPE_ACK: 4901 DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 4902 4903 /* 4904 * We shouldn't ever get a multicast ACK message as 4905 * at the moment we never request multicast addresses 4906 * to be set on some other device. This may change in 4907 * the future if we have cascading switches. 4908 */ 4909 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV)) 4910 return; 4911 4912 /* Do nothing */ 4913 break; 4914 4915 case VIO_SUBTYPE_NACK: 4916 DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 4917 4918 /* 4919 * We shouldn't get a multicast NACK packet for the 4920 * same reasons as we shouldn't get a ACK packet. 4921 */ 4922 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV)) 4923 return; 4924 4925 /* Do nothing */ 4926 break; 4927 4928 default: 4929 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 4930 mcst_pkt->tag.vio_subtype); 4931 } 4932 4933 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 4934 } 4935 4936 static void 4937 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt) 4938 { 4939 vio_rdx_msg_t *rdx_pkt; 4940 vsw_t *vswp = ldcp->ldc_vswp; 4941 4942 /* 4943 * We know this is a ctrl/rdx packet so 4944 * cast it into the correct structure. 
4945 */ 4946 rdx_pkt = (vio_rdx_msg_t *)pkt; 4947 4948 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 4949 4950 switch (rdx_pkt->tag.vio_subtype) { 4951 case VIO_SUBTYPE_INFO: 4952 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 4953 4954 if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_INFO_RECV)) 4955 return; 4956 4957 rdx_pkt->tag.vio_sid = ldcp->local_session; 4958 rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4959 4960 DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt); 4961 4962 ldcp->lane_in.lstate |= VSW_RDX_ACK_SENT; 4963 4964 vsw_send_msg(ldcp, (void *)rdx_pkt, 4965 sizeof (vio_rdx_msg_t)); 4966 4967 vsw_next_milestone(ldcp); 4968 break; 4969 4970 case VIO_SUBTYPE_ACK: 4971 /* 4972 * Should be handled in-band by callback handler. 4973 */ 4974 DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__); 4975 vsw_restart_handshake(ldcp); 4976 break; 4977 4978 case VIO_SUBTYPE_NACK: 4979 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 4980 4981 if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_NACK_RECV)) 4982 return; 4983 4984 ldcp->lane_out.lstate |= VSW_RDX_NACK_RECV; 4985 vsw_next_milestone(ldcp); 4986 break; 4987 4988 default: 4989 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 4990 rdx_pkt->tag.vio_subtype); 4991 } 4992 4993 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 4994 } 4995 4996 static void 4997 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag) 4998 { 4999 uint16_t env = tag.vio_subtype_env; 5000 vsw_t *vswp = ldcp->ldc_vswp; 5001 5002 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5003 5004 /* session id check */ 5005 if (ldcp->session_status & VSW_PEER_SESSION) { 5006 if (ldcp->peer_session != tag.vio_sid) { 5007 DERR(vswp, "%s (chan %d): invalid session id (%llx)", 5008 __func__, ldcp->ldc_id, tag.vio_sid); 5009 vsw_restart_handshake(ldcp); 5010 return; 5011 } 5012 } 5013 5014 /* 5015 * It is an error for us to be getting data packets 5016 * before the handshake has completed. 5017 */ 5018 if (ldcp->hphase != VSW_MILESTONE4) { 5019 DERR(vswp, "%s: got data packet before handshake complete " 5020 "hphase %d (%x: %x)", __func__, ldcp->hphase, 5021 ldcp->lane_in.lstate, ldcp->lane_out.lstate); 5022 DUMP_FLAGS(ldcp->lane_in.lstate); 5023 DUMP_FLAGS(ldcp->lane_out.lstate); 5024 vsw_restart_handshake(ldcp); 5025 return; 5026 } 5027 5028 /* 5029 * Switch on vio_subtype envelope, then let lower routines 5030 * decide if its an INFO, ACK or NACK packet. 
 */
	if (env == VIO_DRING_DATA) {
		vsw_process_data_dring_pkt(ldcp, dpkt);
	} else if (env == VIO_PKT_DATA) {
		vsw_process_data_raw_pkt(ldcp, dpkt);
	} else if (env == VIO_DESC_DATA) {
		vsw_process_data_ibnd_pkt(ldcp, dpkt);
	} else {
		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
		    __func__, env);
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

#define	SND_DRING_NACK(ldcp, pkt) \
	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
	pkt->tag.vio_sid = ldcp->local_session; \
	vsw_send_msg(ldcp, (void *)pkt, sizeof (vio_dring_msg_t));

static void
vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
{
	vio_dring_msg_t		*dring_pkt;
	vnet_public_desc_t	*pub_addr = NULL;
	vsw_private_desc_t	*priv_addr = NULL;
	dring_info_t		*dp = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	mblk_t			*mp = NULL;
	mblk_t			*bp = NULL;
	mblk_t			*bpt = NULL;
	size_t			nbytes = 0;
	size_t			off = 0;
	uint64_t		ncookies = 0;
	uint64_t		chain = 0;
	uint64_t		j, len;
	uint32_t		pos, start, datalen;
	uint32_t		range_start, range_end;
	int32_t			end, num, cnt = 0;
	int			i, rv;
	boolean_t		ack_needed = B_FALSE;
	boolean_t		prev_desc_ack = B_FALSE;
	int			read_attempts = 0;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a data/dring packet so
	 * cast it into the correct structure.
	 */
	dring_pkt = (vio_dring_msg_t *)dpkt;

	/*
	 * Switch on the vio_subtype. If it is INFO then we need to
	 * process the data. If it is an ACK we need to make sure
	 * it makes sense (i.e. did we send an earlier data/info),
	 * and if it is a NACK then we may attempt a retry.
	 */
	switch (dring_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);

		if ((dp = vsw_ident2dring(&ldcp->lane_in,
		    dring_pkt->dring_ident)) == NULL) {

			DERR(vswp, "%s(%lld): unable to find dring from "
			    "ident 0x%llx", __func__, ldcp->ldc_id,
			    dring_pkt->dring_ident);

			SND_DRING_NACK(ldcp, dring_pkt);
			return;
		}

		start = pos = dring_pkt->start_idx;
		end = dring_pkt->end_idx;
		len = dp->num_descriptors;

		range_start = range_end = pos;

		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
		    __func__, ldcp->ldc_id, start, end);

		if (end == -1) {
			num = -1;
		} else if (end >= 0) {
			num = end >= pos ?
			    end - pos + 1 : (len - pos + 1) + end;

			/* basic sanity check */
			if (end > len) {
				DERR(vswp, "%s(%lld): endpoint %lld outside "
				    "ring length %lld", __func__,
				    ldcp->ldc_id, end, len);

				SND_DRING_NACK(ldcp, dring_pkt);
				return;
			}
		} else {
			DERR(vswp, "%s(%lld): invalid endpoint %lld",
			    __func__, ldcp->ldc_id, end);
			SND_DRING_NACK(ldcp, dring_pkt);
			return;
		}

		while (cnt != num) {
vsw_recheck_desc:
			if ((rv = ldc_mem_dring_acquire(dp->handle,
			    pos, pos)) != 0) {
				DERR(vswp, "%s(%lld): unable to acquire "
				    "descriptor at pos %d: err %d",
				    __func__, ldcp->ldc_id, pos, rv);
				SND_DRING_NACK(ldcp, dring_pkt);
				return;
			}

			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;

			/*
			 * When given a bounded range of descriptors
			 * to process, it is an error to hit a descriptor
			 * which is not ready.
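			 * (For example, on an 8-entry ring a bounded
			 * INFO with start_idx 5 and end_idx 2 covers the
			 * six descriptors 5,6,7,0,1,2, i.e.
			 * num = (len - pos + 1) + end = 4 + 2 = 6, and
			 * every one of them must be READY.)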
In the non-bounded case 5152 * (end_idx == -1) this simply indicates we have 5153 * reached the end of the current active range. 5154 */ 5155 if (pub_addr->hdr.dstate != VIO_DESC_READY) { 5156 /* unbound - no error */ 5157 if (end == -1) { 5158 if (read_attempts == vsw_read_attempts) 5159 break; 5160 5161 delay(drv_usectohz(vsw_desc_delay)); 5162 read_attempts++; 5163 goto vsw_recheck_desc; 5164 } 5165 5166 /* bounded - error - so NACK back */ 5167 DERR(vswp, "%s(%lld): descriptor not READY " 5168 "(%d)", __func__, ldcp->ldc_id, 5169 pub_addr->hdr.dstate); 5170 SND_DRING_NACK(ldcp, dring_pkt); 5171 return; 5172 } 5173 5174 DTRACE_PROBE1(read_attempts, int, read_attempts); 5175 5176 range_end = pos; 5177 5178 /* 5179 * If we ACK'd the previous descriptor then now 5180 * record the new range start position for later 5181 * ACK's. 5182 */ 5183 if (prev_desc_ack) { 5184 range_start = pos; 5185 5186 D2(vswp, "%s(%lld): updating range start " 5187 "to be %d", __func__, ldcp->ldc_id, 5188 range_start); 5189 5190 prev_desc_ack = B_FALSE; 5191 } 5192 5193 /* 5194 * Data is padded to align on 8 byte boundary, 5195 * datalen is actual data length, i.e. minus that 5196 * padding. 5197 */ 5198 datalen = pub_addr->nbytes; 5199 5200 /* 5201 * Does peer wish us to ACK when we have finished 5202 * with this descriptor ? 5203 */ 5204 if (pub_addr->hdr.ack) 5205 ack_needed = B_TRUE; 5206 5207 D2(vswp, "%s(%lld): processing desc %lld at pos" 5208 " 0x%llx : dstate 0x%lx : datalen 0x%lx", 5209 __func__, ldcp->ldc_id, pos, pub_addr, 5210 pub_addr->hdr.dstate, datalen); 5211 5212 /* 5213 * Mark that we are starting to process descriptor. 5214 */ 5215 pub_addr->hdr.dstate = VIO_DESC_ACCEPTED; 5216 5217 mp = vio_allocb(ldcp->rxh); 5218 if (mp == NULL) { 5219 /* 5220 * No free receive buffers available, so 5221 * fallback onto allocb(9F). Make sure that 5222 * we get a data buffer which is a multiple 5223 * of 8 as this is required by ldc_mem_copy. 5224 */ 5225 DTRACE_PROBE(allocb); 5226 mp = allocb(datalen + VNET_IPALIGN + 8, 5227 BPRI_MED); 5228 } 5229 5230 /* 5231 * Ensure that we ask ldc for an aligned 5232 * number of bytes. 
5233 */ 5234 nbytes = datalen + VNET_IPALIGN; 5235 if (nbytes & 0x7) { 5236 off = 8 - (nbytes & 0x7); 5237 nbytes += off; 5238 } 5239 5240 ncookies = pub_addr->ncookies; 5241 rv = ldc_mem_copy(ldcp->ldc_handle, 5242 (caddr_t)mp->b_rptr, 0, &nbytes, 5243 pub_addr->memcookie, ncookies, 5244 LDC_COPY_IN); 5245 5246 if (rv != 0) { 5247 DERR(vswp, "%s(%d): unable to copy in " 5248 "data from %d cookies in desc %d" 5249 " (rv %d)", __func__, ldcp->ldc_id, 5250 ncookies, pos, rv); 5251 freemsg(mp); 5252 5253 pub_addr->hdr.dstate = VIO_DESC_DONE; 5254 (void) ldc_mem_dring_release(dp->handle, 5255 pos, pos); 5256 break; 5257 } else { 5258 D2(vswp, "%s(%d): copied in %ld bytes" 5259 " using %d cookies", __func__, 5260 ldcp->ldc_id, nbytes, ncookies); 5261 } 5262 5263 /* adjust the read pointer to skip over the padding */ 5264 mp->b_rptr += VNET_IPALIGN; 5265 5266 /* point to the actual end of data */ 5267 mp->b_wptr = mp->b_rptr + datalen; 5268 5269 /* build a chain of received packets */ 5270 if (bp == NULL) { 5271 /* first pkt */ 5272 bp = mp; 5273 bp->b_next = bp->b_prev = NULL; 5274 bpt = bp; 5275 chain = 1; 5276 } else { 5277 mp->b_next = NULL; 5278 mp->b_prev = bpt; 5279 bpt->b_next = mp; 5280 bpt = mp; 5281 chain++; 5282 } 5283 5284 /* mark we are finished with this descriptor */ 5285 pub_addr->hdr.dstate = VIO_DESC_DONE; 5286 5287 (void) ldc_mem_dring_release(dp->handle, pos, pos); 5288 5289 /* 5290 * Send an ACK back to peer if requested. 5291 */ 5292 if (ack_needed) { 5293 ack_needed = B_FALSE; 5294 5295 dring_pkt->start_idx = range_start; 5296 dring_pkt->end_idx = range_end; 5297 5298 DERR(vswp, "%s(%lld): processed %d %d, ACK" 5299 " requested", __func__, ldcp->ldc_id, 5300 dring_pkt->start_idx, 5301 dring_pkt->end_idx); 5302 5303 dring_pkt->dring_process_state = VIO_DP_ACTIVE; 5304 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5305 dring_pkt->tag.vio_sid = ldcp->local_session; 5306 vsw_send_msg(ldcp, (void *)dring_pkt, 5307 sizeof (vio_dring_msg_t)); 5308 5309 prev_desc_ack = B_TRUE; 5310 range_start = pos; 5311 } 5312 5313 /* next descriptor */ 5314 pos = (pos + 1) % len; 5315 cnt++; 5316 5317 /* 5318 * Break out of loop here and stop processing to 5319 * allow some other network device (or disk) to 5320 * get access to the cpu. 5321 */ 5322 /* send the chain of packets to be switched */ 5323 if (chain > vsw_chain_len) { 5324 D3(vswp, "%s(%lld): switching chain of %d " 5325 "msgs", __func__, ldcp->ldc_id, chain); 5326 vsw_switch_frame(vswp, bp, VSW_VNETPORT, 5327 ldcp->ldc_port, NULL); 5328 bp = NULL; 5329 break; 5330 } 5331 } 5332 5333 /* send the chain of packets to be switched */ 5334 if (bp != NULL) { 5335 D3(vswp, "%s(%lld): switching chain of %d msgs", 5336 __func__, ldcp->ldc_id, chain); 5337 vsw_switch_frame(vswp, bp, VSW_VNETPORT, 5338 ldcp->ldc_port, NULL); 5339 } 5340 5341 DTRACE_PROBE1(msg_cnt, int, cnt); 5342 5343 /* 5344 * We are now finished so ACK back with the state 5345 * set to STOPPING so our peer knows we are finished 5346 */ 5347 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5348 dring_pkt->tag.vio_sid = ldcp->local_session; 5349 5350 dring_pkt->dring_process_state = VIO_DP_STOPPED; 5351 5352 DTRACE_PROBE(stop_process_sent); 5353 5354 /* 5355 * We have not processed any more descriptors beyond 5356 * the last one we ACK'd. 
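		 * (In effect, VIO_DP_STOPPED tells the peer we will not
		 * look at this ring again until it sends a fresh
		 * DRING_DATA INFO; the mirror image of that exchange,
		 * where our peer is the one that stopped, is handled in
		 * the VIO_SUBTYPE_ACK arm below.)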
5357 */ 5358 if (prev_desc_ack) 5359 range_start = range_end; 5360 5361 dring_pkt->start_idx = range_start; 5362 dring_pkt->end_idx = range_end; 5363 5364 D2(vswp, "%s(%lld) processed : %d : %d, now stopping", 5365 __func__, ldcp->ldc_id, dring_pkt->start_idx, 5366 dring_pkt->end_idx); 5367 5368 vsw_send_msg(ldcp, (void *)dring_pkt, 5369 sizeof (vio_dring_msg_t)); 5370 break; 5371 5372 case VIO_SUBTYPE_ACK: 5373 D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id); 5374 /* 5375 * Verify that the relevant descriptors are all 5376 * marked as DONE 5377 */ 5378 if ((dp = vsw_ident2dring(&ldcp->lane_out, 5379 dring_pkt->dring_ident)) == NULL) { 5380 DERR(vswp, "%s: unknown ident in ACK", __func__); 5381 return; 5382 } 5383 5384 pub_addr = (vnet_public_desc_t *)dp->pub_addr; 5385 priv_addr = (vsw_private_desc_t *)dp->priv_addr; 5386 5387 start = end = 0; 5388 start = dring_pkt->start_idx; 5389 end = dring_pkt->end_idx; 5390 len = dp->num_descriptors; 5391 5392 j = num = 0; 5393 /* calculate # descriptors taking into a/c wrap around */ 5394 num = end >= start ? end - start + 1: (len - start + 1) + end; 5395 5396 D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n", 5397 __func__, ldcp->ldc_id, start, end, num); 5398 5399 mutex_enter(&dp->dlock); 5400 dp->last_ack_recv = end; 5401 mutex_exit(&dp->dlock); 5402 5403 for (i = start; j < num; i = (i + 1) % len, j++) { 5404 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 5405 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 5406 5407 /* 5408 * If the last descriptor in a range has the ACK 5409 * bit set then we will get two messages from our 5410 * peer relating to it. The normal ACK msg and then 5411 * a subsequent STOP msg. The first message will have 5412 * resulted in the descriptor being reclaimed and 5413 * its state set to FREE so when we encounter a non 5414 * DONE descriptor we need to check to see if its 5415 * because we have just reclaimed it. 5416 */ 5417 mutex_enter(&priv_addr->dstate_lock); 5418 if (pub_addr->hdr.dstate == VIO_DESC_DONE) { 5419 /* clear all the fields */ 5420 bzero(priv_addr->datap, priv_addr->datalen); 5421 priv_addr->datalen = 0; 5422 5423 pub_addr->hdr.dstate = VIO_DESC_FREE; 5424 pub_addr->hdr.ack = 0; 5425 5426 priv_addr->dstate = VIO_DESC_FREE; 5427 mutex_exit(&priv_addr->dstate_lock); 5428 5429 D3(vswp, "clearing descp %d : pub state " 5430 "0x%llx : priv state 0x%llx", i, 5431 pub_addr->hdr.dstate, 5432 priv_addr->dstate); 5433 5434 } else { 5435 mutex_exit(&priv_addr->dstate_lock); 5436 5437 if (dring_pkt->dring_process_state != 5438 VIO_DP_STOPPED) { 5439 DERR(vswp, "%s: descriptor %lld at pos " 5440 " 0x%llx not DONE (0x%lx)\n", 5441 __func__, i, pub_addr, 5442 pub_addr->hdr.dstate); 5443 return; 5444 } 5445 } 5446 } 5447 5448 /* 5449 * If our peer is stopping processing descriptors then 5450 * we check to make sure it has processed all the descriptors 5451 * we have updated. If not then we send it a new message 5452 * to prompt it to restart. 5453 */ 5454 if (dring_pkt->dring_process_state == VIO_DP_STOPPED) { 5455 DTRACE_PROBE(stop_process_recv); 5456 D2(vswp, "%s(%lld): got stopping msg : %d : %d", 5457 __func__, ldcp->ldc_id, dring_pkt->start_idx, 5458 dring_pkt->end_idx); 5459 5460 /* 5461 * Check next descriptor in public section of ring. 5462 * If its marked as READY then we need to prompt our 5463 * peer to start processing the ring again. 
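			 * (e.g. if the peer stopped after processing
			 * descriptors 3..7 of an 8-entry ring we examine
			 * descriptor 0; if it is READY we send a fresh
			 * INFO with start_idx 0 and end_idx -1, otherwise
			 * we set restart_reqd and the prompt is sent by
			 * the next vsw_dringsend() instead.)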
5464 */ 5465 i = (end + 1) % len; 5466 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 5467 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 5468 5469 /* 5470 * Hold the restart lock across all of this to 5471 * make sure that its not possible for us to 5472 * decide that a msg needs to be sent in the future 5473 * but the sending code having already checked is 5474 * about to exit. 5475 */ 5476 mutex_enter(&dp->restart_lock); 5477 mutex_enter(&priv_addr->dstate_lock); 5478 if (pub_addr->hdr.dstate == VIO_DESC_READY) { 5479 5480 mutex_exit(&priv_addr->dstate_lock); 5481 5482 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; 5483 dring_pkt->tag.vio_sid = ldcp->local_session; 5484 5485 mutex_enter(&ldcp->lane_out.seq_lock); 5486 dring_pkt->seq_num = ldcp->lane_out.seq_num++; 5487 mutex_exit(&ldcp->lane_out.seq_lock); 5488 5489 dring_pkt->start_idx = (end + 1) % len; 5490 dring_pkt->end_idx = -1; 5491 5492 D2(vswp, "%s(%lld) : sending restart msg:" 5493 " %d : %d", __func__, ldcp->ldc_id, 5494 dring_pkt->start_idx, 5495 dring_pkt->end_idx); 5496 5497 vsw_send_msg(ldcp, (void *)dring_pkt, 5498 sizeof (vio_dring_msg_t)); 5499 } else { 5500 mutex_exit(&priv_addr->dstate_lock); 5501 dp->restart_reqd = B_TRUE; 5502 } 5503 mutex_exit(&dp->restart_lock); 5504 } 5505 break; 5506 5507 case VIO_SUBTYPE_NACK: 5508 DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK", 5509 __func__, ldcp->ldc_id); 5510 /* 5511 * Something is badly wrong if we are getting NACK's 5512 * for our data pkts. So reset the channel. 5513 */ 5514 vsw_restart_handshake(ldcp); 5515 5516 break; 5517 5518 default: 5519 DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__, 5520 ldcp->ldc_id, dring_pkt->tag.vio_subtype); 5521 } 5522 5523 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 5524 } 5525 5526 /* 5527 * VIO_PKT_DATA (a.k.a raw data mode ) 5528 * 5529 * Note - currently not supported. Do nothing. 5530 */ 5531 static void 5532 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt) 5533 { 5534 _NOTE(ARGUNUSED(dpkt)) 5535 5536 D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id); 5537 5538 DERR(NULL, "%s (%lld): currently not supported", 5539 __func__, ldcp->ldc_id); 5540 5541 D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id); 5542 } 5543 5544 #define SND_IBND_DESC_NACK(ldcp, pkt) \ 5545 pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ 5546 pkt->tag.vio_sid = ldcp->local_session; \ 5547 vsw_send_msg(ldcp, (void *)pkt, sizeof (vio_ibnd_desc_t)); 5548 5549 /* 5550 * Process an in-band descriptor message (most likely from 5551 * OBP). 5552 */ 5553 static void 5554 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt) 5555 { 5556 vio_ibnd_desc_t *ibnd_desc; 5557 dring_info_t *dp = NULL; 5558 vsw_private_desc_t *priv_addr = NULL; 5559 vsw_t *vswp = ldcp->ldc_vswp; 5560 mblk_t *mp = NULL; 5561 size_t nbytes = 0; 5562 size_t off = 0; 5563 uint64_t idx = 0; 5564 uint32_t num = 1, len, datalen = 0; 5565 uint64_t ncookies = 0; 5566 int i, rv; 5567 int j = 0; 5568 5569 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5570 5571 ibnd_desc = (vio_ibnd_desc_t *)pkt; 5572 5573 switch (ibnd_desc->hdr.tag.vio_subtype) { 5574 case VIO_SUBTYPE_INFO: 5575 D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5576 5577 if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV)) 5578 return; 5579 5580 /* 5581 * Data is padded to align on a 8 byte boundary, 5582 * nbytes is actual data length, i.e. minus that 5583 * padding. 
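		 * (For example a 60 byte frame arrives described as
		 * nbytes 60; the copy length passed to ldc_mem_copy()
		 * is rounded up to 64 (off = 4), and the pad bytes are
		 * excluded again below when b_wptr is set to
		 * b_rptr + datalen.)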
5584 */ 5585 datalen = ibnd_desc->nbytes; 5586 5587 D2(vswp, "%s(%lld): processing inband desc : " 5588 ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen); 5589 5590 ncookies = ibnd_desc->ncookies; 5591 5592 /* 5593 * allocb(9F) returns an aligned data block. We 5594 * need to ensure that we ask ldc for an aligned 5595 * number of bytes also. 5596 */ 5597 nbytes = datalen; 5598 if (nbytes & 0x7) { 5599 off = 8 - (nbytes & 0x7); 5600 nbytes += off; 5601 } 5602 5603 mp = allocb(datalen, BPRI_MED); 5604 if (mp == NULL) { 5605 DERR(vswp, "%s(%lld): allocb failed", 5606 __func__, ldcp->ldc_id); 5607 return; 5608 } 5609 5610 rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr, 5611 0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies, 5612 LDC_COPY_IN); 5613 5614 if (rv != 0) { 5615 DERR(vswp, "%s(%d): unable to copy in data from " 5616 "%d cookie(s)", __func__, 5617 ldcp->ldc_id, ncookies); 5618 freemsg(mp); 5619 return; 5620 } else { 5621 D2(vswp, "%s(%d): copied in %ld bytes using %d " 5622 "cookies", __func__, ldcp->ldc_id, nbytes, 5623 ncookies); 5624 } 5625 5626 /* point to the actual end of data */ 5627 mp->b_wptr = mp->b_rptr + datalen; 5628 5629 /* 5630 * We ACK back every in-band descriptor message we process 5631 */ 5632 ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK; 5633 ibnd_desc->hdr.tag.vio_sid = ldcp->local_session; 5634 vsw_send_msg(ldcp, (void *)ibnd_desc, 5635 sizeof (vio_ibnd_desc_t)); 5636 5637 /* send the packet to be switched */ 5638 vsw_switch_frame(vswp, mp, VSW_VNETPORT, 5639 ldcp->ldc_port, NULL); 5640 5641 break; 5642 5643 case VIO_SUBTYPE_ACK: 5644 D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 5645 5646 /* Verify the ACK is valid */ 5647 idx = ibnd_desc->hdr.desc_handle; 5648 5649 if (idx >= VSW_RING_NUM_EL) { 5650 cmn_err(CE_WARN, "%s: corrupted ACK received " 5651 "(idx %ld)", __func__, idx); 5652 return; 5653 } 5654 5655 if ((dp = ldcp->lane_out.dringp) == NULL) { 5656 DERR(vswp, "%s: no dring found", __func__); 5657 return; 5658 } 5659 5660 len = dp->num_descriptors; 5661 /* 5662 * If the descriptor we are being ACK'ed for is not the 5663 * one we expected, then pkts were lost somwhere, either 5664 * when we tried to send a msg, or a previous ACK msg from 5665 * our peer. In either case we now reclaim the descriptors 5666 * in the range from the last ACK we received up to the 5667 * current ACK. 5668 */ 5669 if (idx != dp->last_ack_recv) { 5670 DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)", 5671 __func__, dp->last_ack_recv, idx); 5672 num = idx >= dp->last_ack_recv ? 5673 idx - dp->last_ack_recv + 1: 5674 (len - dp->last_ack_recv + 1) + idx; 5675 } 5676 5677 /* 5678 * When we sent the in-band message to our peer we 5679 * marked the copy in our private ring as READY. We now 5680 * check that the descriptor we are being ACK'ed for is in 5681 * fact READY, i.e. it is one we have shared with our peer. 5682 * 5683 * If its not we flag an error, but still reset the descr 5684 * back to FREE. 
5685 */ 5686 for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) { 5687 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 5688 mutex_enter(&priv_addr->dstate_lock); 5689 if (priv_addr->dstate != VIO_DESC_READY) { 5690 DERR(vswp, "%s: (%ld) desc at index %ld not " 5691 "READY (0x%lx)", __func__, 5692 ldcp->ldc_id, idx, priv_addr->dstate); 5693 DERR(vswp, "%s: bound %d: ncookies %ld : " 5694 "datalen %ld", __func__, 5695 priv_addr->bound, priv_addr->ncookies, 5696 priv_addr->datalen); 5697 } 5698 D2(vswp, "%s: (%lld) freeing descp at %lld", __func__, 5699 ldcp->ldc_id, idx); 5700 /* release resources associated with sent msg */ 5701 bzero(priv_addr->datap, priv_addr->datalen); 5702 priv_addr->datalen = 0; 5703 priv_addr->dstate = VIO_DESC_FREE; 5704 mutex_exit(&priv_addr->dstate_lock); 5705 } 5706 /* update to next expected value */ 5707 dp->last_ack_recv = (idx + 1) % dp->num_descriptors; 5708 5709 break; 5710 5711 case VIO_SUBTYPE_NACK: 5712 DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5713 5714 /* 5715 * We should only get a NACK if our peer doesn't like 5716 * something about a message we have sent it. If this 5717 * happens we just release the resources associated with 5718 * the message. (We are relying on higher layers to decide 5719 * whether or not to resend. 5720 */ 5721 5722 /* limit check */ 5723 idx = ibnd_desc->hdr.desc_handle; 5724 5725 if (idx >= VSW_RING_NUM_EL) { 5726 DERR(vswp, "%s: corrupted NACK received (idx %lld)", 5727 __func__, idx); 5728 return; 5729 } 5730 5731 if ((dp = ldcp->lane_out.dringp) == NULL) { 5732 DERR(vswp, "%s: no dring found", __func__); 5733 return; 5734 } 5735 5736 priv_addr = (vsw_private_desc_t *)dp->priv_addr; 5737 5738 /* move to correct location in ring */ 5739 priv_addr += idx; 5740 5741 /* release resources associated with sent msg */ 5742 mutex_enter(&priv_addr->dstate_lock); 5743 bzero(priv_addr->datap, priv_addr->datalen); 5744 priv_addr->datalen = 0; 5745 priv_addr->dstate = VIO_DESC_FREE; 5746 mutex_exit(&priv_addr->dstate_lock); 5747 5748 break; 5749 5750 default: 5751 DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__, 5752 ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype); 5753 } 5754 5755 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 5756 } 5757 5758 static void 5759 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag) 5760 { 5761 _NOTE(ARGUNUSED(epkt)) 5762 5763 vsw_t *vswp = ldcp->ldc_vswp; 5764 uint16_t env = tag.vio_subtype_env; 5765 5766 D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id); 5767 5768 /* 5769 * Error vio_subtypes have yet to be defined. So for 5770 * the moment we can't do anything. 5771 */ 5772 D2(vswp, "%s: (%x) vio_subtype env", __func__, env); 5773 5774 D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id); 5775 } 5776 5777 /* 5778 * Switch the given ethernet frame when operating in layer 2 mode. 5779 * 5780 * vswp: pointer to the vsw instance 5781 * mp: pointer to chain of ethernet frame(s) to be switched 5782 * caller: identifies the source of this frame as: 5783 * 1. VSW_VNETPORT - a vsw port (connected to a vnet). 5784 * 2. VSW_PHYSDEV - the physical ethernet device 5785 * 3. VSW_LOCALDEV - vsw configured as a virtual interface 5786 * arg: argument provided by the caller. 5787 * 1. for VNETPORT - pointer to the corresponding vsw_port_t. 5788 * 2. for PHYSDEV - NULL 5789 * 3. 
for LOCALDEV - pointer to to this vsw_t(self) 5790 */ 5791 void 5792 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, 5793 vsw_port_t *arg, mac_resource_handle_t mrh) 5794 { 5795 struct ether_header *ehp; 5796 vsw_port_t *port = NULL; 5797 mblk_t *bp, *ret_m; 5798 mblk_t *nmp = NULL; 5799 vsw_port_list_t *plist = &vswp->plist; 5800 5801 D1(vswp, "%s: enter (caller %d)", __func__, caller); 5802 5803 /* 5804 * PERF: rather than breaking up the chain here, scan it 5805 * to find all mblks heading to same destination and then 5806 * pass that sub-chain to the lower transmit functions. 5807 */ 5808 5809 /* process the chain of packets */ 5810 bp = mp; 5811 while (bp) { 5812 mp = bp; 5813 bp = bp->b_next; 5814 mp->b_next = mp->b_prev = NULL; 5815 ehp = (struct ether_header *)mp->b_rptr; 5816 5817 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 5818 __func__, MBLKSIZE(mp), MBLKL(mp)); 5819 5820 READ_ENTER(&vswp->if_lockrw); 5821 if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) { 5822 /* 5823 * If destination is VSW_LOCALDEV (vsw as an eth 5824 * interface) and if the device is up & running, 5825 * send the packet up the stack on this host. 5826 * If the virtual interface is down, drop the packet. 5827 */ 5828 if (caller != VSW_LOCALDEV) { 5829 if (vswp->if_state & VSW_IF_UP) { 5830 RW_EXIT(&vswp->if_lockrw); 5831 mac_rx(vswp->if_mh, mrh, mp); 5832 } else { 5833 RW_EXIT(&vswp->if_lockrw); 5834 /* Interface down, drop pkt */ 5835 freemsg(mp); 5836 } 5837 } else { 5838 RW_EXIT(&vswp->if_lockrw); 5839 freemsg(mp); 5840 } 5841 continue; 5842 } 5843 RW_EXIT(&vswp->if_lockrw); 5844 5845 READ_ENTER(&plist->lockrw); 5846 port = vsw_lookup_fdb(vswp, ehp); 5847 if (port) { 5848 /* 5849 * Mark the port as in-use. 5850 */ 5851 mutex_enter(&port->ref_lock); 5852 port->ref_cnt++; 5853 mutex_exit(&port->ref_lock); 5854 RW_EXIT(&plist->lockrw); 5855 5856 /* 5857 * If plumbed and in promisc mode then copy msg 5858 * and send up the stack. 5859 */ 5860 READ_ENTER(&vswp->if_lockrw); 5861 if (VSW_U_P(vswp->if_state)) { 5862 RW_EXIT(&vswp->if_lockrw); 5863 nmp = copymsg(mp); 5864 if (nmp) 5865 mac_rx(vswp->if_mh, mrh, nmp); 5866 } else { 5867 RW_EXIT(&vswp->if_lockrw); 5868 } 5869 5870 /* 5871 * If the destination is in FDB, the packet 5872 * should be forwarded to the correponding 5873 * vsw_port (connected to a vnet device - 5874 * VSW_VNETPORT) 5875 */ 5876 (void) vsw_portsend(port, mp); 5877 5878 /* 5879 * Decrement use count in port and check if 5880 * should wake delete thread. 5881 */ 5882 mutex_enter(&port->ref_lock); 5883 port->ref_cnt--; 5884 if (port->ref_cnt == 0) 5885 cv_signal(&port->ref_cv); 5886 mutex_exit(&port->ref_lock); 5887 } else { 5888 RW_EXIT(&plist->lockrw); 5889 /* 5890 * Destination not in FDB. 5891 * 5892 * If the destination is broadcast or 5893 * multicast forward the packet to all 5894 * (VNETPORTs, PHYSDEV, LOCALDEV), 5895 * except the caller. 5896 */ 5897 if (IS_BROADCAST(ehp)) { 5898 D3(vswp, "%s: BROADCAST pkt", __func__); 5899 (void) vsw_forward_all(vswp, mp, 5900 caller, arg); 5901 } else if (IS_MULTICAST(ehp)) { 5902 D3(vswp, "%s: MULTICAST pkt", __func__); 5903 (void) vsw_forward_grp(vswp, mp, 5904 caller, arg); 5905 } else { 5906 /* 5907 * If the destination is unicast, and came 5908 * from either a logical network device or 5909 * the switch itself when it is plumbed, then 5910 * send it out on the physical device and also 5911 * up the stack if the logical interface is 5912 * in promiscious mode. 
5913 * 5914 * NOTE: The assumption here is that if we 5915 * cannot find the destination in our fdb, its 5916 * a unicast address, and came from either a 5917 * vnet or down the stack (when plumbed) it 5918 * must be destinded for an ethernet device 5919 * outside our ldoms. 5920 */ 5921 if (caller == VSW_VNETPORT) { 5922 READ_ENTER(&vswp->if_lockrw); 5923 if (VSW_U_P(vswp->if_state)) { 5924 RW_EXIT(&vswp->if_lockrw); 5925 nmp = copymsg(mp); 5926 if (nmp) 5927 mac_rx(vswp->if_mh, 5928 mrh, nmp); 5929 } else { 5930 RW_EXIT(&vswp->if_lockrw); 5931 } 5932 if ((ret_m = vsw_tx_msg(vswp, mp)) 5933 != NULL) { 5934 DERR(vswp, "%s: drop mblks to " 5935 "phys dev", __func__); 5936 freemsg(ret_m); 5937 } 5938 5939 } else if (caller == VSW_PHYSDEV) { 5940 /* 5941 * Pkt seen because card in promisc 5942 * mode. Send up stack if plumbed in 5943 * promisc mode, else drop it. 5944 */ 5945 READ_ENTER(&vswp->if_lockrw); 5946 if (VSW_U_P(vswp->if_state)) { 5947 RW_EXIT(&vswp->if_lockrw); 5948 mac_rx(vswp->if_mh, mrh, mp); 5949 } else { 5950 RW_EXIT(&vswp->if_lockrw); 5951 freemsg(mp); 5952 } 5953 5954 } else if (caller == VSW_LOCALDEV) { 5955 /* 5956 * Pkt came down the stack, send out 5957 * over physical device. 5958 */ 5959 if ((ret_m = vsw_tx_msg(vswp, mp)) 5960 != NULL) { 5961 DERR(vswp, "%s: drop mblks to " 5962 "phys dev", __func__); 5963 freemsg(ret_m); 5964 } 5965 } 5966 } 5967 } 5968 } 5969 D1(vswp, "%s: exit\n", __func__); 5970 } 5971 5972 /* 5973 * Switch ethernet frame when in layer 3 mode (i.e. using IP 5974 * layer to do the routing). 5975 * 5976 * There is a large amount of overlap between this function and 5977 * vsw_switch_l2_frame. At some stage we need to revisit and refactor 5978 * both these functions. 5979 */ 5980 void 5981 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, 5982 vsw_port_t *arg, mac_resource_handle_t mrh) 5983 { 5984 struct ether_header *ehp; 5985 vsw_port_t *port = NULL; 5986 mblk_t *bp = NULL; 5987 vsw_port_list_t *plist = &vswp->plist; 5988 5989 D1(vswp, "%s: enter (caller %d)", __func__, caller); 5990 5991 /* 5992 * In layer 3 mode should only ever be switching packets 5993 * between IP layer and vnet devices. So make sure thats 5994 * who is invoking us. 5995 */ 5996 if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) { 5997 DERR(vswp, "%s: unexpected caller (%d)", __func__, caller); 5998 freemsgchain(mp); 5999 return; 6000 } 6001 6002 /* process the chain of packets */ 6003 bp = mp; 6004 while (bp) { 6005 mp = bp; 6006 bp = bp->b_next; 6007 mp->b_next = mp->b_prev = NULL; 6008 ehp = (struct ether_header *)mp->b_rptr; 6009 6010 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 6011 __func__, MBLKSIZE(mp), MBLKL(mp)); 6012 6013 READ_ENTER(&plist->lockrw); 6014 port = vsw_lookup_fdb(vswp, ehp); 6015 if (port) { 6016 /* 6017 * Mark port as in-use. 6018 */ 6019 mutex_enter(&port->ref_lock); 6020 port->ref_cnt++; 6021 mutex_exit(&port->ref_lock); 6022 RW_EXIT(&plist->lockrw); 6023 6024 D2(vswp, "%s: sending to target port", __func__); 6025 (void) vsw_portsend(port, mp); 6026 6027 /* 6028 * Finished with port so decrement ref count and 6029 * check if should wake delete thread. 
6030 */ 6031 mutex_enter(&port->ref_lock); 6032 port->ref_cnt--; 6033 if (port->ref_cnt == 0) 6034 cv_signal(&port->ref_cv); 6035 mutex_exit(&port->ref_lock); 6036 } else { 6037 RW_EXIT(&plist->lockrw); 6038 /* 6039 * Destination not in FDB 6040 * 6041 * If the destination is broadcast or 6042 * multicast forward the packet to all 6043 * (VNETPORTs, PHYSDEV, LOCALDEV), 6044 * except the caller. 6045 */ 6046 if (IS_BROADCAST(ehp)) { 6047 D2(vswp, "%s: BROADCAST pkt", __func__); 6048 (void) vsw_forward_all(vswp, mp, 6049 caller, arg); 6050 } else if (IS_MULTICAST(ehp)) { 6051 D2(vswp, "%s: MULTICAST pkt", __func__); 6052 (void) vsw_forward_grp(vswp, mp, 6053 caller, arg); 6054 } else { 6055 /* 6056 * Unicast pkt from vnet that we don't have 6057 * an FDB entry for, so must be destinded for 6058 * the outside world. Attempt to send up to the 6059 * IP layer to allow it to deal with it. 6060 */ 6061 if (caller == VSW_VNETPORT) { 6062 READ_ENTER(&vswp->if_lockrw); 6063 if (vswp->if_state & VSW_IF_UP) { 6064 RW_EXIT(&vswp->if_lockrw); 6065 D2(vswp, "%s: sending up", 6066 __func__); 6067 mac_rx(vswp->if_mh, mrh, mp); 6068 } else { 6069 RW_EXIT(&vswp->if_lockrw); 6070 /* Interface down, drop pkt */ 6071 D2(vswp, "%s I/F down", 6072 __func__); 6073 freemsg(mp); 6074 } 6075 } 6076 } 6077 } 6078 } 6079 6080 D1(vswp, "%s: exit", __func__); 6081 } 6082 6083 /* 6084 * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV), 6085 * except the caller (port on which frame arrived). 6086 */ 6087 static int 6088 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 6089 { 6090 vsw_port_list_t *plist = &vswp->plist; 6091 vsw_port_t *portp; 6092 mblk_t *nmp = NULL; 6093 mblk_t *ret_m = NULL; 6094 int skip_port = 0; 6095 6096 D1(vswp, "vsw_forward_all: enter\n"); 6097 6098 /* 6099 * Broadcast message from inside ldoms so send to outside 6100 * world if in either of layer 2 modes. 6101 */ 6102 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 6103 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 6104 ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) { 6105 6106 nmp = dupmsg(mp); 6107 if (nmp) { 6108 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 6109 DERR(vswp, "%s: dropping pkt(s) " 6110 "consisting of %ld bytes of data for" 6111 " physical device", __func__, MBLKL(ret_m)); 6112 freemsg(ret_m); 6113 } 6114 } 6115 } 6116 6117 if (caller == VSW_VNETPORT) 6118 skip_port = 1; 6119 6120 /* 6121 * Broadcast message from other vnet (layer 2 or 3) or outside 6122 * world (layer 2 only), send up stack if plumbed. 6123 */ 6124 if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) { 6125 READ_ENTER(&vswp->if_lockrw); 6126 if (vswp->if_state & VSW_IF_UP) { 6127 RW_EXIT(&vswp->if_lockrw); 6128 nmp = copymsg(mp); 6129 if (nmp) 6130 mac_rx(vswp->if_mh, NULL, nmp); 6131 } else { 6132 RW_EXIT(&vswp->if_lockrw); 6133 } 6134 } 6135 6136 /* send it to all VNETPORTs */ 6137 READ_ENTER(&plist->lockrw); 6138 for (portp = plist->head; portp != NULL; portp = portp->p_next) { 6139 D2(vswp, "vsw_forward_all: port %d", portp->p_instance); 6140 /* 6141 * Caution ! - don't reorder these two checks as arg 6142 * will be NULL if the caller is PHYSDEV. skip_port is 6143 * only set if caller is VNETPORT. 
6144 */ 6145 if ((skip_port) && (portp == arg)) 6146 continue; 6147 else { 6148 nmp = dupmsg(mp); 6149 if (nmp) { 6150 (void) vsw_portsend(portp, nmp); 6151 } else { 6152 DERR(vswp, "vsw_forward_all: nmp NULL"); 6153 } 6154 } 6155 } 6156 RW_EXIT(&plist->lockrw); 6157 6158 freemsg(mp); 6159 6160 D1(vswp, "vsw_forward_all: exit\n"); 6161 return (0); 6162 } 6163 6164 /* 6165 * Forward pkts to any devices or interfaces which have registered 6166 * an interest in them (i.e. multicast groups). 6167 */ 6168 static int 6169 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 6170 { 6171 struct ether_header *ehp = (struct ether_header *)mp->b_rptr; 6172 mfdb_ent_t *entp = NULL; 6173 mfdb_ent_t *tpp = NULL; 6174 vsw_port_t *port; 6175 uint64_t key = 0; 6176 mblk_t *nmp = NULL; 6177 mblk_t *ret_m = NULL; 6178 boolean_t check_if = B_TRUE; 6179 6180 /* 6181 * Convert address to hash table key 6182 */ 6183 KEY_HASH(key, ehp->ether_dhost); 6184 6185 D1(vswp, "%s: key 0x%llx", __func__, key); 6186 6187 /* 6188 * If pkt came from either a vnet or down the stack (if we are 6189 * plumbed) and we are in layer 2 mode, then we send the pkt out 6190 * over the physical adapter, and then check to see if any other 6191 * vnets are interested in it. 6192 */ 6193 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 6194 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 6195 ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) { 6196 nmp = dupmsg(mp); 6197 if (nmp) { 6198 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 6199 DERR(vswp, "%s: dropping pkt(s) " 6200 "consisting of %ld bytes of " 6201 "data for physical device", 6202 __func__, MBLKL(ret_m)); 6203 freemsg(ret_m); 6204 } 6205 } 6206 } 6207 6208 READ_ENTER(&vswp->mfdbrw); 6209 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key, 6210 (mod_hash_val_t *)&entp) != 0) { 6211 D3(vswp, "%s: no table entry found for addr 0x%llx", 6212 __func__, key); 6213 } else { 6214 /* 6215 * Send to list of devices associated with this address... 6216 */ 6217 for (tpp = entp; tpp != NULL; tpp = tpp->nextp) { 6218 6219 /* dont send to ourselves */ 6220 if ((caller == VSW_VNETPORT) && 6221 (tpp->d_addr == (void *)arg)) { 6222 port = (vsw_port_t *)tpp->d_addr; 6223 D3(vswp, "%s: not sending to ourselves" 6224 " : port %d", __func__, 6225 port->p_instance); 6226 continue; 6227 6228 } else if ((caller == VSW_LOCALDEV) && 6229 (tpp->d_type == VSW_LOCALDEV)) { 6230 D3(vswp, "%s: not sending back up stack", 6231 __func__); 6232 continue; 6233 } 6234 6235 if (tpp->d_type == VSW_VNETPORT) { 6236 port = (vsw_port_t *)tpp->d_addr; 6237 D3(vswp, "%s: sending to port %ld for " 6238 " addr 0x%llx", __func__, 6239 port->p_instance, key); 6240 6241 nmp = dupmsg(mp); 6242 if (nmp) 6243 (void) vsw_portsend(port, nmp); 6244 } else { 6245 if (vswp->if_state & VSW_IF_UP) { 6246 nmp = copymsg(mp); 6247 if (nmp) 6248 mac_rx(vswp->if_mh, NULL, nmp); 6249 check_if = B_FALSE; 6250 D3(vswp, "%s: sending up stack" 6251 " for addr 0x%llx", __func__, 6252 key); 6253 } 6254 } 6255 } 6256 } 6257 6258 RW_EXIT(&vswp->mfdbrw); 6259 6260 /* 6261 * If the pkt came from either a vnet or from physical device, 6262 * and if we havent already sent the pkt up the stack then we 6263 * check now if we can/should (i.e. the interface is plumbed 6264 * and in promisc mode). 
6265 */ 6266 if ((check_if) && 6267 ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) { 6268 READ_ENTER(&vswp->if_lockrw); 6269 if (VSW_U_P(vswp->if_state)) { 6270 RW_EXIT(&vswp->if_lockrw); 6271 D3(vswp, "%s: (caller %d) finally sending up stack" 6272 " for addr 0x%llx", __func__, caller, key); 6273 nmp = copymsg(mp); 6274 if (nmp) 6275 mac_rx(vswp->if_mh, NULL, nmp); 6276 } else { 6277 RW_EXIT(&vswp->if_lockrw); 6278 } 6279 } 6280 6281 freemsg(mp); 6282 6283 D1(vswp, "%s: exit", __func__); 6284 6285 return (0); 6286 } 6287 6288 /* transmit the packet over the given port */ 6289 static int 6290 vsw_portsend(vsw_port_t *port, mblk_t *mp) 6291 { 6292 vsw_ldc_list_t *ldcl = &port->p_ldclist; 6293 vsw_ldc_t *ldcp; 6294 int status = 0; 6295 6296 6297 READ_ENTER(&ldcl->lockrw); 6298 /* 6299 * Note: for now, we have a single channel. 6300 */ 6301 ldcp = ldcl->head; 6302 if (ldcp == NULL) { 6303 DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n"); 6304 freemsg(mp); 6305 RW_EXIT(&ldcl->lockrw); 6306 return (1); 6307 } 6308 6309 /* 6310 * Send the message out using the appropriate 6311 * transmit function, which will free the mblk when it 6312 * is finished with it. 6313 */ 6314 mutex_enter(&port->tx_lock); 6315 if (port->transmit != NULL) 6316 status = (*port->transmit)(ldcp, mp); 6317 else { 6318 freemsg(mp); 6319 } 6320 mutex_exit(&port->tx_lock); 6321 6322 RW_EXIT(&ldcl->lockrw); 6323 6324 return (status); 6325 } 6326 6327 /* 6328 * Send packet out via descriptor ring to a logical device. 6329 */ 6330 static int 6331 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp) 6332 { 6333 vio_dring_msg_t dring_pkt; 6334 dring_info_t *dp = NULL; 6335 vsw_private_desc_t *priv_desc = NULL; 6336 vnet_public_desc_t *pub = NULL; 6337 vsw_t *vswp = ldcp->ldc_vswp; 6338 mblk_t *bp; 6339 size_t n, size; 6340 caddr_t bufp; 6341 int idx; 6342 int status = LDC_TX_SUCCESS; 6343 6344 D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id); 6345 6346 /* TODO: make test a macro */ 6347 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 6348 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 6349 DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping " 6350 "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status, 6351 ldcp->lane_out.lstate); 6352 freemsg(mp); 6353 return (LDC_TX_FAILURE); 6354 } 6355 6356 /* 6357 * Note - using first ring only, this may change 6358 * in the future. 6359 */ 6360 if ((dp = ldcp->lane_out.dringp) == NULL) { 6361 DERR(vswp, "%s(%lld): no dring for outbound lane on" 6362 " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id); 6363 freemsg(mp); 6364 return (LDC_TX_FAILURE); 6365 } 6366 6367 size = msgsize(mp); 6368 if (size > (size_t)ETHERMAX) { 6369 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 6370 ldcp->ldc_id, size); 6371 freemsg(mp); 6372 return (LDC_TX_FAILURE); 6373 } 6374 6375 /* 6376 * Find a free descriptor. 6377 * 6378 * Note: for the moment we are assuming that we will only 6379 * have one dring going from the switch to each of its 6380 * peers. This may change in the future.
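 *
 * In outline, the transmit path below is:
 *
 *	1. reserve a free private descriptor (vsw_dring_find_free_desc
 *	   marks it VIO_DESC_READY);
 *	2. bcopy the frame into that descriptor's data buffer;
 *	3. publish the byte count and mark the public descriptor
 *	   VIO_DESC_READY;
 *	4. if restart_reqd is set, send one vio_dring_msg_t to prompt
 *	   the peer to resume reading the ring.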
6381 */ 6382 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 6383 D2(vswp, "%s(%lld): no descriptor available for ring " 6384 "at 0x%llx", __func__, ldcp->ldc_id, dp); 6385 6386 /* nothing more we can do */ 6387 status = LDC_TX_NORESOURCES; 6388 goto vsw_dringsend_free_exit; 6389 } else { 6390 D2(vswp, "%s(%lld): free private descriptor found at pos " 6391 "%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx, 6392 priv_desc); 6393 } 6394 6395 /* copy data into the descriptor */ 6396 bufp = priv_desc->datap; 6397 bufp += VNET_IPALIGN; 6398 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 6399 n = MBLKL(bp); 6400 bcopy(bp->b_rptr, bufp, n); 6401 bufp += n; 6402 } 6403 6404 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 6405 6406 pub = priv_desc->descp; 6407 pub->nbytes = priv_desc->datalen; 6408 6409 mutex_enter(&priv_desc->dstate_lock); 6410 pub->hdr.dstate = VIO_DESC_READY; 6411 mutex_exit(&priv_desc->dstate_lock); 6412 6413 /* 6414 * Determine whether or not we need to send a message to our 6415 * peer prompting them to read our newly updated descriptor(s). 6416 */ 6417 mutex_enter(&dp->restart_lock); 6418 if (dp->restart_reqd) { 6419 dp->restart_reqd = B_FALSE; 6420 mutex_exit(&dp->restart_lock); 6421 6422 /* 6423 * Send a vio_dring_msg to peer to prompt them to read 6424 * the updated descriptor ring. 6425 */ 6426 dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA; 6427 dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 6428 dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA; 6429 dring_pkt.tag.vio_sid = ldcp->local_session; 6430 6431 /* Note - for now using first ring */ 6432 dring_pkt.dring_ident = dp->ident; 6433 6434 mutex_enter(&ldcp->lane_out.seq_lock); 6435 dring_pkt.seq_num = ldcp->lane_out.seq_num++; 6436 mutex_exit(&ldcp->lane_out.seq_lock); 6437 6438 /* 6439 * If last_ack_recv is -1 then we know we've not 6440 * received any ACKs yet, so this must be the first 6441 * msg sent, so set the start to the beginning of the ring. 6442 */ 6443 mutex_enter(&dp->dlock); 6444 if (dp->last_ack_recv == -1) { 6445 dring_pkt.start_idx = 0; 6446 } else { 6447 dring_pkt.start_idx = (dp->last_ack_recv + 1) % 6448 dp->num_descriptors; 6449 } 6450 dring_pkt.end_idx = -1; 6451 mutex_exit(&dp->dlock); 6452 6453 D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__, 6454 ldcp->ldc_id, dp, dring_pkt.dring_ident); 6455 D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n", 6456 __func__, ldcp->ldc_id, dring_pkt.start_idx, 6457 dring_pkt.end_idx, dring_pkt.seq_num); 6458 6459 vsw_send_msg(ldcp, (void *)&dring_pkt, 6460 sizeof (vio_dring_msg_t)); 6461 } else { 6462 mutex_exit(&dp->restart_lock); 6463 D2(vswp, "%s(%lld): updating descp %d", __func__, 6464 ldcp->ldc_id, idx); 6465 } 6466 6467 vsw_dringsend_free_exit: 6468 6469 /* free the message block */ 6470 freemsg(mp); 6471 6472 D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); 6473 return (status); 6474 } 6475 6476 /* 6477 * Send an in-band descriptor message over ldc.
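 *
 * Unlike dring mode, in-band mode carries the descriptor inside the
 * message itself: the frame is still staged in a private descriptor,
 * but that descriptor's LDC memory cookies, cookie count and byte
 * count are copied into a vio_ibnd_desc_t which is then written down
 * the channel (see the ibnd_msg fields filled in below).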
6478 */ 6479 static int 6480 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp) 6481 { 6482 vsw_t *vswp = ldcp->ldc_vswp; 6483 vio_ibnd_desc_t ibnd_msg; 6484 vsw_private_desc_t *priv_desc = NULL; 6485 dring_info_t *dp = NULL; 6486 size_t n, size = 0; 6487 caddr_t bufp; 6488 mblk_t *bp; 6489 int idx, i; 6490 int status = LDC_TX_SUCCESS; 6491 static int warn_msg = 1; 6492 6493 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 6494 6495 ASSERT(mp != NULL); 6496 6497 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 6498 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 6499 DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt", 6500 __func__, ldcp->ldc_id, ldcp->ldc_status, 6501 ldcp->lane_out.lstate); 6502 freemsg(mp); 6503 return (LDC_TX_FAILURE); 6504 } 6505 6506 /* 6507 * Only expect a single dring to exist, which we use 6508 * as an internal buffer, rather than a transfer channel. 6509 */ 6510 if ((dp = ldcp->lane_out.dringp) == NULL) { 6511 DERR(vswp, "%s(%lld): no dring for outbound lane", 6512 __func__, ldcp->ldc_id); 6513 DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", 6514 __func__, ldcp->ldc_id, ldcp->ldc_status, 6515 ldcp->lane_out.lstate); 6516 freemsg(mp); 6517 return (LDC_TX_FAILURE); 6518 } 6519 6520 size = msgsize(mp); 6521 if (size > (size_t)ETHERMAX) { 6522 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 6523 ldcp->ldc_id, size); 6524 freemsg(mp); 6525 return (LDC_TX_FAILURE); 6526 } 6527 6528 /* 6529 * Find a free descriptor in our buffer ring. 6530 */ 6531 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 6532 if (warn_msg) { 6533 DERR(vswp, "%s(%lld): no descriptor available for ring " 6534 "at 0x%llx", __func__, ldcp->ldc_id, dp); 6535 warn_msg = 0; 6536 } 6537 6538 /* nothing more we can do */ 6539 status = LDC_TX_NORESOURCES; 6540 goto vsw_descrsend_free_exit; 6541 } else { 6542 D2(vswp, "%s(%lld): free private descriptor found at pos " 6543 "%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx, 6544 priv_desc); 6545 warn_msg = 1; 6546 } 6547 6548 /* copy data into the descriptor */ 6549 bufp = priv_desc->datap; 6550 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 6551 n = MBLKL(bp); 6552 bcopy(bp->b_rptr, bufp, n); 6553 bufp += n; 6554 } 6555 6556 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 6557 6558 /* create and send the in-band descp msg */ 6559 ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA; 6560 ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO; 6561 ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA; 6562 ibnd_msg.hdr.tag.vio_sid = ldcp->local_session; 6563 6564 mutex_enter(&ldcp->lane_out.seq_lock); 6565 ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++; 6566 mutex_exit(&ldcp->lane_out.seq_lock); 6567 6568 /* 6569 * Copy the mem cookies describing the data from the 6570 * private region of the descriptor ring into the inband 6571 * descriptor.
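 *
 * Each ldc_mem_cookie_t is essentially an (addr, size) pair
 * describing one exported region of the buffer; a buffer spanning
 * several regions is described by several cookies, copied here
 * in order.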
6572 */ 6573 for (i = 0; i < priv_desc->ncookies; i++) { 6574 bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i], 6575 sizeof (ldc_mem_cookie_t)); 6576 } 6577 6578 ibnd_msg.hdr.desc_handle = idx; 6579 ibnd_msg.ncookies = priv_desc->ncookies; 6580 ibnd_msg.nbytes = size; 6581 6582 vsw_send_msg(ldcp, (void *)&ibnd_msg, sizeof (vio_ibnd_desc_t)); 6583 6584 vsw_descrsend_free_exit: 6585 6586 /* free the allocated message blocks */ 6587 freemsg(mp); 6588 6589 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 6590 return (status); 6591 } 6592 6593 static void 6594 vsw_send_ver(void *arg) 6595 { 6596 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; 6597 vsw_t *vswp = ldcp->ldc_vswp; 6598 lane_t *lp = &ldcp->lane_out; 6599 vio_ver_msg_t ver_msg; 6600 6601 D1(vswp, "%s enter", __func__); 6602 6603 ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 6604 ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 6605 ver_msg.tag.vio_subtype_env = VIO_VER_INFO; 6606 ver_msg.tag.vio_sid = ldcp->local_session; 6607 6608 ver_msg.ver_major = vsw_versions[0].ver_major; 6609 ver_msg.ver_minor = vsw_versions[0].ver_minor; 6610 ver_msg.dev_class = VDEV_NETWORK_SWITCH; 6611 6612 lp->lstate |= VSW_VER_INFO_SENT; 6613 lp->ver_major = ver_msg.ver_major; 6614 lp->ver_minor = ver_msg.ver_minor; 6615 6616 DUMP_TAG(ver_msg.tag); 6617 6618 vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t)); 6619 6620 D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id); 6621 } 6622 6623 static void 6624 vsw_send_attr(vsw_ldc_t *ldcp) 6625 { 6626 vsw_t *vswp = ldcp->ldc_vswp; 6627 lane_t *lp = &ldcp->lane_out; 6628 vnet_attr_msg_t attr_msg; 6629 6630 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); 6631 6632 /* 6633 * Subtype is set to INFO by default. 6634 */ 6635 attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 6636 attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 6637 attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO; 6638 attr_msg.tag.vio_sid = ldcp->local_session; 6639 6640 /* payload copied from default settings for lane */ 6641 attr_msg.mtu = lp->mtu; 6642 attr_msg.addr_type = lp->addr_type; 6643 attr_msg.xfer_mode = lp->xfer_mode; 6644 attr_msg.ack_freq = lp->ack_freq; 6645 6646 READ_ENTER(&vswp->if_lockrw); 6647 bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL); 6648 RW_EXIT(&vswp->if_lockrw); 6649 6650 ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT; 6651 6652 DUMP_TAG(attr_msg.tag); 6653 6654 vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t)); 6655 6656 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); 6657 } 6658 6659 /* 6660 * Create dring info msg (which also results in the creation of 6661 * a dring). 6662 */ 6663 static vio_dring_reg_msg_t * 6664 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp) 6665 { 6666 vio_dring_reg_msg_t *mp; 6667 dring_info_t *dp; 6668 vsw_t *vswp = ldcp->ldc_vswp; 6669 6670 D1(vswp, "vsw_create_dring_info_pkt enter\n"); 6671 6672 /* 6673 * If we can't create a dring, obviously no point sending 6674 * a message.
6675 */ 6676 if ((dp = vsw_create_dring(ldcp)) == NULL) 6677 return (NULL); 6678 6679 mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP); 6680 6681 mp->tag.vio_msgtype = VIO_TYPE_CTRL; 6682 mp->tag.vio_subtype = VIO_SUBTYPE_INFO; 6683 mp->tag.vio_subtype_env = VIO_DRING_REG; 6684 mp->tag.vio_sid = ldcp->local_session; 6685 6686 /* payload */ 6687 mp->num_descriptors = dp->num_descriptors; 6688 mp->descriptor_size = dp->descriptor_size; 6689 mp->options = dp->options; 6690 mp->ncookies = dp->ncookies; 6691 bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t)); 6692 6693 mp->dring_ident = 0; 6694 6695 D1(vswp, "vsw_create_dring_info_pkt exit\n"); 6696 6697 return (mp); 6698 } 6699 6700 static void 6701 vsw_send_dring_info(vsw_ldc_t *ldcp) 6702 { 6703 vio_dring_reg_msg_t *dring_msg; 6704 vsw_t *vswp = ldcp->ldc_vswp; 6705 6706 D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id); 6707 6708 dring_msg = vsw_create_dring_info_pkt(ldcp); 6709 if (dring_msg == NULL) { 6710 cmn_err(CE_WARN, "vsw_send_dring_info: error creating msg"); 6711 return; 6712 } 6713 6714 ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT; 6715 6716 DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg); 6717 6718 vsw_send_msg(ldcp, dring_msg, 6719 sizeof (vio_dring_reg_msg_t)); 6720 6721 kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t)); 6722 6723 D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id); 6724 } 6725 6726 static void 6727 vsw_send_rdx(vsw_ldc_t *ldcp) 6728 { 6729 vsw_t *vswp = ldcp->ldc_vswp; 6730 vio_rdx_msg_t rdx_msg; 6731 6732 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); 6733 6734 rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 6735 rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 6736 rdx_msg.tag.vio_subtype_env = VIO_RDX; 6737 rdx_msg.tag.vio_sid = ldcp->local_session; 6738 6739 ldcp->lane_out.lstate |= VSW_RDX_INFO_SENT; 6740 6741 DUMP_TAG(rdx_msg.tag); 6742 6743 vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t)); 6744 6745 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); 6746 } 6747 6748 /* 6749 * Generic routine to send a message out over an ldc channel. 6750 */ 6751 static void 6752 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size) 6753 { 6754 int rv, retries = vsw_wretries; /* local copy so the global tunable is not permanently decremented */ 6755 size_t msglen = size; 6756 vio_msg_tag_t *tag = (vio_msg_tag_t *)msgp; 6757 vsw_t *vswp = ldcp->ldc_vswp; 6758 6759 D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes", 6760 ldcp->ldc_id, size); 6761 6762 D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype); 6763 D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype); 6764 D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env); 6765 6766 mutex_enter(&ldcp->ldc_txlock); 6767 do { 6768 msglen = size; 6769 rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen); 6770 } while (rv == EWOULDBLOCK && --retries > 0); 6771 6772 if ((rv != 0) || (msglen != size)) { 6773 DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) " 6774 "rv(%d) size (%d) msglen(%ld)\n", ldcp->ldc_id, 6775 rv, size, msglen); 6776 } 6777 mutex_exit(&ldcp->ldc_txlock); 6778 6779 /* channel has been reset */ 6780 if (rv == ECONNRESET) { 6781 vsw_handle_reset(ldcp); 6782 } 6783 6784 D1(vswp, "vsw_send_msg (%lld) exit : sent %ld bytes", 6785 ldcp->ldc_id, msglen); 6786 } 6787 6788 /* 6789 * Add an entry into FDB, for the given mac address and port_id. 6790 * Returns 0 on success, 1 on failure. 6791 * 6792 * Lock protecting FDB must be held by calling process.
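 *
 * KEY_HASH() is assumed to fold the six MAC octets into the low
 * 48 bits of a uint64_t key; an illustrative (not authoritative)
 * equivalent would be:
 *
 *	#define	KEY_HASH(key, addr)				\
 *	{							\
 *		int _i;						\
 *		(key) = 0;					\
 *		for (_i = 0; _i < ETHERADDRL; _i++)		\
 *			(key) = ((key) << 8) |			\
 *			    ((uint8_t *)&(addr))[_i];		\
 *	}
 *
 * so that, e.g., 0:14:4f:f8:0:1 yields key 0x144ff80001.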
6793 */ 6794 static int 6795 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port) 6796 { 6797 uint64_t addr = 0; 6798 6799 D1(vswp, "%s: enter", __func__); 6800 6801 KEY_HASH(addr, port->p_macaddr); 6802 6803 D2(vswp, "%s: key = 0x%llx", __func__, addr); 6804 6805 /* 6806 * Note: duplicate keys will be rejected by mod_hash. 6807 */ 6808 if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr, 6809 (mod_hash_val_t)port) != 0) { 6810 DERR(vswp, "%s: unable to add entry into fdb.", __func__); 6811 return (1); 6812 } 6813 6814 D1(vswp, "%s: exit", __func__); 6815 return (0); 6816 } 6817 6818 /* 6819 * Remove an entry from FDB. 6820 * Returns 0 on success, 1 on failure. 6821 */ 6822 static int 6823 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port) 6824 { 6825 uint64_t addr = 0; 6826 6827 D1(vswp, "%s: enter", __func__); 6828 6829 KEY_HASH(addr, port->p_macaddr); 6830 6831 D2(vswp, "%s: key = 0x%llx", __func__, addr); 6832 6833 (void) mod_hash_destroy(vswp->fdb, (mod_hash_key_t)addr); 6834 6835 D1(vswp, "%s: exit", __func__); 6836 6837 return (0); 6838 } 6839 6840 /* 6841 * Search fdb for a given mac address. 6842 * Returns pointer to the entry if found, else returns NULL. 6843 */ 6844 static vsw_port_t * 6845 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp) 6846 { 6847 uint64_t key = 0; 6848 vsw_port_t *port = NULL; 6849 6850 D1(vswp, "%s: enter", __func__); 6851 6852 KEY_HASH(key, ehp->ether_dhost); 6853 6854 D2(vswp, "%s: key = 0x%llx", __func__, key); 6855 6856 if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key, 6857 (mod_hash_val_t *)&port) != 0) { 6858 return (NULL); 6859 } 6860 6861 D1(vswp, "%s: exit", __func__); 6862 6863 return (port); 6864 } 6865 6866 /* 6867 * Add or remove multicast address(es). 6868 * 6869 * Returns 0 on success, 1 on failure. 6870 */ 6871 static int 6872 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port) 6873 { 6874 mcst_addr_t *mcst_p = NULL; 6875 vsw_t *vswp = port->p_vswp; 6876 uint64_t addr = 0x0; 6877 int i, ret; 6878 6879 D1(vswp, "%s: enter", __func__); 6880 6881 D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count); 6882 6883 if (vswp->mh == NULL) 6884 return (1); 6885 6886 for (i = 0; i < mcst_pkt->count; i++) { 6887 /* 6888 * Convert address into form that can be used 6889 * as hash table key. 6890 */ 6891 KEY_HASH(addr, mcst_pkt->mca[i]); 6892 6893 /* 6894 * Add or delete the specified address/port combination. 6895 */ 6896 if (mcst_pkt->set == 0x1) { 6897 D3(vswp, "%s: adding multicast address 0x%llx for " 6898 "port %ld", __func__, addr, port->p_instance); 6899 if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 6900 /* 6901 * Update the list of multicast 6902 * addresses contained within the 6903 * port structure to include this new 6904 * one. 6905 */ 6906 mcst_p = kmem_alloc(sizeof (mcst_addr_t), 6907 KM_NOSLEEP); 6908 if (mcst_p == NULL) { 6909 DERR(vswp, "%s: unable to alloc mem", 6910 __func__); 6911 /* back out the mFDB entry added above */ (void) vsw_del_mcst(vswp, VSW_VNETPORT, addr, port); return (1); 6912 } 6913 6914 mcst_p->nextp = NULL; 6915 mcst_p->addr = addr; 6916 6917 mutex_enter(&port->mca_lock); 6918 mcst_p->nextp = port->mcap; 6919 port->mcap = mcst_p; 6920 mutex_exit(&port->mca_lock); 6921 6922 /* 6923 * Program the address into HW.
If the addr 6924 * has already been programmed then the MAC 6925 * just increments a ref counter (which is 6926 * used when the address is being deleted) 6927 */ 6928 ret = mac_multicst_add(vswp->mh, 6929 (uchar_t *)&mcst_pkt->mca[i]); 6930 if (ret) { 6931 cmn_err(CE_WARN, "!unable to add " 6932 "multicast address"); 6933 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 6934 addr, port); 6935 vsw_del_addr(VSW_VNETPORT, port, addr); 6936 return (ret); 6937 } 6938 6939 } else { 6940 DERR(vswp, "%s: error adding multicast " 6941 "address 0x%llx for port %ld", 6942 __func__, addr, port->p_instance); 6943 return (1); 6944 } 6945 } else { 6946 /* 6947 * Delete an entry from the multicast hash 6948 * table and update the address list 6949 * appropriately. 6950 */ 6951 if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 6952 D3(vswp, "%s: deleting multicast address " 6953 "0x%llx for port %ld", __func__, addr, 6954 port->p_instance); 6955 6956 vsw_del_addr(VSW_VNETPORT, port, addr); 6957 6958 /* 6959 * Remove the address from HW. The address 6960 * will actually only be removed once the ref 6961 * count within the MAC layer has dropped to 6962 * zero. I.e. we can safely call this fn even 6963 * if other ports are interested in this 6964 * address. 6965 */ 6966 (void) mac_multicst_remove(vswp->mh, 6967 (uchar_t *)&mcst_pkt->mca[i]); 6968 6969 } else { 6970 DERR(vswp, "%s: error deleting multicast " 6971 "addr 0x%llx for port %ld", 6972 __func__, addr, port->p_instance); 6973 return (1); 6974 } 6975 } 6976 } 6977 D1(vswp, "%s: exit", __func__); 6978 return (0); 6979 } 6980 6981 /* 6982 * Add a new multicast entry. 6983 * 6984 * Search hash table based on address. If match found then 6985 * update associated val (which is chain of ports), otherwise 6986 * create new key/val (addr/port) pair and insert into table. 6987 */ 6988 static int 6989 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 6990 { 6991 int dup = 0; 6992 int rv = 0; 6993 mfdb_ent_t *ment = NULL; 6994 mfdb_ent_t *tmp_ent = NULL; 6995 mfdb_ent_t *new_ent = NULL; 6996 void *tgt = NULL; 6997 6998 if (devtype == VSW_VNETPORT) { 6999 /* 7000 * Being invoked from a vnet. 7001 */ 7002 ASSERT(arg != NULL); 7003 tgt = arg; 7004 D2(NULL, "%s: port %d : address 0x%llx", __func__, 7005 ((vsw_port_t *)arg)->p_instance, addr); 7006 } else { 7007 /* 7008 * We are being invoked via the m_multicst mac entry 7009 * point. 7010 */ 7011 D2(NULL, "%s: address 0x%llx", __func__, addr); 7012 tgt = (void *)vswp; 7013 } 7014 7015 WRITE_ENTER(&vswp->mfdbrw); 7016 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 7017 (mod_hash_val_t *)&ment) != 0) { 7018 7019 /* address not currently in table */ 7020 ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 7021 ment->d_addr = (void *)tgt; 7022 ment->d_type = devtype; 7023 ment->nextp = NULL; 7024 7025 if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr, 7026 (mod_hash_val_t)ment) != 0) { 7027 DERR(vswp, "%s: hash table insertion failed", __func__); 7028 kmem_free(ment, sizeof (mfdb_ent_t)); 7029 rv = 1; 7030 } else { 7031 D2(vswp, "%s: added initial entry for 0x%llx to " 7032 "table", __func__, addr); 7033 } 7034 } else { 7035 /* 7036 * Address in table. Check to see if specified port 7037 * is already associated with the address. If not add 7038 * it now. 
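 *
 * The resulting structure is a hash of singly linked lists:
 *
 *	mfdb[key] -> mfdb_ent_t -> mfdb_ent_t -> NULL
 *
 * where each element records one interested device via its
 * d_addr (port or vsw instance) and d_type (VSW_VNETPORT or
 * VSW_LOCALDEV) fields.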
7039 */ 7040 tmp_ent = ment; 7041 while (tmp_ent != NULL) { 7042 if (tmp_ent->d_addr == (void *)tgt) { 7043 if (devtype == VSW_VNETPORT) { 7044 DERR(vswp, "%s: duplicate port entry " 7045 "found for portid %ld and key " 7046 "0x%llx", __func__, 7047 ((vsw_port_t *)arg)->p_instance, 7048 addr); 7049 } else { 7050 DERR(vswp, "%s: duplicate entry found" 7051 " for key 0x%llx", 7052 __func__, addr); 7053 } 7054 rv = 1; 7055 dup = 1; 7056 break; 7057 } 7058 tmp_ent = tmp_ent->nextp; 7059 } 7060 7061 /* 7062 * Port not on list so add it to end now. 7063 */ 7064 if (dup == 0) { 7065 D2(vswp, "%s: added entry for 0x%llx to table", 7066 __func__, addr); 7067 new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 7068 new_ent->d_addr = (void *)tgt; 7069 new_ent->d_type = devtype; 7070 new_ent->nextp = NULL; 7071 7072 tmp_ent = ment; 7073 while (tmp_ent->nextp != NULL) 7074 tmp_ent = tmp_ent->nextp; 7075 7076 tmp_ent->nextp = new_ent; 7077 } 7078 } 7079 7080 RW_EXIT(&vswp->mfdbrw); 7081 return (rv); 7082 } 7083 7084 /* 7085 * Remove a multicast entry from the hashtable. 7086 * 7087 * Search hash table based on address. If match found, scan 7088 * list of ports associated with address. If specified port 7089 * found remove it from list. 7090 */ 7091 static int 7092 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 7093 { 7094 mfdb_ent_t *ment = NULL; 7095 mfdb_ent_t *curr_p, *prev_p; 7096 void *tgt = NULL; 7097 7098 D1(vswp, "%s: enter", __func__); 7099 7100 if (devtype == VSW_VNETPORT) { 7101 tgt = (vsw_port_t *)arg; 7102 D2(vswp, "%s: removing port %d from mFDB for address" 7103 " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, 7104 addr); 7105 } else { 7106 D2(vswp, "%s: removing entry", __func__); 7107 tgt = (void *)vswp; 7108 } 7109 7110 WRITE_ENTER(&vswp->mfdbrw); 7111 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 7112 (mod_hash_val_t *)&ment) != 0) { 7113 D2(vswp, "%s: address 0x%llx not in table", __func__, addr); 7114 RW_EXIT(&vswp->mfdbrw); 7115 return (1); 7116 } 7117 7118 prev_p = curr_p = ment; 7119 7120 while (curr_p != NULL) { 7121 if (curr_p->d_addr == (void *)tgt) { 7122 if (devtype == VSW_VNETPORT) { 7123 D2(vswp, "%s: port %d found", __func__, 7124 ((vsw_port_t *)tgt)->p_instance); 7125 } else { 7126 D2(vswp, "%s: instance found", __func__); 7127 } 7128 7129 if (prev_p == curr_p) { 7130 /* 7131 * Head of list; if no other element is in 7132 * the list then destroy this entry, otherwise 7133 * just replace it with the updated value. 7134 */ 7135 ment = curr_p->nextp; 7136 kmem_free(curr_p, sizeof (mfdb_ent_t)); 7137 if (ment == NULL) { 7138 (void) mod_hash_destroy(vswp->mfdb, 7139 (mod_hash_key_t)addr); 7140 } else { 7141 (void) mod_hash_replace(vswp->mfdb, 7142 (mod_hash_key_t)addr, 7143 (mod_hash_val_t)ment); 7144 } 7145 } else { 7146 /* 7147 * Not head of list, no need to do 7148 * replacement, just adjust list pointers. 7149 */ 7150 prev_p->nextp = curr_p->nextp; 7151 kmem_free(curr_p, sizeof (mfdb_ent_t)); 7152 } 7153 break; 7154 } 7155 7156 prev_p = curr_p; 7157 curr_p = curr_p->nextp; 7158 } 7159 7160 RW_EXIT(&vswp->mfdbrw); 7161 7162 D1(vswp, "%s: exit", __func__); 7163 7164 return (0); 7165 } 7166 7167 /* 7168 * Port is being deleted, but has registered an interest in one 7169 * or more multicast groups. Using the list of addresses maintained 7170 * within the port structure find the appropriate entry in the hash 7171 * table and remove this port from the list of interested ports.
7172 */ 7173 static void 7174 vsw_del_mcst_port(vsw_port_t *port) 7175 { 7176 mcst_addr_t *mcst_p = NULL; 7177 vsw_t *vswp = port->p_vswp; 7178 7179 D1(vswp, "%s: enter", __func__); 7180 7181 mutex_enter(&port->mca_lock); 7182 while (port->mcap != NULL) { 7183 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 7184 port->mcap->addr, port); 7185 7186 mcst_p = port->mcap->nextp; 7187 kmem_free(port->mcap, sizeof (mcst_addr_t)); 7188 port->mcap = mcst_p; 7189 } 7190 mutex_exit(&port->mca_lock); 7191 7192 D1(vswp, "%s: exit", __func__); 7193 } 7194 7195 /* 7196 * This vsw instance is detaching, but has registered an interest in one 7197 * or more multicast groups. Using the list of addresses maintained 7198 * within the vsw structure find the appropriate entry in the hash 7199 * table and remove this instance from the list of interested ports. 7200 */ 7201 static void 7202 vsw_del_mcst_vsw(vsw_t *vswp) 7203 { 7204 mcst_addr_t *next_p = NULL; 7205 7206 D1(vswp, "%s: enter", __func__); 7207 7208 mutex_enter(&vswp->mca_lock); 7209 7210 while (vswp->mcap != NULL) { 7211 DERR(vswp, "%s: deleting addr 0x%llx", 7212 __func__, vswp->mcap->addr); 7213 (void) vsw_del_mcst(vswp, VSW_LOCALDEV, 7214 vswp->mcap->addr, NULL); 7215 7216 next_p = vswp->mcap->nextp; 7217 kmem_free(vswp->mcap, sizeof (mcst_addr_t)); 7218 vswp->mcap = next_p; 7219 } 7220 7221 vswp->mcap = NULL; 7222 mutex_exit(&vswp->mca_lock); 7223 7224 D1(vswp, "%s: exit", __func__); 7225 } 7226 7227 7228 /* 7229 * Remove the specified address from the list of addresses maintained 7230 * in the given port or vsw instance node. 7231 */ 7232 static void 7233 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr) 7234 { 7235 vsw_t *vswp = NULL; 7236 vsw_port_t *port = NULL; 7237 mcst_addr_t *prev_p = NULL; 7238 mcst_addr_t *curr_p = NULL; 7239 7240 D1(NULL, "%s: enter : devtype %d : addr 0x%llx", 7241 __func__, devtype, addr); 7242 7243 if (devtype == VSW_VNETPORT) { 7244 port = (vsw_port_t *)arg; 7245 mutex_enter(&port->mca_lock); 7246 prev_p = curr_p = port->mcap; 7247 } else { 7248 vswp = (vsw_t *)arg; 7249 mutex_enter(&vswp->mca_lock); 7250 prev_p = curr_p = vswp->mcap; 7251 } 7252 7253 while (curr_p != NULL) { 7254 if (curr_p->addr == addr) { 7255 D2(NULL, "%s: address found", __func__); 7256 /* match found */ 7257 if (prev_p == curr_p) { 7258 /* list head */ 7259 if (devtype == VSW_VNETPORT) 7260 port->mcap = curr_p->nextp; 7261 else 7262 vswp->mcap = curr_p->nextp; 7263 } else { 7264 prev_p->nextp = curr_p->nextp; 7265 } 7266 kmem_free(curr_p, sizeof (mcst_addr_t)); 7267 break; 7268 } else { 7269 prev_p = curr_p; 7270 curr_p = curr_p->nextp; 7271 } 7272 } 7273 7274 if (devtype == VSW_VNETPORT) 7275 mutex_exit(&port->mca_lock); 7276 else 7277 mutex_exit(&vswp->mca_lock); 7278 7279 D1(NULL, "%s: exit", __func__); 7280 } 7281 7282 /* 7283 * Creates a descriptor ring (dring) and links it onto the 7284 * list of outbound drings for this channel. 7285 * 7286 * Returns NULL if creation failed.
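 *
 * In outline, creation proceeds as follows:
 *
 *	1. ldc_mem_dring_create()  - allocate the exportable (public)
 *	   ring of VSW_RING_NUM_EL descriptors of VSW_PUB_SIZE bytes;
 *	2. ldc_mem_dring_info()    - obtain the ring's base address;
 *	3. kmem_zalloc() the private shadow descriptors and call
 *	   vsw_setup_ring() to allocate data buffers and link the
 *	   private and public sides together;
 *	4. ldc_mem_dring_bind()    - export the ring over the channel,
 *	   yielding the cookie later quoted in the DRING_REG message.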
7287 */ 7288 static dring_info_t * 7289 vsw_create_dring(vsw_ldc_t *ldcp) 7290 { 7291 vsw_private_desc_t *priv_addr = NULL; 7292 vsw_t *vswp = ldcp->ldc_vswp; 7293 ldc_mem_info_t minfo; 7294 dring_info_t *dp, *tp; 7295 int i; 7296 7297 dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 7298 7299 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 7300 7301 /* create public section of ring */ 7302 if ((ldc_mem_dring_create(VSW_RING_NUM_EL, 7303 VSW_PUB_SIZE, &dp->handle)) != 0) { 7304 7305 DERR(vswp, "vsw_create_dring(%lld): ldc dring create " 7306 "failed", ldcp->ldc_id); 7307 goto create_fail_exit; 7308 } 7309 7310 ASSERT(dp->handle != NULL); 7311 7312 /* 7313 * Get the base address of the public section of the ring. 7314 */ 7315 if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) { 7316 DERR(vswp, "vsw_create_dring(%lld): dring info failed\n", 7317 ldcp->ldc_id); 7318 goto dring_fail_exit; 7319 } else { 7320 ASSERT(minfo.vaddr != 0); 7321 dp->pub_addr = minfo.vaddr; 7322 } 7323 7324 dp->num_descriptors = VSW_RING_NUM_EL; 7325 dp->descriptor_size = VSW_PUB_SIZE; 7326 dp->options = VIO_TX_DRING; 7327 dp->ncookies = 1; /* guaranteed by ldc */ 7328 7329 /* 7330 * create private portion of ring 7331 */ 7332 dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc( 7333 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP); 7334 7335 if (vsw_setup_ring(ldcp, dp)) { 7336 DERR(vswp, "%s: unable to setup ring", __func__); 7337 goto dring_fail_exit; 7338 } 7339 7340 /* haven't used any descriptors yet */ 7341 dp->end_idx = 0; 7342 dp->last_ack_recv = -1; 7343 7344 /* bind dring to the channel */ 7345 if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle, 7346 LDC_SHADOW_MAP, LDC_MEM_RW, 7347 &dp->cookie[0], &dp->ncookies)) != 0) { 7348 DERR(vswp, "vsw_create_dring: unable to bind to channel " 7349 "%lld", ldcp->ldc_id); 7350 goto dring_fail_exit; 7351 } 7352 7353 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 7354 dp->restart_reqd = B_TRUE; 7355 7356 /* 7357 * Only ever create rings for outgoing lane. Link it onto 7358 * end of list. 7359 */ 7360 if (ldcp->lane_out.dringp == NULL) { 7361 D2(vswp, "vsw_create_dring: adding first outbound ring"); 7362 ldcp->lane_out.dringp = dp; 7363 } else { 7364 tp = ldcp->lane_out.dringp; 7365 while (tp->next != NULL) 7366 tp = tp->next; 7367 7368 tp->next = dp; 7369 } 7370 7371 return (dp); 7372 7373 dring_fail_exit: 7374 (void) ldc_mem_dring_destroy(dp->handle); 7375 7376 create_fail_exit: 7377 if (dp->priv_addr != NULL) { 7378 priv_addr = dp->priv_addr; 7379 for (i = 0; i < VSW_RING_NUM_EL; i++) { 7380 if (priv_addr->memhandle != NULL) 7381 (void) ldc_mem_free_handle( 7382 priv_addr->memhandle); 7383 priv_addr++; 7384 } 7385 kmem_free(dp->priv_addr, 7386 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 7387 } 7388 mutex_destroy(&dp->dlock); 7389 7390 kmem_free(dp, sizeof (dring_info_t)); 7391 return (NULL); 7392 } 7393 7394 /* 7395 * Create a ring consisting of just a private portion and link 7396 * it into the list of rings for the outbound lane. 7397 * 7398 * These type of rings are used primarily for temporary data 7399 * storage (i.e. as data buffers). 
7400 */ 7401 void 7402 vsw_create_privring(vsw_ldc_t *ldcp) 7403 { 7404 dring_info_t *dp, *tp; 7405 vsw_t *vswp = ldcp->ldc_vswp; 7406 7407 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 7408 7409 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 7410 7411 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 7412 7413 /* no public section */ 7414 dp->pub_addr = NULL; 7415 7416 dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) * 7417 VSW_RING_NUM_EL), KM_SLEEP); 7418 7419 dp->num_descriptors = VSW_RING_NUM_EL; 7420 7421 if (vsw_setup_ring(ldcp, dp)) { 7422 DERR(vswp, "%s: setup of ring failed", __func__); 7423 kmem_free(dp->priv_addr, 7424 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 7425 mutex_destroy(&dp->dlock); 7426 kmem_free(dp, sizeof (dring_info_t)); 7427 return; 7428 } 7429 7430 /* haven't used any descriptors yet */ 7431 dp->end_idx = 0; 7432 7433 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 7434 dp->restart_reqd = B_TRUE; 7435 7436 /* 7437 * Only ever create rings for outgoing lane. Link it onto 7438 * end of list. 7439 */ 7440 if (ldcp->lane_out.dringp == NULL) { 7441 D2(vswp, "%s: adding first outbound privring", __func__); 7442 ldcp->lane_out.dringp = dp; 7443 } else { 7444 tp = ldcp->lane_out.dringp; 7445 while (tp->next != NULL) 7446 tp = tp->next; 7447 7448 tp->next = dp; 7449 } 7450 7451 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 7452 } 7453 7454 /* 7455 * Setup the descriptors in the dring. Returns 0 on success, 1 on 7456 * failure. 7457 */ 7458 int 7459 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp) 7460 { 7461 vnet_public_desc_t *pub_addr = NULL; 7462 vsw_private_desc_t *priv_addr = NULL; 7463 vsw_t *vswp = ldcp->ldc_vswp; 7464 uint64_t *tmpp; 7465 uint64_t offset = 0; 7466 uint32_t ncookies = 0; 7467 static char *name = "vsw_setup_ring"; 7468 int i, j, nc, rv; 7469 7470 priv_addr = dp->priv_addr; 7471 pub_addr = dp->pub_addr; 7472 7473 /* public section may be null but private should never be */ 7474 ASSERT(priv_addr != NULL); 7475 7476 /* 7477 * Allocate the region of memory which will be used to hold 7478 * the data the descriptors will refer to. 7479 */ 7480 dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ); 7481 dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP); 7482 7483 D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name, 7484 dp->data_sz, dp->data_addr); 7485 7486 tmpp = (uint64_t *)dp->data_addr; 7487 offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp); 7488 7489 /* 7490 * Initialise some of the private and public (if they exist) 7491 * descriptor fields. 
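 *
 * Note that the data region allocated above is carved into
 * VSW_RING_NUM_EL equal slots of VSW_RING_EL_DATA_SZ bytes each;
 * 'offset' is that slot size expressed in words, since tmpp, the
 * cursor walking the region, is a uint64_t pointer (on this LP64
 * kernel sizeof (tmpp) == sizeof (uint64_t), so the division above
 * gives the intended word count).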
7492 */ 7493 for (i = 0; i < VSW_RING_NUM_EL; i++) { 7494 mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL); 7495 7496 if ((ldc_mem_alloc_handle(ldcp->ldc_handle, 7497 &priv_addr->memhandle)) != 0) { 7498 DERR(vswp, "%s: alloc mem handle failed", name); 7499 goto setup_ring_cleanup; 7500 } 7501 7502 priv_addr->datap = (void *)tmpp; 7503 7504 rv = ldc_mem_bind_handle(priv_addr->memhandle, 7505 (caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ, 7506 LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W, 7507 &(priv_addr->memcookie[0]), &ncookies); 7508 if (rv != 0) { 7509 DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed " 7510 "(rv %d)", name, ldcp->ldc_id, rv); 7511 goto setup_ring_cleanup; 7512 } 7513 priv_addr->bound = 1; 7514 7515 D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx", 7516 name, i, priv_addr->memcookie[0].addr, 7517 priv_addr->memcookie[0].size); 7518 7519 if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) { 7520 DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned " 7521 "invalid num of cookies (%d) for size 0x%llx", 7522 name, ldcp->ldc_id, ncookies, 7523 VSW_RING_EL_DATA_SZ); 7524 7525 goto setup_ring_cleanup; 7526 } else { 7527 for (j = 1; j < ncookies; j++) { 7528 rv = ldc_mem_nextcookie(priv_addr->memhandle, 7529 &(priv_addr->memcookie[j])); 7530 if (rv != 0) { 7531 DERR(vswp, "%s: ldc_mem_nextcookie " 7532 "failed rv (%d)", name, rv); 7533 goto setup_ring_cleanup; 7534 } 7535 D3(vswp, "%s: memcookie %d : addr 0x%llx : " 7536 "size 0x%llx", name, j, 7537 priv_addr->memcookie[j].addr, 7538 priv_addr->memcookie[j].size); 7539 } 7540 7541 } 7542 priv_addr->ncookies = ncookies; 7543 priv_addr->dstate = VIO_DESC_FREE; 7544 7545 if (pub_addr != NULL) { 7546 7547 /* link pub and private sides */ 7548 priv_addr->descp = pub_addr; 7549 7550 pub_addr->ncookies = priv_addr->ncookies; 7551 7552 for (nc = 0; nc < pub_addr->ncookies; nc++) { 7553 bcopy(&priv_addr->memcookie[nc], 7554 &pub_addr->memcookie[nc], 7555 sizeof (ldc_mem_cookie_t)); 7556 } 7557 7558 pub_addr->hdr.dstate = VIO_DESC_FREE; 7559 pub_addr++; 7560 } 7561 7562 /* 7563 * move to next element in the dring and the next 7564 * position in the data buffer. 7565 */ 7566 priv_addr++; 7567 tmpp += offset; 7568 } 7569 7570 return (0); 7571 7572 setup_ring_cleanup: 7573 priv_addr = dp->priv_addr; 7574 7575 for (j = 0; j < i; j++) { 7576 (void) ldc_mem_unbind_handle(priv_addr->memhandle); 7577 (void) ldc_mem_free_handle(priv_addr->memhandle); 7578 7579 mutex_destroy(&priv_addr->dstate_lock); 7580 7581 priv_addr++; 7582 } 7583 kmem_free(dp->data_addr, dp->data_sz); 7584 7585 return (1); 7586 } 7587 7588 /* 7589 * Searches the private section of a ring for a free descriptor, 7590 * starting at the location of the last free descriptor found 7591 * previously. 7592 * 7593 * Returns 0 if free descriptor is available, and updates state 7594 * of private descriptor to VIO_DESC_READY, otherwise returns 1. 7595 * 7596 * FUTURE: might need to return contiguous range of descriptors 7597 * as dring info msg assumes all will be contiguous. 
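 *
 * Typical usage, as in vsw_dringsend()/vsw_descrsend() (sketch):
 *
 *	vsw_private_desc_t	*priv;
 *	int			idx;
 *
 *	if (vsw_dring_find_free_desc(dp, &priv, &idx) != 0) {
 *		status = LDC_TX_NORESOURCES;
 *		goto free_exit;
 *	}
 *
 * where a non-zero return simply means the ring is currently full.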
7598 */ 7599 static int 7600 vsw_dring_find_free_desc(dring_info_t *dringp, 7601 vsw_private_desc_t **priv_p, int *idx) 7602 { 7603 vsw_private_desc_t *addr = NULL; 7604 int num = VSW_RING_NUM_EL; 7605 int ret = 1; 7606 7607 D1(NULL, "%s enter\n", __func__); 7608 7609 ASSERT(dringp->priv_addr != NULL); 7610 7611 D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld", 7612 __func__, dringp, dringp->end_idx); 7613 7614 addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx; 7615 7616 mutex_enter(&addr->dstate_lock); 7617 if (addr->dstate == VIO_DESC_FREE) { 7618 addr->dstate = VIO_DESC_READY; 7619 *priv_p = addr; 7620 *idx = dringp->end_idx; 7621 dringp->end_idx = (dringp->end_idx + 1) % num; 7622 ret = 0; 7623 7624 } 7625 mutex_exit(&addr->dstate_lock); 7626 7627 /* ring full */ 7628 if (ret == 1) { 7629 D2(NULL, "%s: no desp free: started at %d", __func__, 7630 dringp->end_idx); 7631 } 7632 7633 D1(NULL, "%s: exit\n", __func__); 7634 7635 return (ret); 7636 } 7637 7638 /* 7639 * Map from a dring identifier to the ring itself. Returns 7640 * pointer to ring or NULL if no match found. 7641 */ 7642 static dring_info_t * 7643 vsw_ident2dring(lane_t *lane, uint64_t ident) 7644 { 7645 dring_info_t *dp = NULL; 7646 7647 if ((dp = lane->dringp) == NULL) { 7648 return (NULL); 7649 } else { 7650 if (dp->ident == ident) 7651 return (dp); 7652 7653 while (dp != NULL) { 7654 if (dp->ident == ident) 7655 break; 7656 dp = dp->next; 7657 } 7658 } 7659 7660 return (dp); 7661 } 7662 7663 /* 7664 * Set the default lane attributes. These are copied into 7665 * the attr msg we send to our peer. If they are not acceptable 7666 * then (currently) the handshake ends. 7667 */ 7668 static void 7669 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp) 7670 { 7671 bzero(lp, sizeof (lane_t)); 7672 7673 READ_ENTER(&vswp->if_lockrw); 7674 ether_copy(&(vswp->if_addr), &(lp->addr)); 7675 RW_EXIT(&vswp->if_lockrw); 7676 7677 lp->mtu = VSW_MTU; 7678 lp->addr_type = ADDR_TYPE_MAC; 7679 lp->xfer_mode = VIO_DRING_MODE; 7680 lp->ack_freq = 0; /* for shared mode */ 7681 7682 mutex_enter(&lp->seq_lock); 7683 lp->seq_num = VNET_ISS; 7684 mutex_exit(&lp->seq_lock); 7685 } 7686 7687 /* 7688 * Verify that the attributes are acceptable. 7689 * 7690 * FUTURE: If some attributes are not acceptable, change them 7691 * our desired values. 7692 */ 7693 static int 7694 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port) 7695 { 7696 int ret = 0; 7697 7698 D1(NULL, "vsw_check_attr enter\n"); 7699 7700 /* 7701 * Note we currently only support in-band descriptors 7702 * and descriptor rings, not packet based transfer (VIO_PKT_MODE) 7703 */ 7704 if ((pkt->xfer_mode != VIO_DESC_MODE) && 7705 (pkt->xfer_mode != VIO_DRING_MODE)) { 7706 D2(NULL, "vsw_check_attr: unknown mode %x\n", 7707 pkt->xfer_mode); 7708 ret = 1; 7709 } 7710 7711 /* Only support MAC addresses at moment. */ 7712 if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) { 7713 D2(NULL, "vsw_check_attr: invalid addr_type %x, " 7714 "or address 0x%llx\n", pkt->addr_type, 7715 pkt->addr); 7716 ret = 1; 7717 } 7718 7719 /* 7720 * MAC address supplied by device should match that stored 7721 * in the vsw-port OBP node. Need to decide what to do if they 7722 * don't match, for the moment just warn but don't fail. 
7723 */ 7724 if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) { 7725 DERR(NULL, "vsw_check_attr: device supplied address " 7726 "0x%llx doesn't match node address 0x%llx\n", 7727 pkt->addr, port->p_macaddr); 7728 } 7729 7730 /* 7731 * Ack freq only makes sense in pkt mode; in shared 7732 * mode the ring descriptors say whether or not to 7733 * send back an ACK. 7734 */ 7735 if ((pkt->xfer_mode == VIO_DRING_MODE) && 7736 (pkt->ack_freq > 0)) { 7737 D2(NULL, "vsw_check_attr: non-zero ack freq" 7738 " in SHM mode\n"); 7739 ret = 1; 7740 } 7741 7742 /* 7743 * Note: for the moment we only support ETHER 7744 * frames. This may change in the future. 7745 */ 7746 if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) { 7747 D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n", 7748 pkt->mtu); 7749 ret = 1; 7750 } 7751 7752 D1(NULL, "vsw_check_attr exit\n"); 7753 7754 return (ret); 7755 } 7756 7757 /* 7758 * Returns 1 if there is a problem, 0 otherwise. 7759 */ 7760 static int 7761 vsw_check_dring_info(vio_dring_reg_msg_t *pkt) 7762 { 7763 7764 7765 int ret = 0; 7766 7767 D1(NULL, "vsw_check_dring_info enter\n"); 7768 7769 if ((pkt->num_descriptors == 0) || 7770 (pkt->descriptor_size == 0) || 7771 (pkt->ncookies != 1)) { 7772 DERR(NULL, "vsw_check_dring_info: invalid dring msg"); 7773 ret = 1; 7774 } 7775 7776 D1(NULL, "vsw_check_dring_info exit\n"); 7777 7778 return (ret); 7779 } 7780 7781 /* 7782 * Returns 1 if two memory cookies match. Otherwise returns 0. 7783 */ 7784 static int 7785 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2) 7786 { 7787 if ((m1->addr != m2->addr) || 7788 (m1->size != m2->size)) { 7789 return (0); 7790 } else { 7791 return (1); 7792 } 7793 } 7794 7795 /* 7796 * Returns 1 if ring described in reg message matches that 7797 * described by dring_info structure. Otherwise returns 0. 7798 */ 7799 static int 7800 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg) 7801 { 7802 if ((msg->descriptor_size != dp->descriptor_size) || 7803 (msg->num_descriptors != dp->num_descriptors) || 7804 (msg->ncookies != dp->ncookies) || 7805 !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) { 7806 return (0); 7807 } else { 7808 return (1); 7809 } 7810 7811 } 7812 7813 static caddr_t 7814 vsw_print_ethaddr(uint8_t *a, char *ebuf) 7815 { 7816 (void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x", 7817 a[0], a[1], a[2], a[3], a[4], a[5]); 7818 return (ebuf); 7819 } 7820 7821 /* 7822 * Reset and free all the resources associated with 7823 * the channel.
7824 */ 7825 static void 7826 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir) 7827 { 7828 dring_info_t *dp, *dpp; 7829 lane_t *lp = NULL; 7830 int rv = 0; 7831 7832 ASSERT(ldcp != NULL); 7833 7834 D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id); 7835 7836 if (dir == INBOUND) { 7837 D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane" 7838 " of channel %lld", __func__, ldcp->ldc_id); 7839 lp = &ldcp->lane_in; 7840 } else { 7841 D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane" 7842 " of channel %lld", __func__, ldcp->ldc_id); 7843 lp = &ldcp->lane_out; 7844 } 7845 7846 lp->lstate = VSW_LANE_INACTIV; 7847 mutex_enter(&lp->seq_lock); 7848 lp->seq_num = VNET_ISS; 7849 mutex_exit(&lp->seq_lock); 7850 if (lp->dringp) { 7851 if (dir == INBOUND) { 7852 dp = lp->dringp; 7853 while (dp != NULL) { 7854 dpp = dp->next; 7855 if (dp->handle != NULL) 7856 (void) ldc_mem_dring_unmap(dp->handle); 7857 kmem_free(dp, sizeof (dring_info_t)); 7858 dp = dpp; 7859 } 7860 } else { 7861 /* 7862 * unbind, destroy exported dring, free dring struct 7863 */ 7864 dp = lp->dringp; 7865 rv = vsw_free_ring(dp); 7866 } 7867 if (rv == 0) { 7868 lp->dringp = NULL; 7869 } 7870 } 7871 7872 D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id); 7873 } 7874 7875 /* 7876 * Free ring and all associated resources. 7877 */ 7878 static int 7879 vsw_free_ring(dring_info_t *dp) 7880 { 7881 vsw_private_desc_t *paddr = NULL; 7882 dring_info_t *dpp; 7883 int i, rv = 1; 7884 7885 while (dp != NULL) { 7886 mutex_enter(&dp->dlock); 7887 dpp = dp->next; 7888 if (dp->priv_addr != NULL) { 7889 /* 7890 * First unbind and free the memory handles 7891 * stored in each descriptor within the ring. 7892 */ 7893 for (i = 0; i < VSW_RING_NUM_EL; i++) { 7894 paddr = (vsw_private_desc_t *) 7895 dp->priv_addr + i; 7896 if (paddr->memhandle != NULL) { 7897 if (paddr->bound == 1) { 7898 rv = ldc_mem_unbind_handle( 7899 paddr->memhandle); 7900 7901 if (rv != 0) { 7902 DERR(NULL, "error " 7903 "unbinding handle for " 7904 "ring 0x%llx at pos %d", 7905 dp, i); 7906 mutex_exit(&dp->dlock); 7907 return (rv); 7908 } 7909 paddr->bound = 0; 7910 } 7911 7912 rv = ldc_mem_free_handle( 7913 paddr->memhandle); 7914 if (rv != 0) { 7915 DERR(NULL, "error freeing " 7916 "handle for ring " 7917 "0x%llx at pos %d", 7918 dp, i); 7919 mutex_exit(&dp->dlock); 7920 return (rv); 7921 } 7922 paddr->memhandle = NULL; 7923 } 7924 mutex_destroy(&paddr->dstate_lock); 7925 } 7926 kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t) 7927 * VSW_RING_NUM_EL)); 7928 } 7929 7930 /* 7931 * Now unbind and destroy the ring itself. 
7932 */ 7933 if (dp->handle != NULL) { 7934 (void) ldc_mem_dring_unbind(dp->handle); 7935 (void) ldc_mem_dring_destroy(dp->handle); 7936 } 7937 7938 if (dp->data_addr != NULL) { 7939 kmem_free(dp->data_addr, dp->data_sz); 7940 } 7941 7942 mutex_exit(&dp->dlock); 7943 mutex_destroy(&dp->dlock); 7944 mutex_destroy(&dp->restart_lock); 7945 kmem_free(dp, sizeof (dring_info_t)); 7946 7947 dp = dpp; 7948 } 7949 return (0); 7950 } 7951 7952 /* 7953 * Debugging routines 7954 */ 7955 static void 7956 display_state(void) 7957 { 7958 vsw_t *vswp; 7959 vsw_port_list_t *plist; 7960 vsw_port_t *port; 7961 vsw_ldc_list_t *ldcl; 7962 vsw_ldc_t *ldcp; 7963 7964 cmn_err(CE_NOTE, "***** system state *****"); 7965 7966 for (vswp = vsw_head; vswp; vswp = vswp->next) { 7967 plist = &vswp->plist; 7968 READ_ENTER(&plist->lockrw); 7969 cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n", 7970 vswp->instance, plist->num_ports); 7971 7972 for (port = plist->head; port != NULL; port = port->p_next) { 7973 ldcl = &port->p_ldclist; 7974 cmn_err(CE_CONT, "port %d : %d ldcs attached\n", 7975 port->p_instance, ldcl->num_ldcs); 7976 READ_ENTER(&ldcl->lockrw); 7977 ldcp = ldcl->head; 7978 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 7979 cmn_err(CE_CONT, "chan %lu : dev %d : " 7980 "status %d : phase %u\n", 7981 ldcp->ldc_id, ldcp->dev_class, 7982 ldcp->ldc_status, ldcp->hphase); 7983 cmn_err(CE_CONT, "chan %lu : lsession %lu : " 7984 "psession %lu\n", 7985 ldcp->ldc_id, 7986 ldcp->local_session, 7987 ldcp->peer_session); 7988 7989 cmn_err(CE_CONT, "Inbound lane:\n"); 7990 display_lane(&ldcp->lane_in); 7991 cmn_err(CE_CONT, "Outbound lane:\n"); 7992 display_lane(&ldcp->lane_out); 7993 } 7994 RW_EXIT(&ldcl->lockrw); 7995 } 7996 RW_EXIT(&plist->lockrw); 7997 } 7998 cmn_err(CE_NOTE, "***** system state *****"); 7999 } 8000 8001 static void 8002 display_lane(lane_t *lp) 8003 { 8004 dring_info_t *drp; 8005 8006 cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n", 8007 lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu); 8008 cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n", 8009 lp->addr_type, lp->addr, lp->xfer_mode); 8010 cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp); 8011 8012 cmn_err(CE_CONT, "Dring info:\n"); 8013 for (drp = lp->dringp; drp != NULL; drp = drp->next) { 8014 cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n", 8015 drp->num_descriptors, drp->descriptor_size); 8016 cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle); 8017 cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n", 8018 (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr); 8019 cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n", 8020 drp->ident, drp->end_idx); 8021 display_ring(drp); 8022 } 8023 } 8024 8025 static void 8026 display_ring(dring_info_t *dringp) 8027 { 8028 uint64_t i; 8029 uint64_t priv_count = 0; 8030 uint64_t pub_count = 0; 8031 vnet_public_desc_t *pub_addr = NULL; 8032 vsw_private_desc_t *priv_addr = NULL; 8033 8034 for (i = 0; i < VSW_RING_NUM_EL; i++) { 8035 if (dringp->pub_addr != NULL) { 8036 pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i; 8037 8038 if (pub_addr->hdr.dstate == VIO_DESC_FREE) 8039 pub_count++; 8040 } 8041 8042 if (dringp->priv_addr != NULL) { 8043 priv_addr = 8044 (vsw_private_desc_t *)dringp->priv_addr + i; 8045 8046 if (priv_addr->dstate == VIO_DESC_FREE) 8047 priv_count++; 8048 } 8049 } 8050 cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n", 8051 i, priv_count, pub_count); 8052 } 8053 8054 static void 8055 dump_flags(uint64_t state) 8056 { 8057 int i; 8058 8059 
typedef struct flag_name { 8060 int flag_val; 8061 char *flag_name; 8062 } flag_name_t; 8063 8064 flag_name_t flags[] = { 8065 VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT", 8066 VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV", 8067 VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV", 8068 VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT", 8069 VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV", 8070 VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT", 8071 VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT", 8072 VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV", 8073 VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT", 8074 VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV", 8075 VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT", 8076 VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV", 8077 VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT", 8078 VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV", 8079 VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT", 8080 VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV", 8081 VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT", 8082 VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV", 8083 VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT", 8084 VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV", 8085 VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT", 8086 VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV", 8087 VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT", 8088 VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV", 8089 VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT", 8090 VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV", 8091 VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT", 8092 VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV", 8093 VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT", 8094 VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV", 8095 VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"}; 8096 8097 DERR(NULL, "DUMP_FLAGS: %llx\n", state); 8098 for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) { 8099 if (state & flags[i].flag_val) 8100 DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name); 8101 } 8102 } 8103