1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/errno.h> 31 #include <sys/debug.h> 32 #include <sys/time.h> 33 #include <sys/sysmacros.h> 34 #include <sys/systm.h> 35 #include <sys/user.h> 36 #include <sys/stropts.h> 37 #include <sys/stream.h> 38 #include <sys/strlog.h> 39 #include <sys/strsubr.h> 40 #include <sys/cmn_err.h> 41 #include <sys/cpu.h> 42 #include <sys/kmem.h> 43 #include <sys/conf.h> 44 #include <sys/ddi.h> 45 #include <sys/sunddi.h> 46 #include <sys/ksynch.h> 47 #include <sys/stat.h> 48 #include <sys/kstat.h> 49 #include <sys/vtrace.h> 50 #include <sys/strsun.h> 51 #include <sys/dlpi.h> 52 #include <sys/ethernet.h> 53 #include <net/if.h> 54 #include <sys/varargs.h> 55 #include <sys/machsystm.h> 56 #include <sys/modctl.h> 57 #include <sys/modhash.h> 58 #include <sys/mac.h> 59 #include <sys/mac_ether.h> 60 #include <sys/taskq.h> 61 #include <sys/note.h> 62 #include <sys/mach_descrip.h> 63 #include <sys/mac.h> 64 #include <sys/mdeg.h> 65 #include <sys/ldc.h> 66 #include <sys/vsw_fdb.h> 67 #include <sys/vsw.h> 68 #include <sys/vio_mailbox.h> 69 #include <sys/vnet_mailbox.h> 70 #include <sys/vnet_common.h> 71 #include <sys/vio_util.h> 72 #include <sys/sdt.h> 73 #include <sys/atomic.h> 74 75 /* Switching setup routines */ 76 void vsw_setup_switching_timeout(void *arg); 77 void vsw_stop_switching_timeout(vsw_t *vswp); 78 int vsw_setup_switching(vsw_t *); 79 static int vsw_setup_layer2(vsw_t *); 80 static int vsw_setup_layer3(vsw_t *); 81 82 /* Switching/data transmit routines */ 83 static void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, 84 vsw_port_t *port, mac_resource_handle_t); 85 static void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, 86 vsw_port_t *port, mac_resource_handle_t); 87 static int vsw_forward_all(vsw_t *vswp, mblk_t *mp, mblk_t *mpt, 88 int caller, vsw_port_t *port); 89 static int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, mblk_t *mpt, 90 int caller, vsw_port_t *port); 91 92 /* Forwarding database (FDB) routines */ 93 static vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *); 94 int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *); 95 void vsw_del_mcst_port(vsw_port_t *); 96 int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *); 97 int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *); 98 void vsw_del_mcst_vsw(vsw_t *); 99 int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port); 100 int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port); 101 102 /* Support functions */ 103 static mblk_t *vsw_dupmsgchain(mblk_t *mp); 104 static int vsw_get_same_dest_list(struct ether_header *ehp, 105 mblk_t **rhead, mblk_t **rtail, mblk_t **mpp); 106 107 108 /* 109 * Functions imported from other files. 110 */ 111 extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *); 112 extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t); 113 extern int vsw_mac_open(vsw_t *vswp); 114 extern void vsw_mac_close(vsw_t *vswp); 115 extern void vsw_mac_rx(vsw_t *vswp, int caller, mac_resource_handle_t mrh, 116 mblk_t *mp, mblk_t *mpt, vsw_macrx_flags_t flags); 117 extern void vsw_mac_rx(vsw_t *vswp, int caller, mac_resource_handle_t mrh, 118 mblk_t *mp, mblk_t *mpt, vsw_macrx_flags_t flags); 119 extern void vsw_set_addrs(vsw_t *vswp); 120 extern int vsw_get_hw_maddr(vsw_t *); 121 extern int vsw_mac_attach(vsw_t *vswp); 122 extern int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt); 123 124 /* 125 * Tunables used in this file. 126 */ 127 extern int vsw_setup_switching_delay; 128 129 130 /* 131 * Timeout routine to setup switching mode: 132 * vsw_setup_switching() is invoked from vsw_attach() or vsw_update_md_prop() 133 * initially. If it fails and the error is EAGAIN, then this timeout handler 134 * is started to retry vsw_setup_switching(). vsw_setup_switching() is retried 135 * until we successfully finish it; or the returned error is not EAGAIN. 136 */ 137 void 138 vsw_setup_switching_timeout(void *arg) 139 { 140 vsw_t *vswp = (vsw_t *)arg; 141 int rv; 142 143 if (vswp->swtmout_enabled == B_FALSE) 144 return; 145 146 rv = vsw_setup_switching(vswp); 147 148 if (rv == 0) { 149 /* 150 * Successfully setup switching mode. 151 * Program unicst, mcst addrs of vsw 152 * interface and ports in the physdev. 153 */ 154 vsw_set_addrs(vswp); 155 } 156 157 mutex_enter(&vswp->swtmout_lock); 158 159 if (rv == EAGAIN && vswp->swtmout_enabled == B_TRUE) { 160 /* 161 * Reschedule timeout() if the error is EAGAIN and the 162 * timeout is still enabled. For errors other than EAGAIN, 163 * we simply return without rescheduling timeout(). 164 */ 165 vswp->swtmout_id = 166 timeout(vsw_setup_switching_timeout, vswp, 167 (vsw_setup_switching_delay * drv_usectohz(MICROSEC))); 168 goto exit; 169 } 170 171 /* timeout handler completed */ 172 vswp->swtmout_enabled = B_FALSE; 173 vswp->swtmout_id = 0; 174 175 exit: 176 mutex_exit(&vswp->swtmout_lock); 177 } 178 179 /* 180 * Cancel the timeout handler to setup switching mode. 181 */ 182 void 183 vsw_stop_switching_timeout(vsw_t *vswp) 184 { 185 timeout_id_t tid; 186 187 mutex_enter(&vswp->swtmout_lock); 188 189 tid = vswp->swtmout_id; 190 191 if (tid != 0) { 192 /* signal timeout handler to stop */ 193 vswp->swtmout_enabled = B_FALSE; 194 vswp->swtmout_id = 0; 195 mutex_exit(&vswp->swtmout_lock); 196 197 (void) untimeout(tid); 198 } else { 199 mutex_exit(&vswp->swtmout_lock); 200 } 201 202 (void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE); 203 204 mutex_enter(&vswp->mac_lock); 205 vswp->mac_open_retries = 0; 206 mutex_exit(&vswp->mac_lock); 207 } 208 209 /* 210 * Setup the required switching mode. 211 * This routine is invoked from vsw_attach() or vsw_update_md_prop() 212 * initially. If it fails and the error is EAGAIN, then a timeout handler 213 * is started to retry vsw_setup_switching(), until it successfully finishes; 214 * or the returned error is not EAGAIN. 215 * 216 * Returns: 217 * 0 on success. 218 * EAGAIN if retry is needed. 219 * 1 on all other failures. 220 */ 221 int 222 vsw_setup_switching(vsw_t *vswp) 223 { 224 int i, rv = 1; 225 226 D1(vswp, "%s: enter", __func__); 227 228 /* 229 * Select best switching mode. 230 * Note that we start from the saved smode_idx. This is done as 231 * this routine can be called from the timeout handler to retry 232 * setting up a specific mode. Currently only the function which 233 * sets up layer2/promisc mode returns EAGAIN if the underlying 234 * physical device is not available yet, causing retries. 235 */ 236 for (i = vswp->smode_idx; i < vswp->smode_num; i++) { 237 vswp->smode_idx = i; 238 switch (vswp->smode[i]) { 239 case VSW_LAYER2: 240 case VSW_LAYER2_PROMISC: 241 rv = vsw_setup_layer2(vswp); 242 break; 243 244 case VSW_LAYER3: 245 rv = vsw_setup_layer3(vswp); 246 break; 247 248 default: 249 DERR(vswp, "unknown switch mode"); 250 break; 251 } 252 253 if ((rv == 0) || (rv == EAGAIN)) 254 break; 255 256 /* all other errors(rv != 0): continue & select the next mode */ 257 rv = 1; 258 } 259 260 if (rv && (rv != EAGAIN)) { 261 cmn_err(CE_WARN, "!vsw%d: Unable to setup specified " 262 "switching mode", vswp->instance); 263 } else if (rv == 0) { 264 (void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE); 265 } 266 267 D2(vswp, "%s: Operating in mode %d", __func__, 268 vswp->smode[vswp->smode_idx]); 269 270 D1(vswp, "%s: exit", __func__); 271 272 return (rv); 273 } 274 275 /* 276 * Setup for layer 2 switching. 277 * 278 * Returns: 279 * 0 on success. 280 * EAGAIN if retry is needed. 281 * EIO on all other failures. 282 */ 283 static int 284 vsw_setup_layer2(vsw_t *vswp) 285 { 286 int rv; 287 288 D1(vswp, "%s: enter", __func__); 289 290 vswp->vsw_switch_frame = vsw_switch_l2_frame; 291 292 rv = strlen(vswp->physname); 293 if (rv == 0) { 294 /* 295 * Physical device name is NULL, which is 296 * required for layer 2. 297 */ 298 cmn_err(CE_WARN, "!vsw%d: no physical device name specified", 299 vswp->instance); 300 return (EIO); 301 } 302 303 mutex_enter(&vswp->mac_lock); 304 305 rv = vsw_mac_open(vswp); 306 if (rv != 0) { 307 if (rv != EAGAIN) { 308 cmn_err(CE_WARN, "!vsw%d: Unable to open physical " 309 "device: %s\n", vswp->instance, vswp->physname); 310 } 311 mutex_exit(&vswp->mac_lock); 312 return (rv); 313 } 314 315 if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) { 316 /* 317 * Verify that underlying device can support multiple 318 * unicast mac addresses. 319 */ 320 rv = vsw_get_hw_maddr(vswp); 321 if (rv != 0) { 322 cmn_err(CE_WARN, "!vsw%d: Unable to setup " 323 "layer2 switching", vswp->instance); 324 goto exit_error; 325 } 326 } 327 328 /* 329 * Attempt to link into the MAC layer so we can get 330 * and send packets out over the physical adapter. 331 */ 332 rv = vsw_mac_attach(vswp); 333 if (rv != 0) { 334 /* 335 * Registration with the MAC layer has failed, 336 * so return error so that can fall back to next 337 * prefered switching method. 338 */ 339 cmn_err(CE_WARN, "!vsw%d: Unable to setup physical device: " 340 "%s\n", vswp->instance, vswp->physname); 341 goto exit_error; 342 } 343 344 D1(vswp, "%s: exit", __func__); 345 346 mutex_exit(&vswp->mac_lock); 347 return (0); 348 349 exit_error: 350 vsw_mac_close(vswp); 351 mutex_exit(&vswp->mac_lock); 352 return (EIO); 353 } 354 355 static int 356 vsw_setup_layer3(vsw_t *vswp) 357 { 358 D1(vswp, "%s: enter", __func__); 359 360 D2(vswp, "%s: operating in layer 3 mode", __func__); 361 vswp->vsw_switch_frame = vsw_switch_l3_frame; 362 363 D1(vswp, "%s: exit", __func__); 364 365 return (0); 366 } 367 368 /* 369 * Switch the given ethernet frame when operating in layer 2 mode. 370 * 371 * vswp: pointer to the vsw instance 372 * mp: pointer to chain of ethernet frame(s) to be switched 373 * caller: identifies the source of this frame as: 374 * 1. VSW_VNETPORT - a vsw port (connected to a vnet). 375 * 2. VSW_PHYSDEV - the physical ethernet device 376 * 3. VSW_LOCALDEV - vsw configured as a virtual interface 377 * arg: argument provided by the caller. 378 * 1. for VNETPORT - pointer to the corresponding vsw_port_t. 379 * 2. for PHYSDEV - NULL 380 * 3. for LOCALDEV - pointer to to this vsw_t(self) 381 */ 382 void 383 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, 384 vsw_port_t *arg, mac_resource_handle_t mrh) 385 { 386 struct ether_header *ehp; 387 vsw_port_t *port = NULL; 388 mblk_t *bp, *ret_m; 389 mblk_t *mpt = NULL; 390 int rv; 391 vsw_port_list_t *plist = &vswp->plist; 392 393 D1(vswp, "%s: enter (caller %d)", __func__, caller); 394 395 /* 396 * PERF: rather than breaking up the chain here, scan it 397 * to find all mblks heading to same destination and then 398 * pass that sub-chain to the lower transmit functions. 399 */ 400 401 /* process the chain of packets */ 402 bp = mp; 403 while (bp) { 404 ehp = (struct ether_header *)bp->b_rptr; 405 rv = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp); 406 ASSERT(rv != 0); 407 408 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 409 __func__, MBLKSIZE(mp), MBLKL(mp)); 410 411 if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) { 412 /* 413 * If destination is VSW_LOCALDEV (vsw as an eth 414 * interface) and if the device is up & running, 415 * send the packet up the stack on this host. 416 * If the virtual interface is down, drop the packet. 417 */ 418 if (caller != VSW_LOCALDEV) { 419 vsw_mac_rx(vswp, caller, mrh, mp, 420 mpt, VSW_MACRX_FREEMSG); 421 } else { 422 freemsgchain(mp); 423 } 424 continue; 425 } 426 427 READ_ENTER(&plist->lockrw); 428 port = vsw_lookup_fdb(vswp, ehp); 429 if (port) { 430 /* 431 * Mark the port as in-use before releasing the lockrw. 432 */ 433 VSW_PORT_REFHOLD(port); 434 RW_EXIT(&plist->lockrw); 435 436 /* 437 * If plumbed and in promisc mode then copy msg 438 * and send up the stack. 439 */ 440 vsw_mac_rx(vswp, caller, mrh, mp, 441 mpt, VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG); 442 443 /* 444 * If the destination is in FDB, the packet 445 * should be forwarded to the correponding 446 * vsw_port (connected to a vnet device - 447 * VSW_VNETPORT) 448 */ 449 (void) vsw_portsend(port, mp, mpt); 450 451 /* 452 * Decrement use count in port. 453 */ 454 VSW_PORT_REFRELE(port); 455 } else { 456 RW_EXIT(&plist->lockrw); 457 /* 458 * Destination not in FDB. 459 * 460 * If the destination is broadcast or 461 * multicast forward the packet to all 462 * (VNETPORTs, PHYSDEV, LOCALDEV), 463 * except the caller. 464 */ 465 if (IS_BROADCAST(ehp)) { 466 D3(vswp, "%s: BROADCAST pkt", __func__); 467 (void) vsw_forward_all(vswp, mp, mpt, 468 caller, arg); 469 } else if (IS_MULTICAST(ehp)) { 470 D3(vswp, "%s: MULTICAST pkt", __func__); 471 (void) vsw_forward_grp(vswp, mp, mpt, 472 caller, arg); 473 } else { 474 /* 475 * If the destination is unicast, and came 476 * from either a logical network device or 477 * the switch itself when it is plumbed, then 478 * send it out on the physical device and also 479 * up the stack if the logical interface is 480 * in promiscious mode. 481 * 482 * NOTE: The assumption here is that if we 483 * cannot find the destination in our fdb, its 484 * a unicast address, and came from either a 485 * vnet or down the stack (when plumbed) it 486 * must be destinded for an ethernet device 487 * outside our ldoms. 488 */ 489 if (caller == VSW_VNETPORT) { 490 /* promisc check copy etc */ 491 vsw_mac_rx(vswp, caller, mrh, mp, mpt, 492 VSW_MACRX_PROMISC | 493 VSW_MACRX_COPYMSG); 494 495 if ((ret_m = vsw_tx_msg(vswp, mp)) 496 != NULL) { 497 DERR(vswp, "%s: drop mblks to " 498 "phys dev", __func__); 499 freemsgchain(ret_m); 500 } 501 502 } else if (caller == VSW_PHYSDEV) { 503 /* 504 * Pkt seen because card in promisc 505 * mode. Send up stack if plumbed in 506 * promisc mode, else drop it. 507 */ 508 vsw_mac_rx(vswp, caller, mrh, mp, mpt, 509 VSW_MACRX_PROMISC | 510 VSW_MACRX_FREEMSG); 511 512 } else if (caller == VSW_LOCALDEV) { 513 /* 514 * Pkt came down the stack, send out 515 * over physical device. 516 */ 517 if ((ret_m = vsw_tx_msg(vswp, mp)) 518 != NULL) { 519 DERR(vswp, "%s: drop mblks to " 520 "phys dev", __func__); 521 freemsgchain(ret_m); 522 } 523 } 524 } 525 } 526 } 527 D1(vswp, "%s: exit\n", __func__); 528 } 529 530 /* 531 * Switch ethernet frame when in layer 3 mode (i.e. using IP 532 * layer to do the routing). 533 * 534 * There is a large amount of overlap between this function and 535 * vsw_switch_l2_frame. At some stage we need to revisit and refactor 536 * both these functions. 537 */ 538 void 539 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, 540 vsw_port_t *arg, mac_resource_handle_t mrh) 541 { 542 struct ether_header *ehp; 543 vsw_port_t *port = NULL; 544 mblk_t *bp = NULL; 545 mblk_t *mpt; 546 int rv; 547 vsw_port_list_t *plist = &vswp->plist; 548 549 D1(vswp, "%s: enter (caller %d)", __func__, caller); 550 551 /* 552 * In layer 3 mode should only ever be switching packets 553 * between IP layer and vnet devices. So make sure thats 554 * who is invoking us. 555 */ 556 if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) { 557 DERR(vswp, "%s: unexpected caller (%d)", __func__, caller); 558 freemsgchain(mp); 559 return; 560 } 561 562 /* process the chain of packets */ 563 bp = mp; 564 while (bp) { 565 ehp = (struct ether_header *)bp->b_rptr; 566 rv = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp); 567 ASSERT(rv != 0); 568 569 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 570 __func__, MBLKSIZE(mp), MBLKL(mp)); 571 572 READ_ENTER(&plist->lockrw); 573 port = vsw_lookup_fdb(vswp, ehp); 574 if (port) { 575 /* 576 * Mark the port as in-use before releasing the lockrw. 577 */ 578 VSW_PORT_REFHOLD(port); 579 RW_EXIT(&plist->lockrw); 580 581 D2(vswp, "%s: sending to target port", __func__); 582 (void) vsw_portsend(port, mp, mpt); 583 584 /* 585 * Decrement ref count. 586 */ 587 VSW_PORT_REFRELE(port); 588 } else { 589 RW_EXIT(&plist->lockrw); 590 /* 591 * Destination not in FDB 592 * 593 * If the destination is broadcast or 594 * multicast forward the packet to all 595 * (VNETPORTs, PHYSDEV, LOCALDEV), 596 * except the caller. 597 */ 598 if (IS_BROADCAST(ehp)) { 599 D2(vswp, "%s: BROADCAST pkt", __func__); 600 (void) vsw_forward_all(vswp, mp, mpt, 601 caller, arg); 602 } else if (IS_MULTICAST(ehp)) { 603 D2(vswp, "%s: MULTICAST pkt", __func__); 604 (void) vsw_forward_grp(vswp, mp, mpt, 605 caller, arg); 606 } else { 607 /* 608 * Unicast pkt from vnet that we don't have 609 * an FDB entry for, so must be destinded for 610 * the outside world. Attempt to send up to the 611 * IP layer to allow it to deal with it. 612 */ 613 if (caller == VSW_VNETPORT) { 614 vsw_mac_rx(vswp, caller, mrh, 615 mp, mpt, VSW_MACRX_FREEMSG); 616 } 617 } 618 } 619 } 620 621 D1(vswp, "%s: exit", __func__); 622 } 623 624 /* 625 * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV), 626 * except the caller (port on which frame arrived). 627 */ 628 static int 629 vsw_forward_all(vsw_t *vswp, mblk_t *mp, mblk_t *mpt, 630 int caller, vsw_port_t *arg) 631 { 632 vsw_port_list_t *plist = &vswp->plist; 633 vsw_port_t *portp; 634 mblk_t *nmp = NULL; 635 mblk_t *ret_m = NULL; 636 int skip_port = 0; 637 638 D1(vswp, "vsw_forward_all: enter\n"); 639 640 /* 641 * Broadcast message from inside ldoms so send to outside 642 * world if in either of layer 2 modes. 643 */ 644 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 645 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 646 ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) { 647 648 nmp = vsw_dupmsgchain(mp); 649 if (nmp) { 650 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 651 DERR(vswp, "%s: dropping pkt(s) " 652 "consisting of %ld bytes of data for" 653 " physical device", __func__, MBLKL(ret_m)); 654 freemsgchain(ret_m); 655 } 656 } 657 } 658 659 if (caller == VSW_VNETPORT) 660 skip_port = 1; 661 662 /* 663 * Broadcast message from other vnet (layer 2 or 3) or outside 664 * world (layer 2 only), send up stack if plumbed. 665 */ 666 if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) { 667 vsw_mac_rx(vswp, caller, NULL, mp, mpt, VSW_MACRX_COPYMSG); 668 } 669 670 /* send it to all VNETPORTs */ 671 READ_ENTER(&plist->lockrw); 672 for (portp = plist->head; portp != NULL; portp = portp->p_next) { 673 D2(vswp, "vsw_forward_all: port %d", portp->p_instance); 674 /* 675 * Caution ! - don't reorder these two checks as arg 676 * will be NULL if the caller is PHYSDEV. skip_port is 677 * only set if caller is VNETPORT. 678 */ 679 if ((skip_port) && (portp == arg)) { 680 continue; 681 } else { 682 nmp = vsw_dupmsgchain(mp); 683 if (nmp) { 684 mblk_t *mpt = nmp; 685 686 /* Find tail */ 687 while (mpt->b_next != NULL) { 688 mpt = mpt->b_next; 689 } 690 /* 691 * The plist->lockrw is protecting the 692 * portp from getting destroyed here. 693 * So, no ref_cnt is incremented here. 694 */ 695 (void) vsw_portsend(portp, nmp, mpt); 696 } else { 697 DERR(vswp, "vsw_forward_all: nmp NULL"); 698 } 699 } 700 } 701 RW_EXIT(&plist->lockrw); 702 703 freemsgchain(mp); 704 705 D1(vswp, "vsw_forward_all: exit\n"); 706 return (0); 707 } 708 709 /* 710 * Forward pkts to any devices or interfaces which have registered 711 * an interest in them (i.e. multicast groups). 712 */ 713 static int 714 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, mblk_t *mpt, 715 int caller, vsw_port_t *arg) 716 { 717 struct ether_header *ehp = (struct ether_header *)mp->b_rptr; 718 mfdb_ent_t *entp = NULL; 719 mfdb_ent_t *tpp = NULL; 720 vsw_port_t *port; 721 uint64_t key = 0; 722 mblk_t *nmp = NULL; 723 mblk_t *ret_m = NULL; 724 boolean_t check_if = B_TRUE; 725 726 /* 727 * Convert address to hash table key 728 */ 729 KEY_HASH(key, ehp->ether_dhost); 730 731 D1(vswp, "%s: key 0x%llx", __func__, key); 732 733 /* 734 * If pkt came from either a vnet or down the stack (if we are 735 * plumbed) and we are in layer 2 mode, then we send the pkt out 736 * over the physical adapter, and then check to see if any other 737 * vnets are interested in it. 738 */ 739 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 740 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 741 ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) { 742 nmp = vsw_dupmsgchain(mp); 743 if (nmp) { 744 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 745 DERR(vswp, "%s: dropping pkt(s) consisting of " 746 "%ld bytes of data for physical device", 747 __func__, MBLKL(ret_m)); 748 freemsgchain(ret_m); 749 } 750 } 751 } 752 753 READ_ENTER(&vswp->mfdbrw); 754 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key, 755 (mod_hash_val_t *)&entp) != 0) { 756 D3(vswp, "%s: no table entry found for addr 0x%llx", 757 __func__, key); 758 } else { 759 /* 760 * Send to list of devices associated with this address... 761 */ 762 for (tpp = entp; tpp != NULL; tpp = tpp->nextp) { 763 764 /* dont send to ourselves */ 765 if ((caller == VSW_VNETPORT) && 766 (tpp->d_addr == (void *)arg)) { 767 port = (vsw_port_t *)tpp->d_addr; 768 D3(vswp, "%s: not sending to ourselves" 769 " : port %d", __func__, port->p_instance); 770 continue; 771 772 } else if ((caller == VSW_LOCALDEV) && 773 (tpp->d_type == VSW_LOCALDEV)) { 774 D3(vswp, "%s: not sending back up stack", 775 __func__); 776 continue; 777 } 778 779 if (tpp->d_type == VSW_VNETPORT) { 780 port = (vsw_port_t *)tpp->d_addr; 781 D3(vswp, "%s: sending to port %ld for addr " 782 "0x%llx", __func__, port->p_instance, key); 783 784 nmp = vsw_dupmsgchain(mp); 785 if (nmp) { 786 mblk_t *mpt = nmp; 787 788 /* Find tail */ 789 while (mpt->b_next != NULL) { 790 mpt = mpt->b_next; 791 } 792 /* 793 * The vswp->mfdbrw is protecting the 794 * portp from getting destroyed here. 795 * So, no ref_cnt is incremented here. 796 */ 797 (void) vsw_portsend(port, nmp, mpt); 798 } 799 } else { 800 vsw_mac_rx(vswp, caller, NULL, 801 mp, mpt, VSW_MACRX_COPYMSG); 802 D3(vswp, "%s: sending up stack" 803 " for addr 0x%llx", __func__, key); 804 check_if = B_FALSE; 805 } 806 } 807 } 808 809 RW_EXIT(&vswp->mfdbrw); 810 811 /* 812 * If the pkt came from either a vnet or from physical device, 813 * and if we havent already sent the pkt up the stack then we 814 * check now if we can/should (i.e. the interface is plumbed 815 * and in promisc mode). 816 */ 817 if ((check_if) && 818 ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) { 819 vsw_mac_rx(vswp, caller, NULL, mp, mpt, 820 VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG); 821 } 822 823 freemsgchain(mp); 824 825 D1(vswp, "%s: exit", __func__); 826 827 return (0); 828 } 829 830 /* 831 * Add an entry into FDB, for the given mac address and port_id. 832 * Returns 0 on success, 1 on failure. 833 * 834 * Lock protecting FDB must be held by calling process. 835 */ 836 int 837 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port) 838 { 839 uint64_t addr = 0; 840 841 D1(vswp, "%s: enter", __func__); 842 843 KEY_HASH(addr, port->p_macaddr); 844 845 D2(vswp, "%s: key = 0x%llx", __func__, addr); 846 847 /* 848 * Note: duplicate keys will be rejected by mod_hash. 849 */ 850 if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr, 851 (mod_hash_val_t)port) != 0) { 852 DERR(vswp, "%s: unable to add entry into fdb.", __func__); 853 return (1); 854 } 855 856 D1(vswp, "%s: exit", __func__); 857 return (0); 858 } 859 860 /* 861 * Remove an entry from FDB. 862 * Returns 0 on success, 1 on failure. 863 */ 864 int 865 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port) 866 { 867 uint64_t addr = 0; 868 869 D1(vswp, "%s: enter", __func__); 870 871 KEY_HASH(addr, port->p_macaddr); 872 873 D2(vswp, "%s: key = 0x%llx", __func__, addr); 874 875 (void) mod_hash_destroy(vswp->fdb, (mod_hash_val_t)addr); 876 877 D1(vswp, "%s: enter", __func__); 878 879 return (0); 880 } 881 882 /* 883 * Search fdb for a given mac address. 884 * Returns pointer to the entry if found, else returns NULL. 885 */ 886 static vsw_port_t * 887 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp) 888 { 889 uint64_t key = 0; 890 vsw_port_t *port = NULL; 891 892 D1(vswp, "%s: enter", __func__); 893 894 KEY_HASH(key, ehp->ether_dhost); 895 896 D2(vswp, "%s: key = 0x%llx", __func__, key); 897 898 if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key, 899 (mod_hash_val_t *)&port) != 0) { 900 D2(vswp, "%s: no port found", __func__); 901 return (NULL); 902 } 903 904 D1(vswp, "%s: exit", __func__); 905 906 return (port); 907 } 908 909 /* 910 * Add or remove multicast address(es). 911 * 912 * Returns 0 on success, 1 on failure. 913 */ 914 int 915 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port) 916 { 917 mcst_addr_t *mcst_p = NULL; 918 vsw_t *vswp = port->p_vswp; 919 uint64_t addr = 0x0; 920 int i; 921 922 D1(vswp, "%s: enter", __func__); 923 924 D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count); 925 926 for (i = 0; i < mcst_pkt->count; i++) { 927 /* 928 * Convert address into form that can be used 929 * as hash table key. 930 */ 931 KEY_HASH(addr, mcst_pkt->mca[i]); 932 933 /* 934 * Add or delete the specified address/port combination. 935 */ 936 if (mcst_pkt->set == 0x1) { 937 D3(vswp, "%s: adding multicast address 0x%llx for " 938 "port %ld", __func__, addr, port->p_instance); 939 if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 940 /* 941 * Update the list of multicast 942 * addresses contained within the 943 * port structure to include this new 944 * one. 945 */ 946 mcst_p = kmem_zalloc(sizeof (mcst_addr_t), 947 KM_NOSLEEP); 948 if (mcst_p == NULL) { 949 DERR(vswp, "%s: unable to alloc mem", 950 __func__); 951 (void) vsw_del_mcst(vswp, 952 VSW_VNETPORT, addr, port); 953 return (1); 954 } 955 956 mcst_p->nextp = NULL; 957 mcst_p->addr = addr; 958 ether_copy(&mcst_pkt->mca[i], &mcst_p->mca); 959 960 /* 961 * Program the address into HW. If the addr 962 * has already been programmed then the MAC 963 * just increments a ref counter (which is 964 * used when the address is being deleted) 965 */ 966 mutex_enter(&vswp->mac_lock); 967 if (vswp->mh != NULL) { 968 if (mac_multicst_add(vswp->mh, 969 (uchar_t *)&mcst_pkt->mca[i])) { 970 mutex_exit(&vswp->mac_lock); 971 cmn_err(CE_WARN, "!vsw%d: " 972 "unable to add multicast " 973 "address: %s\n", 974 vswp->instance, 975 ether_sprintf((void *) 976 &mcst_p->mca)); 977 (void) vsw_del_mcst(vswp, 978 VSW_VNETPORT, addr, port); 979 kmem_free(mcst_p, 980 sizeof (*mcst_p)); 981 return (1); 982 } 983 mcst_p->mac_added = B_TRUE; 984 } 985 mutex_exit(&vswp->mac_lock); 986 987 mutex_enter(&port->mca_lock); 988 mcst_p->nextp = port->mcap; 989 port->mcap = mcst_p; 990 mutex_exit(&port->mca_lock); 991 992 } else { 993 DERR(vswp, "%s: error adding multicast " 994 "address 0x%llx for port %ld", 995 __func__, addr, port->p_instance); 996 return (1); 997 } 998 } else { 999 /* 1000 * Delete an entry from the multicast hash 1001 * table and update the address list 1002 * appropriately. 1003 */ 1004 if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 1005 D3(vswp, "%s: deleting multicast address " 1006 "0x%llx for port %ld", __func__, addr, 1007 port->p_instance); 1008 1009 mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr); 1010 ASSERT(mcst_p != NULL); 1011 1012 /* 1013 * Remove the address from HW. The address 1014 * will actually only be removed once the ref 1015 * count within the MAC layer has dropped to 1016 * zero. I.e. we can safely call this fn even 1017 * if other ports are interested in this 1018 * address. 1019 */ 1020 mutex_enter(&vswp->mac_lock); 1021 if (vswp->mh != NULL && mcst_p->mac_added) { 1022 if (mac_multicst_remove(vswp->mh, 1023 (uchar_t *)&mcst_pkt->mca[i])) { 1024 mutex_exit(&vswp->mac_lock); 1025 cmn_err(CE_WARN, "!vsw%d: " 1026 "unable to remove mcast " 1027 "address: %s\n", 1028 vswp->instance, 1029 ether_sprintf((void *) 1030 &mcst_p->mca)); 1031 kmem_free(mcst_p, 1032 sizeof (*mcst_p)); 1033 return (1); 1034 } 1035 mcst_p->mac_added = B_FALSE; 1036 } 1037 mutex_exit(&vswp->mac_lock); 1038 kmem_free(mcst_p, sizeof (*mcst_p)); 1039 1040 } else { 1041 DERR(vswp, "%s: error deleting multicast " 1042 "addr 0x%llx for port %ld", 1043 __func__, addr, port->p_instance); 1044 return (1); 1045 } 1046 } 1047 } 1048 D1(vswp, "%s: exit", __func__); 1049 return (0); 1050 } 1051 1052 /* 1053 * Add a new multicast entry. 1054 * 1055 * Search hash table based on address. If match found then 1056 * update associated val (which is chain of ports), otherwise 1057 * create new key/val (addr/port) pair and insert into table. 1058 */ 1059 int 1060 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 1061 { 1062 int dup = 0; 1063 int rv = 0; 1064 mfdb_ent_t *ment = NULL; 1065 mfdb_ent_t *tmp_ent = NULL; 1066 mfdb_ent_t *new_ent = NULL; 1067 void *tgt = NULL; 1068 1069 if (devtype == VSW_VNETPORT) { 1070 /* 1071 * Being invoked from a vnet. 1072 */ 1073 ASSERT(arg != NULL); 1074 tgt = arg; 1075 D2(NULL, "%s: port %d : address 0x%llx", __func__, 1076 ((vsw_port_t *)arg)->p_instance, addr); 1077 } else { 1078 /* 1079 * We are being invoked via the m_multicst mac entry 1080 * point. 1081 */ 1082 D2(NULL, "%s: address 0x%llx", __func__, addr); 1083 tgt = (void *)vswp; 1084 } 1085 1086 WRITE_ENTER(&vswp->mfdbrw); 1087 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 1088 (mod_hash_val_t *)&ment) != 0) { 1089 1090 /* address not currently in table */ 1091 ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 1092 ment->d_addr = (void *)tgt; 1093 ment->d_type = devtype; 1094 ment->nextp = NULL; 1095 1096 if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr, 1097 (mod_hash_val_t)ment) != 0) { 1098 DERR(vswp, "%s: hash table insertion failed", __func__); 1099 kmem_free(ment, sizeof (mfdb_ent_t)); 1100 rv = 1; 1101 } else { 1102 D2(vswp, "%s: added initial entry for 0x%llx to " 1103 "table", __func__, addr); 1104 } 1105 } else { 1106 /* 1107 * Address in table. Check to see if specified port 1108 * is already associated with the address. If not add 1109 * it now. 1110 */ 1111 tmp_ent = ment; 1112 while (tmp_ent != NULL) { 1113 if (tmp_ent->d_addr == (void *)tgt) { 1114 if (devtype == VSW_VNETPORT) { 1115 DERR(vswp, "%s: duplicate port entry " 1116 "found for portid %ld and key " 1117 "0x%llx", __func__, 1118 ((vsw_port_t *)arg)->p_instance, 1119 addr); 1120 } else { 1121 DERR(vswp, "%s: duplicate entry found" 1122 "for key 0x%llx", __func__, addr); 1123 } 1124 rv = 1; 1125 dup = 1; 1126 break; 1127 } 1128 tmp_ent = tmp_ent->nextp; 1129 } 1130 1131 /* 1132 * Port not on list so add it to end now. 1133 */ 1134 if (0 == dup) { 1135 D2(vswp, "%s: added entry for 0x%llx to table", 1136 __func__, addr); 1137 new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 1138 new_ent->d_addr = (void *)tgt; 1139 new_ent->d_type = devtype; 1140 new_ent->nextp = NULL; 1141 1142 tmp_ent = ment; 1143 while (tmp_ent->nextp != NULL) 1144 tmp_ent = tmp_ent->nextp; 1145 1146 tmp_ent->nextp = new_ent; 1147 } 1148 } 1149 1150 RW_EXIT(&vswp->mfdbrw); 1151 return (rv); 1152 } 1153 1154 /* 1155 * Remove a multicast entry from the hashtable. 1156 * 1157 * Search hash table based on address. If match found, scan 1158 * list of ports associated with address. If specified port 1159 * found remove it from list. 1160 */ 1161 int 1162 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 1163 { 1164 mfdb_ent_t *ment = NULL; 1165 mfdb_ent_t *curr_p, *prev_p; 1166 void *tgt = NULL; 1167 1168 D1(vswp, "%s: enter", __func__); 1169 1170 if (devtype == VSW_VNETPORT) { 1171 tgt = (vsw_port_t *)arg; 1172 D2(vswp, "%s: removing port %d from mFDB for address" 1173 " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr); 1174 } else { 1175 D2(vswp, "%s: removing entry", __func__); 1176 tgt = (void *)vswp; 1177 } 1178 1179 WRITE_ENTER(&vswp->mfdbrw); 1180 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 1181 (mod_hash_val_t *)&ment) != 0) { 1182 D2(vswp, "%s: address 0x%llx not in table", __func__, addr); 1183 RW_EXIT(&vswp->mfdbrw); 1184 return (1); 1185 } 1186 1187 prev_p = curr_p = ment; 1188 1189 while (curr_p != NULL) { 1190 if (curr_p->d_addr == (void *)tgt) { 1191 if (devtype == VSW_VNETPORT) { 1192 D2(vswp, "%s: port %d found", __func__, 1193 ((vsw_port_t *)tgt)->p_instance); 1194 } else { 1195 D2(vswp, "%s: instance found", __func__); 1196 } 1197 1198 if (prev_p == curr_p) { 1199 /* 1200 * head of list, if no other element is in 1201 * list then destroy this entry, otherwise 1202 * just replace it with updated value. 1203 */ 1204 ment = curr_p->nextp; 1205 if (ment == NULL) { 1206 (void) mod_hash_destroy(vswp->mfdb, 1207 (mod_hash_val_t)addr); 1208 } else { 1209 (void) mod_hash_replace(vswp->mfdb, 1210 (mod_hash_key_t)addr, 1211 (mod_hash_val_t)ment); 1212 } 1213 } else { 1214 /* 1215 * Not head of list, no need to do 1216 * replacement, just adjust list pointers. 1217 */ 1218 prev_p->nextp = curr_p->nextp; 1219 } 1220 break; 1221 } 1222 1223 prev_p = curr_p; 1224 curr_p = curr_p->nextp; 1225 } 1226 1227 RW_EXIT(&vswp->mfdbrw); 1228 1229 D1(vswp, "%s: exit", __func__); 1230 1231 if (curr_p == NULL) 1232 return (1); 1233 kmem_free(curr_p, sizeof (mfdb_ent_t)); 1234 return (0); 1235 } 1236 1237 /* 1238 * Port is being deleted, but has registered an interest in one 1239 * or more multicast groups. Using the list of addresses maintained 1240 * within the port structure find the appropriate entry in the hash 1241 * table and remove this port from the list of interested ports. 1242 */ 1243 void 1244 vsw_del_mcst_port(vsw_port_t *port) 1245 { 1246 mcst_addr_t *mcap = NULL; 1247 vsw_t *vswp = port->p_vswp; 1248 1249 D1(vswp, "%s: enter", __func__); 1250 1251 mutex_enter(&port->mca_lock); 1252 1253 while ((mcap = port->mcap) != NULL) { 1254 1255 port->mcap = mcap->nextp; 1256 1257 mutex_exit(&port->mca_lock); 1258 1259 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 1260 mcap->addr, port); 1261 1262 /* 1263 * Remove the address from HW. The address 1264 * will actually only be removed once the ref 1265 * count within the MAC layer has dropped to 1266 * zero. I.e. we can safely call this fn even 1267 * if other ports are interested in this 1268 * address. 1269 */ 1270 mutex_enter(&vswp->mac_lock); 1271 if (vswp->mh != NULL && mcap->mac_added) { 1272 (void) mac_multicst_remove(vswp->mh, 1273 (uchar_t *)&mcap->mca); 1274 } 1275 mutex_exit(&vswp->mac_lock); 1276 1277 kmem_free(mcap, sizeof (*mcap)); 1278 1279 mutex_enter(&port->mca_lock); 1280 1281 } 1282 1283 mutex_exit(&port->mca_lock); 1284 1285 D1(vswp, "%s: exit", __func__); 1286 } 1287 1288 /* 1289 * This vsw instance is detaching, but has registered an interest in one 1290 * or more multicast groups. Using the list of addresses maintained 1291 * within the vsw structure find the appropriate entry in the hash 1292 * table and remove this instance from the list of interested ports. 1293 */ 1294 void 1295 vsw_del_mcst_vsw(vsw_t *vswp) 1296 { 1297 mcst_addr_t *next_p = NULL; 1298 1299 D1(vswp, "%s: enter", __func__); 1300 1301 mutex_enter(&vswp->mca_lock); 1302 1303 while (vswp->mcap != NULL) { 1304 DERR(vswp, "%s: deleting addr 0x%llx", 1305 __func__, vswp->mcap->addr); 1306 (void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL); 1307 1308 next_p = vswp->mcap->nextp; 1309 kmem_free(vswp->mcap, sizeof (mcst_addr_t)); 1310 vswp->mcap = next_p; 1311 } 1312 1313 vswp->mcap = NULL; 1314 mutex_exit(&vswp->mca_lock); 1315 1316 D1(vswp, "%s: exit", __func__); 1317 } 1318 1319 static int 1320 vsw_get_same_dest_list(struct ether_header *ehp, 1321 mblk_t **rhead, mblk_t **rtail, mblk_t **mpp) 1322 { 1323 int count = 0; 1324 mblk_t *bp; 1325 mblk_t *nbp; 1326 mblk_t *head = NULL; 1327 mblk_t *tail = NULL; 1328 mblk_t *prev = NULL; 1329 struct ether_header *behp; 1330 1331 /* process the chain of packets */ 1332 bp = *mpp; 1333 while (bp) { 1334 nbp = bp->b_next; 1335 behp = (struct ether_header *)bp->b_rptr; 1336 bp->b_prev = NULL; 1337 if (ether_cmp(&ehp->ether_dhost, &behp->ether_dhost) == 0) { 1338 if (prev == NULL) { 1339 *mpp = nbp; 1340 } else { 1341 prev->b_next = nbp; 1342 } 1343 bp->b_next = NULL; 1344 if (head == NULL) { 1345 head = tail = bp; 1346 } else { 1347 tail->b_next = bp; 1348 tail = bp; 1349 } 1350 count++; 1351 } else { 1352 prev = bp; 1353 } 1354 bp = nbp; 1355 } 1356 *rhead = head; 1357 *rtail = tail; 1358 DTRACE_PROBE1(vsw_same_dest, int, count); 1359 return (count); 1360 } 1361 1362 static mblk_t * 1363 vsw_dupmsgchain(mblk_t *mp) 1364 { 1365 mblk_t *nmp = NULL; 1366 mblk_t **nmpp = &nmp; 1367 1368 for (; mp != NULL; mp = mp->b_next) { 1369 if ((*nmpp = dupmsg(mp)) == NULL) { 1370 freemsgchain(nmp); 1371 return (NULL); 1372 } 1373 1374 nmpp = &((*nmpp)->b_next); 1375 } 1376 1377 return (nmp); 1378 } 1379