1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/errno.h> 31 #include <sys/debug.h> 32 #include <sys/time.h> 33 #include <sys/sysmacros.h> 34 #include <sys/systm.h> 35 #include <sys/user.h> 36 #include <sys/stropts.h> 37 #include <sys/stream.h> 38 #include <sys/strlog.h> 39 #include <sys/strsubr.h> 40 #include <sys/cmn_err.h> 41 #include <sys/cpu.h> 42 #include <sys/kmem.h> 43 #include <sys/conf.h> 44 #include <sys/ddi.h> 45 #include <sys/sunddi.h> 46 #include <sys/ksynch.h> 47 #include <sys/stat.h> 48 #include <sys/kstat.h> 49 #include <sys/vtrace.h> 50 #include <sys/strsun.h> 51 #include <sys/dlpi.h> 52 #include <sys/ethernet.h> 53 #include <net/if.h> 54 #include <sys/varargs.h> 55 #include <sys/machsystm.h> 56 #include <sys/modctl.h> 57 #include <sys/modhash.h> 58 #include <sys/mac.h> 59 #include <sys/mac_ether.h> 60 #include <sys/taskq.h> 61 #include <sys/note.h> 62 #include <sys/mach_descrip.h> 63 #include <sys/mac.h> 64 #include <sys/mdeg.h> 65 #include <sys/ldc.h> 66 #include <sys/vsw_fdb.h> 67 #include <sys/vsw.h> 68 #include <sys/vio_mailbox.h> 69 #include <sys/vnet_mailbox.h> 70 #include <sys/vnet_common.h> 71 #include <sys/vio_util.h> 72 #include <sys/sdt.h> 73 #include <sys/atomic.h> 74 75 /* Switching setup routines */ 76 void vsw_setup_switching_timeout(void *arg); 77 void vsw_stop_switching_timeout(vsw_t *vswp); 78 int vsw_setup_switching(vsw_t *); 79 static int vsw_setup_layer2(vsw_t *); 80 static int vsw_setup_layer3(vsw_t *); 81 82 /* Switching/data transmit routines */ 83 static void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, 84 vsw_port_t *port, mac_resource_handle_t); 85 static void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, 86 vsw_port_t *port, mac_resource_handle_t); 87 static int vsw_forward_all(vsw_t *vswp, mblk_t *mp, 88 int caller, vsw_port_t *port); 89 static int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, 90 int caller, vsw_port_t *port); 91 92 /* Forwarding database (FDB) routines */ 93 static vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *); 94 int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *); 95 void vsw_del_mcst_port(vsw_port_t *); 96 int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *); 97 int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *); 98 void vsw_del_mcst_vsw(vsw_t *); 99 int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port); 100 int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port); 101 102 /* Support functions */ 103 static mblk_t *vsw_dupmsgchain(mblk_t *mp); 104 static uint32_t vsw_get_same_dest_list(struct ether_header *ehp, 105 mblk_t **rhead, mblk_t **rtail, mblk_t **mpp); 106 107 108 /* 109 * Functions imported from other files. 110 */ 111 extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *); 112 extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t); 113 extern int vsw_mac_open(vsw_t *vswp); 114 extern void vsw_mac_close(vsw_t *vswp); 115 extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh, 116 mblk_t *mp, vsw_macrx_flags_t flags); 117 extern void vsw_set_addrs(vsw_t *vswp); 118 extern int vsw_get_hw_maddr(vsw_t *); 119 extern int vsw_mac_attach(vsw_t *vswp); 120 extern int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, 121 uint32_t count); 122 123 /* 124 * Tunables used in this file. 125 */ 126 extern int vsw_setup_switching_delay; 127 128 129 /* 130 * Timeout routine to setup switching mode: 131 * vsw_setup_switching() is invoked from vsw_attach() or vsw_update_md_prop() 132 * initially. If it fails and the error is EAGAIN, then this timeout handler 133 * is started to retry vsw_setup_switching(). vsw_setup_switching() is retried 134 * until we successfully finish it; or the returned error is not EAGAIN. 135 */ 136 void 137 vsw_setup_switching_timeout(void *arg) 138 { 139 vsw_t *vswp = (vsw_t *)arg; 140 int rv; 141 142 if (vswp->swtmout_enabled == B_FALSE) 143 return; 144 145 rv = vsw_setup_switching(vswp); 146 147 if (rv == 0) { 148 /* 149 * Successfully setup switching mode. 150 * Program unicst, mcst addrs of vsw 151 * interface and ports in the physdev. 152 */ 153 vsw_set_addrs(vswp); 154 } 155 156 mutex_enter(&vswp->swtmout_lock); 157 158 if (rv == EAGAIN && vswp->swtmout_enabled == B_TRUE) { 159 /* 160 * Reschedule timeout() if the error is EAGAIN and the 161 * timeout is still enabled. For errors other than EAGAIN, 162 * we simply return without rescheduling timeout(). 163 */ 164 vswp->swtmout_id = 165 timeout(vsw_setup_switching_timeout, vswp, 166 (vsw_setup_switching_delay * drv_usectohz(MICROSEC))); 167 goto exit; 168 } 169 170 /* timeout handler completed */ 171 vswp->swtmout_enabled = B_FALSE; 172 vswp->swtmout_id = 0; 173 174 exit: 175 mutex_exit(&vswp->swtmout_lock); 176 } 177 178 /* 179 * Cancel the timeout handler to setup switching mode. 180 */ 181 void 182 vsw_stop_switching_timeout(vsw_t *vswp) 183 { 184 timeout_id_t tid; 185 186 mutex_enter(&vswp->swtmout_lock); 187 188 tid = vswp->swtmout_id; 189 190 if (tid != 0) { 191 /* signal timeout handler to stop */ 192 vswp->swtmout_enabled = B_FALSE; 193 vswp->swtmout_id = 0; 194 mutex_exit(&vswp->swtmout_lock); 195 196 (void) untimeout(tid); 197 } else { 198 mutex_exit(&vswp->swtmout_lock); 199 } 200 201 (void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE); 202 203 mutex_enter(&vswp->mac_lock); 204 vswp->mac_open_retries = 0; 205 mutex_exit(&vswp->mac_lock); 206 } 207 208 /* 209 * Setup the required switching mode. 210 * This routine is invoked from vsw_attach() or vsw_update_md_prop() 211 * initially. If it fails and the error is EAGAIN, then a timeout handler 212 * is started to retry vsw_setup_switching(), until it successfully finishes; 213 * or the returned error is not EAGAIN. 214 * 215 * Returns: 216 * 0 on success. 217 * EAGAIN if retry is needed. 218 * 1 on all other failures. 219 */ 220 int 221 vsw_setup_switching(vsw_t *vswp) 222 { 223 int i, rv = 1; 224 225 D1(vswp, "%s: enter", __func__); 226 227 /* 228 * Select best switching mode. 229 * Note that we start from the saved smode_idx. This is done as 230 * this routine can be called from the timeout handler to retry 231 * setting up a specific mode. Currently only the function which 232 * sets up layer2/promisc mode returns EAGAIN if the underlying 233 * physical device is not available yet, causing retries. 234 */ 235 for (i = vswp->smode_idx; i < vswp->smode_num; i++) { 236 vswp->smode_idx = i; 237 switch (vswp->smode[i]) { 238 case VSW_LAYER2: 239 case VSW_LAYER2_PROMISC: 240 rv = vsw_setup_layer2(vswp); 241 break; 242 243 case VSW_LAYER3: 244 rv = vsw_setup_layer3(vswp); 245 break; 246 247 default: 248 DERR(vswp, "unknown switch mode"); 249 break; 250 } 251 252 if ((rv == 0) || (rv == EAGAIN)) 253 break; 254 255 /* all other errors(rv != 0): continue & select the next mode */ 256 rv = 1; 257 } 258 259 if (rv && (rv != EAGAIN)) { 260 cmn_err(CE_WARN, "!vsw%d: Unable to setup specified " 261 "switching mode", vswp->instance); 262 } else if (rv == 0) { 263 (void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE); 264 } 265 266 D2(vswp, "%s: Operating in mode %d", __func__, 267 vswp->smode[vswp->smode_idx]); 268 269 D1(vswp, "%s: exit", __func__); 270 271 return (rv); 272 } 273 274 /* 275 * Setup for layer 2 switching. 276 * 277 * Returns: 278 * 0 on success. 279 * EAGAIN if retry is needed. 280 * EIO on all other failures. 281 */ 282 static int 283 vsw_setup_layer2(vsw_t *vswp) 284 { 285 int rv; 286 287 D1(vswp, "%s: enter", __func__); 288 289 vswp->vsw_switch_frame = vsw_switch_l2_frame; 290 291 rv = strlen(vswp->physname); 292 if (rv == 0) { 293 /* 294 * Physical device name is NULL, which is 295 * required for layer 2. 296 */ 297 cmn_err(CE_WARN, "!vsw%d: no physical device name specified", 298 vswp->instance); 299 return (EIO); 300 } 301 302 mutex_enter(&vswp->mac_lock); 303 304 rv = vsw_mac_open(vswp); 305 if (rv != 0) { 306 if (rv != EAGAIN) { 307 cmn_err(CE_WARN, "!vsw%d: Unable to open physical " 308 "device: %s\n", vswp->instance, vswp->physname); 309 } 310 mutex_exit(&vswp->mac_lock); 311 return (rv); 312 } 313 314 if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) { 315 /* 316 * Verify that underlying device can support multiple 317 * unicast mac addresses. 318 */ 319 rv = vsw_get_hw_maddr(vswp); 320 if (rv != 0) { 321 goto exit_error; 322 } 323 } 324 325 /* 326 * Attempt to link into the MAC layer so we can get 327 * and send packets out over the physical adapter. 328 */ 329 rv = vsw_mac_attach(vswp); 330 if (rv != 0) { 331 /* 332 * Registration with the MAC layer has failed, 333 * so return error so that can fall back to next 334 * prefered switching method. 335 */ 336 cmn_err(CE_WARN, "!vsw%d: Unable to setup physical device: " 337 "%s\n", vswp->instance, vswp->physname); 338 goto exit_error; 339 } 340 341 D1(vswp, "%s: exit", __func__); 342 343 mutex_exit(&vswp->mac_lock); 344 return (0); 345 346 exit_error: 347 vsw_mac_close(vswp); 348 mutex_exit(&vswp->mac_lock); 349 return (EIO); 350 } 351 352 static int 353 vsw_setup_layer3(vsw_t *vswp) 354 { 355 D1(vswp, "%s: enter", __func__); 356 357 D2(vswp, "%s: operating in layer 3 mode", __func__); 358 vswp->vsw_switch_frame = vsw_switch_l3_frame; 359 360 D1(vswp, "%s: exit", __func__); 361 362 return (0); 363 } 364 365 /* 366 * Switch the given ethernet frame when operating in layer 2 mode. 367 * 368 * vswp: pointer to the vsw instance 369 * mp: pointer to chain of ethernet frame(s) to be switched 370 * caller: identifies the source of this frame as: 371 * 1. VSW_VNETPORT - a vsw port (connected to a vnet). 372 * 2. VSW_PHYSDEV - the physical ethernet device 373 * 3. VSW_LOCALDEV - vsw configured as a virtual interface 374 * arg: argument provided by the caller. 375 * 1. for VNETPORT - pointer to the corresponding vsw_port_t. 376 * 2. for PHYSDEV - NULL 377 * 3. for LOCALDEV - pointer to to this vsw_t(self) 378 */ 379 void 380 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, 381 vsw_port_t *arg, mac_resource_handle_t mrh) 382 { 383 struct ether_header *ehp; 384 vsw_port_t *port = NULL; 385 mblk_t *bp, *ret_m; 386 mblk_t *mpt = NULL; 387 uint32_t count; 388 vsw_port_list_t *plist = &vswp->plist; 389 390 D1(vswp, "%s: enter (caller %d)", __func__, caller); 391 392 /* 393 * PERF: rather than breaking up the chain here, scan it 394 * to find all mblks heading to same destination and then 395 * pass that sub-chain to the lower transmit functions. 396 */ 397 398 /* process the chain of packets */ 399 bp = mp; 400 while (bp) { 401 ehp = (struct ether_header *)bp->b_rptr; 402 count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp); 403 ASSERT(count != 0); 404 405 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 406 __func__, MBLKSIZE(mp), MBLKL(mp)); 407 408 if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) { 409 /* 410 * If destination is VSW_LOCALDEV (vsw as an eth 411 * interface) and if the device is up & running, 412 * send the packet up the stack on this host. 413 * If the virtual interface is down, drop the packet. 414 */ 415 if (caller != VSW_LOCALDEV) { 416 vsw_mac_rx(vswp, mrh, mp, VSW_MACRX_FREEMSG); 417 } else { 418 freemsgchain(mp); 419 } 420 continue; 421 } 422 423 READ_ENTER(&plist->lockrw); 424 port = vsw_lookup_fdb(vswp, ehp); 425 if (port) { 426 /* 427 * Mark the port as in-use before releasing the lockrw. 428 */ 429 VSW_PORT_REFHOLD(port); 430 RW_EXIT(&plist->lockrw); 431 432 /* 433 * If plumbed and in promisc mode then copy msg 434 * and send up the stack. 435 */ 436 vsw_mac_rx(vswp, mrh, mp, 437 VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG); 438 439 /* 440 * If the destination is in FDB, the packet 441 * should be forwarded to the correponding 442 * vsw_port (connected to a vnet device - 443 * VSW_VNETPORT) 444 */ 445 (void) vsw_portsend(port, mp, mpt, count); 446 447 /* 448 * Decrement use count in port. 449 */ 450 VSW_PORT_REFRELE(port); 451 } else { 452 RW_EXIT(&plist->lockrw); 453 /* 454 * Destination not in FDB. 455 * 456 * If the destination is broadcast or 457 * multicast forward the packet to all 458 * (VNETPORTs, PHYSDEV, LOCALDEV), 459 * except the caller. 460 */ 461 if (IS_BROADCAST(ehp)) { 462 D2(vswp, "%s: BROADCAST pkt", __func__); 463 (void) vsw_forward_all(vswp, mp, caller, arg); 464 } else if (IS_MULTICAST(ehp)) { 465 D2(vswp, "%s: MULTICAST pkt", __func__); 466 (void) vsw_forward_grp(vswp, mp, caller, arg); 467 } else { 468 /* 469 * If the destination is unicast, and came 470 * from either a logical network device or 471 * the switch itself when it is plumbed, then 472 * send it out on the physical device and also 473 * up the stack if the logical interface is 474 * in promiscious mode. 475 * 476 * NOTE: The assumption here is that if we 477 * cannot find the destination in our fdb, its 478 * a unicast address, and came from either a 479 * vnet or down the stack (when plumbed) it 480 * must be destinded for an ethernet device 481 * outside our ldoms. 482 */ 483 if (caller == VSW_VNETPORT) { 484 /* promisc check copy etc */ 485 vsw_mac_rx(vswp, mrh, mp, 486 VSW_MACRX_PROMISC | 487 VSW_MACRX_COPYMSG); 488 489 if ((ret_m = vsw_tx_msg(vswp, mp)) 490 != NULL) { 491 DERR(vswp, "%s: drop mblks to " 492 "phys dev", __func__); 493 freemsgchain(ret_m); 494 } 495 496 } else if (caller == VSW_PHYSDEV) { 497 /* 498 * Pkt seen because card in promisc 499 * mode. Send up stack if plumbed in 500 * promisc mode, else drop it. 501 */ 502 vsw_mac_rx(vswp, mrh, mp, 503 VSW_MACRX_PROMISC | 504 VSW_MACRX_FREEMSG); 505 506 } else if (caller == VSW_LOCALDEV) { 507 /* 508 * Pkt came down the stack, send out 509 * over physical device. 510 */ 511 if ((ret_m = vsw_tx_msg(vswp, mp)) 512 != NULL) { 513 DERR(vswp, "%s: drop mblks to " 514 "phys dev", __func__); 515 freemsgchain(ret_m); 516 } 517 } 518 } 519 } 520 } 521 D1(vswp, "%s: exit\n", __func__); 522 } 523 524 /* 525 * Switch ethernet frame when in layer 3 mode (i.e. using IP 526 * layer to do the routing). 527 * 528 * There is a large amount of overlap between this function and 529 * vsw_switch_l2_frame. At some stage we need to revisit and refactor 530 * both these functions. 531 */ 532 void 533 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, 534 vsw_port_t *arg, mac_resource_handle_t mrh) 535 { 536 struct ether_header *ehp; 537 vsw_port_t *port = NULL; 538 mblk_t *bp = NULL; 539 mblk_t *mpt; 540 uint32_t count; 541 vsw_port_list_t *plist = &vswp->plist; 542 543 D1(vswp, "%s: enter (caller %d)", __func__, caller); 544 545 /* 546 * In layer 3 mode should only ever be switching packets 547 * between IP layer and vnet devices. So make sure thats 548 * who is invoking us. 549 */ 550 if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) { 551 DERR(vswp, "%s: unexpected caller (%d)", __func__, caller); 552 freemsgchain(mp); 553 return; 554 } 555 556 /* process the chain of packets */ 557 bp = mp; 558 while (bp) { 559 ehp = (struct ether_header *)bp->b_rptr; 560 count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp); 561 ASSERT(count != 0); 562 563 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 564 __func__, MBLKSIZE(mp), MBLKL(mp)); 565 566 READ_ENTER(&plist->lockrw); 567 port = vsw_lookup_fdb(vswp, ehp); 568 if (port) { 569 /* 570 * Mark the port as in-use before releasing the lockrw. 571 */ 572 VSW_PORT_REFHOLD(port); 573 RW_EXIT(&plist->lockrw); 574 575 D2(vswp, "%s: sending to target port", __func__); 576 (void) vsw_portsend(port, mp, mpt, count); 577 578 /* 579 * Decrement ref count. 580 */ 581 VSW_PORT_REFRELE(port); 582 } else { 583 RW_EXIT(&plist->lockrw); 584 /* 585 * Destination not in FDB 586 * 587 * If the destination is broadcast or 588 * multicast forward the packet to all 589 * (VNETPORTs, PHYSDEV, LOCALDEV), 590 * except the caller. 591 */ 592 if (IS_BROADCAST(ehp)) { 593 D2(vswp, "%s: BROADCAST pkt", __func__); 594 (void) vsw_forward_all(vswp, mp, caller, arg); 595 } else if (IS_MULTICAST(ehp)) { 596 D2(vswp, "%s: MULTICAST pkt", __func__); 597 (void) vsw_forward_grp(vswp, mp, caller, arg); 598 } else { 599 /* 600 * Unicast pkt from vnet that we don't have 601 * an FDB entry for, so must be destinded for 602 * the outside world. Attempt to send up to the 603 * IP layer to allow it to deal with it. 604 */ 605 if (caller == VSW_VNETPORT) { 606 vsw_mac_rx(vswp, mrh, 607 mp, VSW_MACRX_FREEMSG); 608 } 609 } 610 } 611 } 612 613 D1(vswp, "%s: exit", __func__); 614 } 615 616 /* 617 * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV), 618 * except the caller (port on which frame arrived). 619 */ 620 static int 621 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 622 { 623 vsw_port_list_t *plist = &vswp->plist; 624 vsw_port_t *portp; 625 mblk_t *nmp = NULL; 626 mblk_t *ret_m = NULL; 627 int skip_port = 0; 628 629 D1(vswp, "vsw_forward_all: enter\n"); 630 631 /* 632 * Broadcast message from inside ldoms so send to outside 633 * world if in either of layer 2 modes. 634 */ 635 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 636 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 637 ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) { 638 639 nmp = vsw_dupmsgchain(mp); 640 if (nmp) { 641 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 642 DERR(vswp, "%s: dropping pkt(s) " 643 "consisting of %ld bytes of data for" 644 " physical device", __func__, MBLKL(ret_m)); 645 freemsgchain(ret_m); 646 } 647 } 648 } 649 650 if (caller == VSW_VNETPORT) 651 skip_port = 1; 652 653 /* 654 * Broadcast message from other vnet (layer 2 or 3) or outside 655 * world (layer 2 only), send up stack if plumbed. 656 */ 657 if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) { 658 vsw_mac_rx(vswp, NULL, mp, VSW_MACRX_COPYMSG); 659 } 660 661 /* send it to all VNETPORTs */ 662 READ_ENTER(&plist->lockrw); 663 for (portp = plist->head; portp != NULL; portp = portp->p_next) { 664 D2(vswp, "vsw_forward_all: port %d", portp->p_instance); 665 /* 666 * Caution ! - don't reorder these two checks as arg 667 * will be NULL if the caller is PHYSDEV. skip_port is 668 * only set if caller is VNETPORT. 669 */ 670 if ((skip_port) && (portp == arg)) { 671 continue; 672 } else { 673 nmp = vsw_dupmsgchain(mp); 674 if (nmp) { 675 mblk_t *mpt = nmp; 676 uint32_t count = 1; 677 678 /* Find tail */ 679 while (mpt->b_next != NULL) { 680 mpt = mpt->b_next; 681 count++; 682 } 683 /* 684 * The plist->lockrw is protecting the 685 * portp from getting destroyed here. 686 * So, no ref_cnt is incremented here. 687 */ 688 (void) vsw_portsend(portp, nmp, mpt, count); 689 } else { 690 DERR(vswp, "vsw_forward_all: nmp NULL"); 691 } 692 } 693 } 694 RW_EXIT(&plist->lockrw); 695 696 freemsgchain(mp); 697 698 D1(vswp, "vsw_forward_all: exit\n"); 699 return (0); 700 } 701 702 /* 703 * Forward pkts to any devices or interfaces which have registered 704 * an interest in them (i.e. multicast groups). 705 */ 706 static int 707 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 708 { 709 struct ether_header *ehp = (struct ether_header *)mp->b_rptr; 710 mfdb_ent_t *entp = NULL; 711 mfdb_ent_t *tpp = NULL; 712 vsw_port_t *port; 713 uint64_t key = 0; 714 mblk_t *nmp = NULL; 715 mblk_t *ret_m = NULL; 716 boolean_t check_if = B_TRUE; 717 718 /* 719 * Convert address to hash table key 720 */ 721 KEY_HASH(key, ehp->ether_dhost); 722 723 D1(vswp, "%s: key 0x%llx", __func__, key); 724 725 /* 726 * If pkt came from either a vnet or down the stack (if we are 727 * plumbed) and we are in layer 2 mode, then we send the pkt out 728 * over the physical adapter, and then check to see if any other 729 * vnets are interested in it. 730 */ 731 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 732 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 733 ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) { 734 nmp = vsw_dupmsgchain(mp); 735 if (nmp) { 736 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 737 DERR(vswp, "%s: dropping pkt(s) consisting of " 738 "%ld bytes of data for physical device", 739 __func__, MBLKL(ret_m)); 740 freemsgchain(ret_m); 741 } 742 } 743 } 744 745 READ_ENTER(&vswp->mfdbrw); 746 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key, 747 (mod_hash_val_t *)&entp) != 0) { 748 D3(vswp, "%s: no table entry found for addr 0x%llx", 749 __func__, key); 750 } else { 751 /* 752 * Send to list of devices associated with this address... 753 */ 754 for (tpp = entp; tpp != NULL; tpp = tpp->nextp) { 755 756 /* dont send to ourselves */ 757 if ((caller == VSW_VNETPORT) && 758 (tpp->d_addr == (void *)arg)) { 759 port = (vsw_port_t *)tpp->d_addr; 760 D3(vswp, "%s: not sending to ourselves" 761 " : port %d", __func__, port->p_instance); 762 continue; 763 764 } else if ((caller == VSW_LOCALDEV) && 765 (tpp->d_type == VSW_LOCALDEV)) { 766 D2(vswp, "%s: not sending back up stack", 767 __func__); 768 continue; 769 } 770 771 if (tpp->d_type == VSW_VNETPORT) { 772 port = (vsw_port_t *)tpp->d_addr; 773 D3(vswp, "%s: sending to port %ld for addr " 774 "0x%llx", __func__, port->p_instance, key); 775 776 nmp = vsw_dupmsgchain(mp); 777 if (nmp) { 778 mblk_t *mpt = nmp; 779 uint32_t count = 1; 780 781 /* Find tail */ 782 while (mpt->b_next != NULL) { 783 mpt = mpt->b_next; 784 count++; 785 } 786 /* 787 * The vswp->mfdbrw is protecting the 788 * portp from getting destroyed here. 789 * So, no ref_cnt is incremented here. 790 */ 791 (void) vsw_portsend(port, nmp, mpt, 792 count); 793 } 794 } else { 795 vsw_mac_rx(vswp, NULL, 796 mp, VSW_MACRX_COPYMSG); 797 D2(vswp, "%s: sending up stack" 798 " for addr 0x%llx", __func__, key); 799 check_if = B_FALSE; 800 } 801 } 802 } 803 804 RW_EXIT(&vswp->mfdbrw); 805 806 /* 807 * If the pkt came from either a vnet or from physical device, 808 * and if we havent already sent the pkt up the stack then we 809 * check now if we can/should (i.e. the interface is plumbed 810 * and in promisc mode). 811 */ 812 if ((check_if) && 813 ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) { 814 vsw_mac_rx(vswp, NULL, mp, 815 VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG); 816 } 817 818 freemsgchain(mp); 819 820 D1(vswp, "%s: exit", __func__); 821 822 return (0); 823 } 824 825 /* 826 * Add an entry into FDB, for the given mac address and port_id. 827 * Returns 0 on success, 1 on failure. 828 * 829 * Lock protecting FDB must be held by calling process. 830 */ 831 int 832 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port) 833 { 834 uint64_t addr = 0; 835 836 D1(vswp, "%s: enter", __func__); 837 838 KEY_HASH(addr, port->p_macaddr); 839 840 D2(vswp, "%s: key = 0x%llx", __func__, addr); 841 842 /* 843 * Note: duplicate keys will be rejected by mod_hash. 844 */ 845 if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr, 846 (mod_hash_val_t)port) != 0) { 847 DERR(vswp, "%s: unable to add entry into fdb.", __func__); 848 return (1); 849 } 850 851 D1(vswp, "%s: exit", __func__); 852 return (0); 853 } 854 855 /* 856 * Remove an entry from FDB. 857 * Returns 0 on success, 1 on failure. 858 */ 859 int 860 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port) 861 { 862 uint64_t addr = 0; 863 864 D1(vswp, "%s: enter", __func__); 865 866 KEY_HASH(addr, port->p_macaddr); 867 868 D2(vswp, "%s: key = 0x%llx", __func__, addr); 869 870 (void) mod_hash_destroy(vswp->fdb, (mod_hash_val_t)addr); 871 872 D1(vswp, "%s: enter", __func__); 873 874 return (0); 875 } 876 877 /* 878 * Search fdb for a given mac address. 879 * Returns pointer to the entry if found, else returns NULL. 880 */ 881 static vsw_port_t * 882 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp) 883 { 884 uint64_t key = 0; 885 vsw_port_t *port = NULL; 886 887 D1(vswp, "%s: enter", __func__); 888 889 KEY_HASH(key, ehp->ether_dhost); 890 891 D2(vswp, "%s: key = 0x%llx", __func__, key); 892 893 if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key, 894 (mod_hash_val_t *)&port) != 0) { 895 D2(vswp, "%s: no port found", __func__); 896 return (NULL); 897 } 898 899 D1(vswp, "%s: exit", __func__); 900 901 return (port); 902 } 903 904 /* 905 * Add or remove multicast address(es). 906 * 907 * Returns 0 on success, 1 on failure. 908 */ 909 int 910 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port) 911 { 912 mcst_addr_t *mcst_p = NULL; 913 vsw_t *vswp = port->p_vswp; 914 uint64_t addr = 0x0; 915 int i; 916 917 D1(vswp, "%s: enter", __func__); 918 919 D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count); 920 921 for (i = 0; i < mcst_pkt->count; i++) { 922 /* 923 * Convert address into form that can be used 924 * as hash table key. 925 */ 926 KEY_HASH(addr, mcst_pkt->mca[i]); 927 928 /* 929 * Add or delete the specified address/port combination. 930 */ 931 if (mcst_pkt->set == 0x1) { 932 D3(vswp, "%s: adding multicast address 0x%llx for " 933 "port %ld", __func__, addr, port->p_instance); 934 if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 935 /* 936 * Update the list of multicast 937 * addresses contained within the 938 * port structure to include this new 939 * one. 940 */ 941 mcst_p = kmem_zalloc(sizeof (mcst_addr_t), 942 KM_NOSLEEP); 943 if (mcst_p == NULL) { 944 DERR(vswp, "%s: unable to alloc mem", 945 __func__); 946 (void) vsw_del_mcst(vswp, 947 VSW_VNETPORT, addr, port); 948 return (1); 949 } 950 951 mcst_p->nextp = NULL; 952 mcst_p->addr = addr; 953 ether_copy(&mcst_pkt->mca[i], &mcst_p->mca); 954 955 /* 956 * Program the address into HW. If the addr 957 * has already been programmed then the MAC 958 * just increments a ref counter (which is 959 * used when the address is being deleted) 960 */ 961 mutex_enter(&vswp->mac_lock); 962 if (vswp->mh != NULL) { 963 if (mac_multicst_add(vswp->mh, 964 (uchar_t *)&mcst_pkt->mca[i])) { 965 mutex_exit(&vswp->mac_lock); 966 cmn_err(CE_WARN, "!vsw%d: " 967 "unable to add multicast " 968 "address: %s\n", 969 vswp->instance, 970 ether_sprintf((void *) 971 &mcst_p->mca)); 972 (void) vsw_del_mcst(vswp, 973 VSW_VNETPORT, addr, port); 974 kmem_free(mcst_p, 975 sizeof (*mcst_p)); 976 return (1); 977 } 978 mcst_p->mac_added = B_TRUE; 979 } 980 mutex_exit(&vswp->mac_lock); 981 982 mutex_enter(&port->mca_lock); 983 mcst_p->nextp = port->mcap; 984 port->mcap = mcst_p; 985 mutex_exit(&port->mca_lock); 986 987 } else { 988 DERR(vswp, "%s: error adding multicast " 989 "address 0x%llx for port %ld", 990 __func__, addr, port->p_instance); 991 return (1); 992 } 993 } else { 994 /* 995 * Delete an entry from the multicast hash 996 * table and update the address list 997 * appropriately. 998 */ 999 if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 1000 D3(vswp, "%s: deleting multicast address " 1001 "0x%llx for port %ld", __func__, addr, 1002 port->p_instance); 1003 1004 mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr); 1005 ASSERT(mcst_p != NULL); 1006 1007 /* 1008 * Remove the address from HW. The address 1009 * will actually only be removed once the ref 1010 * count within the MAC layer has dropped to 1011 * zero. I.e. we can safely call this fn even 1012 * if other ports are interested in this 1013 * address. 1014 */ 1015 mutex_enter(&vswp->mac_lock); 1016 if (vswp->mh != NULL && mcst_p->mac_added) { 1017 if (mac_multicst_remove(vswp->mh, 1018 (uchar_t *)&mcst_pkt->mca[i])) { 1019 mutex_exit(&vswp->mac_lock); 1020 cmn_err(CE_WARN, "!vsw%d: " 1021 "unable to remove mcast " 1022 "address: %s\n", 1023 vswp->instance, 1024 ether_sprintf((void *) 1025 &mcst_p->mca)); 1026 kmem_free(mcst_p, 1027 sizeof (*mcst_p)); 1028 return (1); 1029 } 1030 mcst_p->mac_added = B_FALSE; 1031 } 1032 mutex_exit(&vswp->mac_lock); 1033 kmem_free(mcst_p, sizeof (*mcst_p)); 1034 1035 } else { 1036 DERR(vswp, "%s: error deleting multicast " 1037 "addr 0x%llx for port %ld", 1038 __func__, addr, port->p_instance); 1039 return (1); 1040 } 1041 } 1042 } 1043 D1(vswp, "%s: exit", __func__); 1044 return (0); 1045 } 1046 1047 /* 1048 * Add a new multicast entry. 1049 * 1050 * Search hash table based on address. If match found then 1051 * update associated val (which is chain of ports), otherwise 1052 * create new key/val (addr/port) pair and insert into table. 1053 */ 1054 int 1055 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 1056 { 1057 int dup = 0; 1058 int rv = 0; 1059 mfdb_ent_t *ment = NULL; 1060 mfdb_ent_t *tmp_ent = NULL; 1061 mfdb_ent_t *new_ent = NULL; 1062 void *tgt = NULL; 1063 1064 if (devtype == VSW_VNETPORT) { 1065 /* 1066 * Being invoked from a vnet. 1067 */ 1068 ASSERT(arg != NULL); 1069 tgt = arg; 1070 D2(NULL, "%s: port %d : address 0x%llx", __func__, 1071 ((vsw_port_t *)arg)->p_instance, addr); 1072 } else { 1073 /* 1074 * We are being invoked via the m_multicst mac entry 1075 * point. 1076 */ 1077 D2(NULL, "%s: address 0x%llx", __func__, addr); 1078 tgt = (void *)vswp; 1079 } 1080 1081 WRITE_ENTER(&vswp->mfdbrw); 1082 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 1083 (mod_hash_val_t *)&ment) != 0) { 1084 1085 /* address not currently in table */ 1086 ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 1087 ment->d_addr = (void *)tgt; 1088 ment->d_type = devtype; 1089 ment->nextp = NULL; 1090 1091 if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr, 1092 (mod_hash_val_t)ment) != 0) { 1093 DERR(vswp, "%s: hash table insertion failed", __func__); 1094 kmem_free(ment, sizeof (mfdb_ent_t)); 1095 rv = 1; 1096 } else { 1097 D2(vswp, "%s: added initial entry for 0x%llx to " 1098 "table", __func__, addr); 1099 } 1100 } else { 1101 /* 1102 * Address in table. Check to see if specified port 1103 * is already associated with the address. If not add 1104 * it now. 1105 */ 1106 tmp_ent = ment; 1107 while (tmp_ent != NULL) { 1108 if (tmp_ent->d_addr == (void *)tgt) { 1109 if (devtype == VSW_VNETPORT) { 1110 DERR(vswp, "%s: duplicate port entry " 1111 "found for portid %ld and key " 1112 "0x%llx", __func__, 1113 ((vsw_port_t *)arg)->p_instance, 1114 addr); 1115 } else { 1116 DERR(vswp, "%s: duplicate entry found" 1117 "for key 0x%llx", __func__, addr); 1118 } 1119 rv = 1; 1120 dup = 1; 1121 break; 1122 } 1123 tmp_ent = tmp_ent->nextp; 1124 } 1125 1126 /* 1127 * Port not on list so add it to end now. 1128 */ 1129 if (0 == dup) { 1130 D2(vswp, "%s: added entry for 0x%llx to table", 1131 __func__, addr); 1132 new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 1133 new_ent->d_addr = (void *)tgt; 1134 new_ent->d_type = devtype; 1135 new_ent->nextp = NULL; 1136 1137 tmp_ent = ment; 1138 while (tmp_ent->nextp != NULL) 1139 tmp_ent = tmp_ent->nextp; 1140 1141 tmp_ent->nextp = new_ent; 1142 } 1143 } 1144 1145 RW_EXIT(&vswp->mfdbrw); 1146 return (rv); 1147 } 1148 1149 /* 1150 * Remove a multicast entry from the hashtable. 1151 * 1152 * Search hash table based on address. If match found, scan 1153 * list of ports associated with address. If specified port 1154 * found remove it from list. 1155 */ 1156 int 1157 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 1158 { 1159 mfdb_ent_t *ment = NULL; 1160 mfdb_ent_t *curr_p, *prev_p; 1161 void *tgt = NULL; 1162 1163 D1(vswp, "%s: enter", __func__); 1164 1165 if (devtype == VSW_VNETPORT) { 1166 tgt = (vsw_port_t *)arg; 1167 D2(vswp, "%s: removing port %d from mFDB for address" 1168 " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr); 1169 } else { 1170 D2(vswp, "%s: removing entry", __func__); 1171 tgt = (void *)vswp; 1172 } 1173 1174 WRITE_ENTER(&vswp->mfdbrw); 1175 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 1176 (mod_hash_val_t *)&ment) != 0) { 1177 D2(vswp, "%s: address 0x%llx not in table", __func__, addr); 1178 RW_EXIT(&vswp->mfdbrw); 1179 return (1); 1180 } 1181 1182 prev_p = curr_p = ment; 1183 1184 while (curr_p != NULL) { 1185 if (curr_p->d_addr == (void *)tgt) { 1186 if (devtype == VSW_VNETPORT) { 1187 D2(vswp, "%s: port %d found", __func__, 1188 ((vsw_port_t *)tgt)->p_instance); 1189 } else { 1190 D2(vswp, "%s: instance found", __func__); 1191 } 1192 1193 if (prev_p == curr_p) { 1194 /* 1195 * head of list, if no other element is in 1196 * list then destroy this entry, otherwise 1197 * just replace it with updated value. 1198 */ 1199 ment = curr_p->nextp; 1200 if (ment == NULL) { 1201 (void) mod_hash_destroy(vswp->mfdb, 1202 (mod_hash_val_t)addr); 1203 } else { 1204 (void) mod_hash_replace(vswp->mfdb, 1205 (mod_hash_key_t)addr, 1206 (mod_hash_val_t)ment); 1207 } 1208 } else { 1209 /* 1210 * Not head of list, no need to do 1211 * replacement, just adjust list pointers. 1212 */ 1213 prev_p->nextp = curr_p->nextp; 1214 } 1215 break; 1216 } 1217 1218 prev_p = curr_p; 1219 curr_p = curr_p->nextp; 1220 } 1221 1222 RW_EXIT(&vswp->mfdbrw); 1223 1224 D1(vswp, "%s: exit", __func__); 1225 1226 if (curr_p == NULL) 1227 return (1); 1228 kmem_free(curr_p, sizeof (mfdb_ent_t)); 1229 return (0); 1230 } 1231 1232 /* 1233 * Port is being deleted, but has registered an interest in one 1234 * or more multicast groups. Using the list of addresses maintained 1235 * within the port structure find the appropriate entry in the hash 1236 * table and remove this port from the list of interested ports. 1237 */ 1238 void 1239 vsw_del_mcst_port(vsw_port_t *port) 1240 { 1241 mcst_addr_t *mcap = NULL; 1242 vsw_t *vswp = port->p_vswp; 1243 1244 D1(vswp, "%s: enter", __func__); 1245 1246 mutex_enter(&port->mca_lock); 1247 1248 while ((mcap = port->mcap) != NULL) { 1249 1250 port->mcap = mcap->nextp; 1251 1252 mutex_exit(&port->mca_lock); 1253 1254 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 1255 mcap->addr, port); 1256 1257 /* 1258 * Remove the address from HW. The address 1259 * will actually only be removed once the ref 1260 * count within the MAC layer has dropped to 1261 * zero. I.e. we can safely call this fn even 1262 * if other ports are interested in this 1263 * address. 1264 */ 1265 mutex_enter(&vswp->mac_lock); 1266 if (vswp->mh != NULL && mcap->mac_added) { 1267 (void) mac_multicst_remove(vswp->mh, 1268 (uchar_t *)&mcap->mca); 1269 } 1270 mutex_exit(&vswp->mac_lock); 1271 1272 kmem_free(mcap, sizeof (*mcap)); 1273 1274 mutex_enter(&port->mca_lock); 1275 1276 } 1277 1278 mutex_exit(&port->mca_lock); 1279 1280 D1(vswp, "%s: exit", __func__); 1281 } 1282 1283 /* 1284 * This vsw instance is detaching, but has registered an interest in one 1285 * or more multicast groups. Using the list of addresses maintained 1286 * within the vsw structure find the appropriate entry in the hash 1287 * table and remove this instance from the list of interested ports. 1288 */ 1289 void 1290 vsw_del_mcst_vsw(vsw_t *vswp) 1291 { 1292 mcst_addr_t *next_p = NULL; 1293 1294 D1(vswp, "%s: enter", __func__); 1295 1296 mutex_enter(&vswp->mca_lock); 1297 1298 while (vswp->mcap != NULL) { 1299 DERR(vswp, "%s: deleting addr 0x%llx", 1300 __func__, vswp->mcap->addr); 1301 (void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL); 1302 1303 next_p = vswp->mcap->nextp; 1304 kmem_free(vswp->mcap, sizeof (mcst_addr_t)); 1305 vswp->mcap = next_p; 1306 } 1307 1308 vswp->mcap = NULL; 1309 mutex_exit(&vswp->mca_lock); 1310 1311 D1(vswp, "%s: exit", __func__); 1312 } 1313 1314 static uint32_t 1315 vsw_get_same_dest_list(struct ether_header *ehp, 1316 mblk_t **rhead, mblk_t **rtail, mblk_t **mpp) 1317 { 1318 uint32_t count = 0; 1319 mblk_t *bp; 1320 mblk_t *nbp; 1321 mblk_t *head = NULL; 1322 mblk_t *tail = NULL; 1323 mblk_t *prev = NULL; 1324 struct ether_header *behp; 1325 1326 /* process the chain of packets */ 1327 bp = *mpp; 1328 while (bp) { 1329 nbp = bp->b_next; 1330 behp = (struct ether_header *)bp->b_rptr; 1331 bp->b_prev = NULL; 1332 if (ether_cmp(&ehp->ether_dhost, &behp->ether_dhost) == 0) { 1333 if (prev == NULL) { 1334 *mpp = nbp; 1335 } else { 1336 prev->b_next = nbp; 1337 } 1338 bp->b_next = NULL; 1339 if (head == NULL) { 1340 head = tail = bp; 1341 } else { 1342 tail->b_next = bp; 1343 tail = bp; 1344 } 1345 count++; 1346 } else { 1347 prev = bp; 1348 } 1349 bp = nbp; 1350 } 1351 *rhead = head; 1352 *rtail = tail; 1353 DTRACE_PROBE1(vsw_same_dest, int, count); 1354 return (count); 1355 } 1356 1357 static mblk_t * 1358 vsw_dupmsgchain(mblk_t *mp) 1359 { 1360 mblk_t *nmp = NULL; 1361 mblk_t **nmpp = &nmp; 1362 1363 for (; mp != NULL; mp = mp->b_next) { 1364 if ((*nmpp = dupmsg(mp)) == NULL) { 1365 freemsgchain(nmp); 1366 return (NULL); 1367 } 1368 1369 nmpp = &((*nmpp)->b_next); 1370 } 1371 1372 return (nmp); 1373 } 1374