1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/errno.h> 31 #include <sys/debug.h> 32 #include <sys/time.h> 33 #include <sys/sysmacros.h> 34 #include <sys/systm.h> 35 #include <sys/user.h> 36 #include <sys/stropts.h> 37 #include <sys/stream.h> 38 #include <sys/strlog.h> 39 #include <sys/strsubr.h> 40 #include <sys/cmn_err.h> 41 #include <sys/cpu.h> 42 #include <sys/kmem.h> 43 #include <sys/conf.h> 44 #include <sys/ddi.h> 45 #include <sys/sunddi.h> 46 #include <sys/ksynch.h> 47 #include <sys/stat.h> 48 #include <sys/kstat.h> 49 #include <sys/vtrace.h> 50 #include <sys/strsun.h> 51 #include <sys/dlpi.h> 52 #include <sys/ethernet.h> 53 #include <net/if.h> 54 #include <sys/varargs.h> 55 #include <sys/machsystm.h> 56 #include <sys/modctl.h> 57 #include <sys/modhash.h> 58 #include <sys/mac.h> 59 #include <sys/mac_ether.h> 60 #include <sys/taskq.h> 61 #include <sys/note.h> 62 #include <sys/mach_descrip.h> 63 #include <sys/mac.h> 64 #include <sys/mdeg.h> 65 #include <sys/ldc.h> 66 #include <sys/vsw_fdb.h> 67 #include <sys/vsw.h> 68 #include <sys/vio_mailbox.h> 69 #include <sys/vnet_mailbox.h> 70 #include <sys/vnet_common.h> 71 #include <sys/vio_util.h> 72 #include <sys/sdt.h> 73 #include <sys/atomic.h> 74 75 /* Switching setup routines */ 76 void vsw_setup_switching_timeout(void *arg); 77 void vsw_stop_switching_timeout(vsw_t *vswp); 78 int vsw_setup_switching(vsw_t *); 79 static int vsw_setup_layer2(vsw_t *); 80 static int vsw_setup_layer3(vsw_t *); 81 82 /* Switching/data transmit routines */ 83 static void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, 84 vsw_port_t *port, mac_resource_handle_t); 85 static void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, 86 vsw_port_t *port, mac_resource_handle_t); 87 static int vsw_forward_all(vsw_t *vswp, mblk_t *mp, 88 int caller, vsw_port_t *port); 89 static int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, 90 int caller, vsw_port_t *port); 91 92 /* Forwarding database (FDB) routines */ 93 static vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *); 94 int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *); 95 void vsw_del_mcst_port(vsw_port_t *); 96 int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *); 97 int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *); 98 void vsw_del_mcst_vsw(vsw_t *); 99 int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port); 100 int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port); 101 102 /* Support functions */ 103 static mblk_t *vsw_dupmsgchain(mblk_t *mp); 104 static uint32_t vsw_get_same_dest_list(struct ether_header *ehp, 105 mblk_t **rhead, mblk_t **rtail, mblk_t **mpp); 106 107 108 /* 109 * Functions imported from other files. 110 */ 111 extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *); 112 extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t); 113 extern int vsw_mac_open(vsw_t *vswp); 114 extern void vsw_mac_close(vsw_t *vswp); 115 extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh, 116 mblk_t *mp, vsw_macrx_flags_t flags); 117 extern void vsw_set_addrs(vsw_t *vswp); 118 extern int vsw_get_hw_maddr(vsw_t *); 119 extern int vsw_mac_attach(vsw_t *vswp); 120 extern int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, 121 uint32_t count); 122 123 /* 124 * Tunables used in this file. 125 */ 126 extern int vsw_setup_switching_delay; 127 128 129 /* 130 * Timeout routine to setup switching mode: 131 * vsw_setup_switching() is invoked from vsw_attach() or vsw_update_md_prop() 132 * initially. If it fails and the error is EAGAIN, then this timeout handler 133 * is started to retry vsw_setup_switching(). vsw_setup_switching() is retried 134 * until we successfully finish it; or the returned error is not EAGAIN. 135 */ 136 void 137 vsw_setup_switching_timeout(void *arg) 138 { 139 vsw_t *vswp = (vsw_t *)arg; 140 int rv; 141 142 if (vswp->swtmout_enabled == B_FALSE) 143 return; 144 145 rv = vsw_setup_switching(vswp); 146 147 if (rv == 0) { 148 /* 149 * Successfully setup switching mode. 150 * Program unicst, mcst addrs of vsw 151 * interface and ports in the physdev. 152 */ 153 vsw_set_addrs(vswp); 154 } 155 156 mutex_enter(&vswp->swtmout_lock); 157 158 if (rv == EAGAIN && vswp->swtmout_enabled == B_TRUE) { 159 /* 160 * Reschedule timeout() if the error is EAGAIN and the 161 * timeout is still enabled. For errors other than EAGAIN, 162 * we simply return without rescheduling timeout(). 163 */ 164 vswp->swtmout_id = 165 timeout(vsw_setup_switching_timeout, vswp, 166 (vsw_setup_switching_delay * drv_usectohz(MICROSEC))); 167 goto exit; 168 } 169 170 /* timeout handler completed */ 171 vswp->swtmout_enabled = B_FALSE; 172 vswp->swtmout_id = 0; 173 174 exit: 175 mutex_exit(&vswp->swtmout_lock); 176 } 177 178 /* 179 * Cancel the timeout handler to setup switching mode. 180 */ 181 void 182 vsw_stop_switching_timeout(vsw_t *vswp) 183 { 184 timeout_id_t tid; 185 186 mutex_enter(&vswp->swtmout_lock); 187 188 tid = vswp->swtmout_id; 189 190 if (tid != 0) { 191 /* signal timeout handler to stop */ 192 vswp->swtmout_enabled = B_FALSE; 193 vswp->swtmout_id = 0; 194 mutex_exit(&vswp->swtmout_lock); 195 196 (void) untimeout(tid); 197 } else { 198 mutex_exit(&vswp->swtmout_lock); 199 } 200 201 (void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE); 202 203 mutex_enter(&vswp->mac_lock); 204 vswp->mac_open_retries = 0; 205 mutex_exit(&vswp->mac_lock); 206 } 207 208 /* 209 * Setup the required switching mode. 210 * This routine is invoked from vsw_attach() or vsw_update_md_prop() 211 * initially. If it fails and the error is EAGAIN, then a timeout handler 212 * is started to retry vsw_setup_switching(), until it successfully finishes; 213 * or the returned error is not EAGAIN. 214 * 215 * Returns: 216 * 0 on success. 217 * EAGAIN if retry is needed. 218 * 1 on all other failures. 219 */ 220 int 221 vsw_setup_switching(vsw_t *vswp) 222 { 223 int i, rv = 1; 224 225 D1(vswp, "%s: enter", __func__); 226 227 /* 228 * Select best switching mode. 229 * Note that we start from the saved smode_idx. This is done as 230 * this routine can be called from the timeout handler to retry 231 * setting up a specific mode. Currently only the function which 232 * sets up layer2/promisc mode returns EAGAIN if the underlying 233 * physical device is not available yet, causing retries. 234 */ 235 for (i = vswp->smode_idx; i < vswp->smode_num; i++) { 236 vswp->smode_idx = i; 237 switch (vswp->smode[i]) { 238 case VSW_LAYER2: 239 case VSW_LAYER2_PROMISC: 240 rv = vsw_setup_layer2(vswp); 241 break; 242 243 case VSW_LAYER3: 244 rv = vsw_setup_layer3(vswp); 245 break; 246 247 default: 248 DERR(vswp, "unknown switch mode"); 249 break; 250 } 251 252 if ((rv == 0) || (rv == EAGAIN)) 253 break; 254 255 /* all other errors(rv != 0): continue & select the next mode */ 256 rv = 1; 257 } 258 259 if (rv && (rv != EAGAIN)) { 260 cmn_err(CE_WARN, "!vsw%d: Unable to setup specified " 261 "switching mode", vswp->instance); 262 } else if (rv == 0) { 263 (void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE); 264 } 265 266 D2(vswp, "%s: Operating in mode %d", __func__, 267 vswp->smode[vswp->smode_idx]); 268 269 D1(vswp, "%s: exit", __func__); 270 271 return (rv); 272 } 273 274 /* 275 * Setup for layer 2 switching. 276 * 277 * Returns: 278 * 0 on success. 279 * EAGAIN if retry is needed. 280 * EIO on all other failures. 281 */ 282 static int 283 vsw_setup_layer2(vsw_t *vswp) 284 { 285 int rv; 286 287 D1(vswp, "%s: enter", __func__); 288 289 vswp->vsw_switch_frame = vsw_switch_l2_frame; 290 291 rv = strlen(vswp->physname); 292 if (rv == 0) { 293 /* 294 * Physical device name is NULL, which is 295 * required for layer 2. 296 */ 297 cmn_err(CE_WARN, "!vsw%d: no physical device name specified", 298 vswp->instance); 299 return (EIO); 300 } 301 302 mutex_enter(&vswp->mac_lock); 303 304 rv = vsw_mac_open(vswp); 305 if (rv != 0) { 306 if (rv != EAGAIN) { 307 cmn_err(CE_WARN, "!vsw%d: Unable to open physical " 308 "device: %s\n", vswp->instance, vswp->physname); 309 } 310 mutex_exit(&vswp->mac_lock); 311 return (rv); 312 } 313 314 if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) { 315 /* 316 * Verify that underlying device can support multiple 317 * unicast mac addresses. 318 */ 319 rv = vsw_get_hw_maddr(vswp); 320 if (rv != 0) { 321 cmn_err(CE_WARN, "!vsw%d: Unable to setup " 322 "layer2 switching", vswp->instance); 323 goto exit_error; 324 } 325 } 326 327 /* 328 * Attempt to link into the MAC layer so we can get 329 * and send packets out over the physical adapter. 330 */ 331 rv = vsw_mac_attach(vswp); 332 if (rv != 0) { 333 /* 334 * Registration with the MAC layer has failed, 335 * so return error so that can fall back to next 336 * prefered switching method. 337 */ 338 cmn_err(CE_WARN, "!vsw%d: Unable to setup physical device: " 339 "%s\n", vswp->instance, vswp->physname); 340 goto exit_error; 341 } 342 343 D1(vswp, "%s: exit", __func__); 344 345 mutex_exit(&vswp->mac_lock); 346 return (0); 347 348 exit_error: 349 vsw_mac_close(vswp); 350 mutex_exit(&vswp->mac_lock); 351 return (EIO); 352 } 353 354 static int 355 vsw_setup_layer3(vsw_t *vswp) 356 { 357 D1(vswp, "%s: enter", __func__); 358 359 D2(vswp, "%s: operating in layer 3 mode", __func__); 360 vswp->vsw_switch_frame = vsw_switch_l3_frame; 361 362 D1(vswp, "%s: exit", __func__); 363 364 return (0); 365 } 366 367 /* 368 * Switch the given ethernet frame when operating in layer 2 mode. 369 * 370 * vswp: pointer to the vsw instance 371 * mp: pointer to chain of ethernet frame(s) to be switched 372 * caller: identifies the source of this frame as: 373 * 1. VSW_VNETPORT - a vsw port (connected to a vnet). 374 * 2. VSW_PHYSDEV - the physical ethernet device 375 * 3. VSW_LOCALDEV - vsw configured as a virtual interface 376 * arg: argument provided by the caller. 377 * 1. for VNETPORT - pointer to the corresponding vsw_port_t. 378 * 2. for PHYSDEV - NULL 379 * 3. for LOCALDEV - pointer to to this vsw_t(self) 380 */ 381 void 382 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, 383 vsw_port_t *arg, mac_resource_handle_t mrh) 384 { 385 struct ether_header *ehp; 386 vsw_port_t *port = NULL; 387 mblk_t *bp, *ret_m; 388 mblk_t *mpt = NULL; 389 uint32_t count; 390 vsw_port_list_t *plist = &vswp->plist; 391 392 D1(vswp, "%s: enter (caller %d)", __func__, caller); 393 394 /* 395 * PERF: rather than breaking up the chain here, scan it 396 * to find all mblks heading to same destination and then 397 * pass that sub-chain to the lower transmit functions. 398 */ 399 400 /* process the chain of packets */ 401 bp = mp; 402 while (bp) { 403 ehp = (struct ether_header *)bp->b_rptr; 404 count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp); 405 ASSERT(count != 0); 406 407 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 408 __func__, MBLKSIZE(mp), MBLKL(mp)); 409 410 if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) { 411 /* 412 * If destination is VSW_LOCALDEV (vsw as an eth 413 * interface) and if the device is up & running, 414 * send the packet up the stack on this host. 415 * If the virtual interface is down, drop the packet. 416 */ 417 if (caller != VSW_LOCALDEV) { 418 vsw_mac_rx(vswp, mrh, mp, VSW_MACRX_FREEMSG); 419 } else { 420 freemsgchain(mp); 421 } 422 continue; 423 } 424 425 READ_ENTER(&plist->lockrw); 426 port = vsw_lookup_fdb(vswp, ehp); 427 if (port) { 428 /* 429 * Mark the port as in-use before releasing the lockrw. 430 */ 431 VSW_PORT_REFHOLD(port); 432 RW_EXIT(&plist->lockrw); 433 434 /* 435 * If plumbed and in promisc mode then copy msg 436 * and send up the stack. 437 */ 438 vsw_mac_rx(vswp, mrh, mp, 439 VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG); 440 441 /* 442 * If the destination is in FDB, the packet 443 * should be forwarded to the correponding 444 * vsw_port (connected to a vnet device - 445 * VSW_VNETPORT) 446 */ 447 (void) vsw_portsend(port, mp, mpt, count); 448 449 /* 450 * Decrement use count in port. 451 */ 452 VSW_PORT_REFRELE(port); 453 } else { 454 RW_EXIT(&plist->lockrw); 455 /* 456 * Destination not in FDB. 457 * 458 * If the destination is broadcast or 459 * multicast forward the packet to all 460 * (VNETPORTs, PHYSDEV, LOCALDEV), 461 * except the caller. 462 */ 463 if (IS_BROADCAST(ehp)) { 464 D2(vswp, "%s: BROADCAST pkt", __func__); 465 (void) vsw_forward_all(vswp, mp, caller, arg); 466 } else if (IS_MULTICAST(ehp)) { 467 D2(vswp, "%s: MULTICAST pkt", __func__); 468 (void) vsw_forward_grp(vswp, mp, caller, arg); 469 } else { 470 /* 471 * If the destination is unicast, and came 472 * from either a logical network device or 473 * the switch itself when it is plumbed, then 474 * send it out on the physical device and also 475 * up the stack if the logical interface is 476 * in promiscious mode. 477 * 478 * NOTE: The assumption here is that if we 479 * cannot find the destination in our fdb, its 480 * a unicast address, and came from either a 481 * vnet or down the stack (when plumbed) it 482 * must be destinded for an ethernet device 483 * outside our ldoms. 484 */ 485 if (caller == VSW_VNETPORT) { 486 /* promisc check copy etc */ 487 vsw_mac_rx(vswp, mrh, mp, 488 VSW_MACRX_PROMISC | 489 VSW_MACRX_COPYMSG); 490 491 if ((ret_m = vsw_tx_msg(vswp, mp)) 492 != NULL) { 493 DERR(vswp, "%s: drop mblks to " 494 "phys dev", __func__); 495 freemsgchain(ret_m); 496 } 497 498 } else if (caller == VSW_PHYSDEV) { 499 /* 500 * Pkt seen because card in promisc 501 * mode. Send up stack if plumbed in 502 * promisc mode, else drop it. 503 */ 504 vsw_mac_rx(vswp, mrh, mp, 505 VSW_MACRX_PROMISC | 506 VSW_MACRX_FREEMSG); 507 508 } else if (caller == VSW_LOCALDEV) { 509 /* 510 * Pkt came down the stack, send out 511 * over physical device. 512 */ 513 if ((ret_m = vsw_tx_msg(vswp, mp)) 514 != NULL) { 515 DERR(vswp, "%s: drop mblks to " 516 "phys dev", __func__); 517 freemsgchain(ret_m); 518 } 519 } 520 } 521 } 522 } 523 D1(vswp, "%s: exit\n", __func__); 524 } 525 526 /* 527 * Switch ethernet frame when in layer 3 mode (i.e. using IP 528 * layer to do the routing). 529 * 530 * There is a large amount of overlap between this function and 531 * vsw_switch_l2_frame. At some stage we need to revisit and refactor 532 * both these functions. 533 */ 534 void 535 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, 536 vsw_port_t *arg, mac_resource_handle_t mrh) 537 { 538 struct ether_header *ehp; 539 vsw_port_t *port = NULL; 540 mblk_t *bp = NULL; 541 mblk_t *mpt; 542 uint32_t count; 543 vsw_port_list_t *plist = &vswp->plist; 544 545 D1(vswp, "%s: enter (caller %d)", __func__, caller); 546 547 /* 548 * In layer 3 mode should only ever be switching packets 549 * between IP layer and vnet devices. So make sure thats 550 * who is invoking us. 551 */ 552 if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) { 553 DERR(vswp, "%s: unexpected caller (%d)", __func__, caller); 554 freemsgchain(mp); 555 return; 556 } 557 558 /* process the chain of packets */ 559 bp = mp; 560 while (bp) { 561 ehp = (struct ether_header *)bp->b_rptr; 562 count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp); 563 ASSERT(count != 0); 564 565 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 566 __func__, MBLKSIZE(mp), MBLKL(mp)); 567 568 READ_ENTER(&plist->lockrw); 569 port = vsw_lookup_fdb(vswp, ehp); 570 if (port) { 571 /* 572 * Mark the port as in-use before releasing the lockrw. 573 */ 574 VSW_PORT_REFHOLD(port); 575 RW_EXIT(&plist->lockrw); 576 577 D2(vswp, "%s: sending to target port", __func__); 578 (void) vsw_portsend(port, mp, mpt, count); 579 580 /* 581 * Decrement ref count. 582 */ 583 VSW_PORT_REFRELE(port); 584 } else { 585 RW_EXIT(&plist->lockrw); 586 /* 587 * Destination not in FDB 588 * 589 * If the destination is broadcast or 590 * multicast forward the packet to all 591 * (VNETPORTs, PHYSDEV, LOCALDEV), 592 * except the caller. 593 */ 594 if (IS_BROADCAST(ehp)) { 595 D2(vswp, "%s: BROADCAST pkt", __func__); 596 (void) vsw_forward_all(vswp, mp, caller, arg); 597 } else if (IS_MULTICAST(ehp)) { 598 D2(vswp, "%s: MULTICAST pkt", __func__); 599 (void) vsw_forward_grp(vswp, mp, caller, arg); 600 } else { 601 /* 602 * Unicast pkt from vnet that we don't have 603 * an FDB entry for, so must be destinded for 604 * the outside world. Attempt to send up to the 605 * IP layer to allow it to deal with it. 606 */ 607 if (caller == VSW_VNETPORT) { 608 vsw_mac_rx(vswp, mrh, 609 mp, VSW_MACRX_FREEMSG); 610 } 611 } 612 } 613 } 614 615 D1(vswp, "%s: exit", __func__); 616 } 617 618 /* 619 * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV), 620 * except the caller (port on which frame arrived). 621 */ 622 static int 623 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 624 { 625 vsw_port_list_t *plist = &vswp->plist; 626 vsw_port_t *portp; 627 mblk_t *nmp = NULL; 628 mblk_t *ret_m = NULL; 629 int skip_port = 0; 630 631 D1(vswp, "vsw_forward_all: enter\n"); 632 633 /* 634 * Broadcast message from inside ldoms so send to outside 635 * world if in either of layer 2 modes. 636 */ 637 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 638 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 639 ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) { 640 641 nmp = vsw_dupmsgchain(mp); 642 if (nmp) { 643 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 644 DERR(vswp, "%s: dropping pkt(s) " 645 "consisting of %ld bytes of data for" 646 " physical device", __func__, MBLKL(ret_m)); 647 freemsgchain(ret_m); 648 } 649 } 650 } 651 652 if (caller == VSW_VNETPORT) 653 skip_port = 1; 654 655 /* 656 * Broadcast message from other vnet (layer 2 or 3) or outside 657 * world (layer 2 only), send up stack if plumbed. 658 */ 659 if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) { 660 vsw_mac_rx(vswp, NULL, mp, VSW_MACRX_COPYMSG); 661 } 662 663 /* send it to all VNETPORTs */ 664 READ_ENTER(&plist->lockrw); 665 for (portp = plist->head; portp != NULL; portp = portp->p_next) { 666 D2(vswp, "vsw_forward_all: port %d", portp->p_instance); 667 /* 668 * Caution ! - don't reorder these two checks as arg 669 * will be NULL if the caller is PHYSDEV. skip_port is 670 * only set if caller is VNETPORT. 671 */ 672 if ((skip_port) && (portp == arg)) { 673 continue; 674 } else { 675 nmp = vsw_dupmsgchain(mp); 676 if (nmp) { 677 mblk_t *mpt = nmp; 678 uint32_t count = 1; 679 680 /* Find tail */ 681 while (mpt->b_next != NULL) { 682 mpt = mpt->b_next; 683 count++; 684 } 685 /* 686 * The plist->lockrw is protecting the 687 * portp from getting destroyed here. 688 * So, no ref_cnt is incremented here. 689 */ 690 (void) vsw_portsend(portp, nmp, mpt, count); 691 } else { 692 DERR(vswp, "vsw_forward_all: nmp NULL"); 693 } 694 } 695 } 696 RW_EXIT(&plist->lockrw); 697 698 freemsgchain(mp); 699 700 D1(vswp, "vsw_forward_all: exit\n"); 701 return (0); 702 } 703 704 /* 705 * Forward pkts to any devices or interfaces which have registered 706 * an interest in them (i.e. multicast groups). 707 */ 708 static int 709 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 710 { 711 struct ether_header *ehp = (struct ether_header *)mp->b_rptr; 712 mfdb_ent_t *entp = NULL; 713 mfdb_ent_t *tpp = NULL; 714 vsw_port_t *port; 715 uint64_t key = 0; 716 mblk_t *nmp = NULL; 717 mblk_t *ret_m = NULL; 718 boolean_t check_if = B_TRUE; 719 720 /* 721 * Convert address to hash table key 722 */ 723 KEY_HASH(key, ehp->ether_dhost); 724 725 D1(vswp, "%s: key 0x%llx", __func__, key); 726 727 /* 728 * If pkt came from either a vnet or down the stack (if we are 729 * plumbed) and we are in layer 2 mode, then we send the pkt out 730 * over the physical adapter, and then check to see if any other 731 * vnets are interested in it. 732 */ 733 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 734 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 735 ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) { 736 nmp = vsw_dupmsgchain(mp); 737 if (nmp) { 738 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 739 DERR(vswp, "%s: dropping pkt(s) consisting of " 740 "%ld bytes of data for physical device", 741 __func__, MBLKL(ret_m)); 742 freemsgchain(ret_m); 743 } 744 } 745 } 746 747 READ_ENTER(&vswp->mfdbrw); 748 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key, 749 (mod_hash_val_t *)&entp) != 0) { 750 D3(vswp, "%s: no table entry found for addr 0x%llx", 751 __func__, key); 752 } else { 753 /* 754 * Send to list of devices associated with this address... 755 */ 756 for (tpp = entp; tpp != NULL; tpp = tpp->nextp) { 757 758 /* dont send to ourselves */ 759 if ((caller == VSW_VNETPORT) && 760 (tpp->d_addr == (void *)arg)) { 761 port = (vsw_port_t *)tpp->d_addr; 762 D3(vswp, "%s: not sending to ourselves" 763 " : port %d", __func__, port->p_instance); 764 continue; 765 766 } else if ((caller == VSW_LOCALDEV) && 767 (tpp->d_type == VSW_LOCALDEV)) { 768 D2(vswp, "%s: not sending back up stack", 769 __func__); 770 continue; 771 } 772 773 if (tpp->d_type == VSW_VNETPORT) { 774 port = (vsw_port_t *)tpp->d_addr; 775 D3(vswp, "%s: sending to port %ld for addr " 776 "0x%llx", __func__, port->p_instance, key); 777 778 nmp = vsw_dupmsgchain(mp); 779 if (nmp) { 780 mblk_t *mpt = nmp; 781 uint32_t count = 1; 782 783 /* Find tail */ 784 while (mpt->b_next != NULL) { 785 mpt = mpt->b_next; 786 count++; 787 } 788 /* 789 * The vswp->mfdbrw is protecting the 790 * portp from getting destroyed here. 791 * So, no ref_cnt is incremented here. 792 */ 793 (void) vsw_portsend(port, nmp, mpt, 794 count); 795 } 796 } else { 797 vsw_mac_rx(vswp, NULL, 798 mp, VSW_MACRX_COPYMSG); 799 D2(vswp, "%s: sending up stack" 800 " for addr 0x%llx", __func__, key); 801 check_if = B_FALSE; 802 } 803 } 804 } 805 806 RW_EXIT(&vswp->mfdbrw); 807 808 /* 809 * If the pkt came from either a vnet or from physical device, 810 * and if we havent already sent the pkt up the stack then we 811 * check now if we can/should (i.e. the interface is plumbed 812 * and in promisc mode). 813 */ 814 if ((check_if) && 815 ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) { 816 vsw_mac_rx(vswp, NULL, mp, 817 VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG); 818 } 819 820 freemsgchain(mp); 821 822 D1(vswp, "%s: exit", __func__); 823 824 return (0); 825 } 826 827 /* 828 * Add an entry into FDB, for the given mac address and port_id. 829 * Returns 0 on success, 1 on failure. 830 * 831 * Lock protecting FDB must be held by calling process. 832 */ 833 int 834 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port) 835 { 836 uint64_t addr = 0; 837 838 D1(vswp, "%s: enter", __func__); 839 840 KEY_HASH(addr, port->p_macaddr); 841 842 D2(vswp, "%s: key = 0x%llx", __func__, addr); 843 844 /* 845 * Note: duplicate keys will be rejected by mod_hash. 846 */ 847 if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr, 848 (mod_hash_val_t)port) != 0) { 849 DERR(vswp, "%s: unable to add entry into fdb.", __func__); 850 return (1); 851 } 852 853 D1(vswp, "%s: exit", __func__); 854 return (0); 855 } 856 857 /* 858 * Remove an entry from FDB. 859 * Returns 0 on success, 1 on failure. 860 */ 861 int 862 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port) 863 { 864 uint64_t addr = 0; 865 866 D1(vswp, "%s: enter", __func__); 867 868 KEY_HASH(addr, port->p_macaddr); 869 870 D2(vswp, "%s: key = 0x%llx", __func__, addr); 871 872 (void) mod_hash_destroy(vswp->fdb, (mod_hash_val_t)addr); 873 874 D1(vswp, "%s: enter", __func__); 875 876 return (0); 877 } 878 879 /* 880 * Search fdb for a given mac address. 881 * Returns pointer to the entry if found, else returns NULL. 882 */ 883 static vsw_port_t * 884 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp) 885 { 886 uint64_t key = 0; 887 vsw_port_t *port = NULL; 888 889 D1(vswp, "%s: enter", __func__); 890 891 KEY_HASH(key, ehp->ether_dhost); 892 893 D2(vswp, "%s: key = 0x%llx", __func__, key); 894 895 if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key, 896 (mod_hash_val_t *)&port) != 0) { 897 D2(vswp, "%s: no port found", __func__); 898 return (NULL); 899 } 900 901 D1(vswp, "%s: exit", __func__); 902 903 return (port); 904 } 905 906 /* 907 * Add or remove multicast address(es). 908 * 909 * Returns 0 on success, 1 on failure. 910 */ 911 int 912 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port) 913 { 914 mcst_addr_t *mcst_p = NULL; 915 vsw_t *vswp = port->p_vswp; 916 uint64_t addr = 0x0; 917 int i; 918 919 D1(vswp, "%s: enter", __func__); 920 921 D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count); 922 923 for (i = 0; i < mcst_pkt->count; i++) { 924 /* 925 * Convert address into form that can be used 926 * as hash table key. 927 */ 928 KEY_HASH(addr, mcst_pkt->mca[i]); 929 930 /* 931 * Add or delete the specified address/port combination. 932 */ 933 if (mcst_pkt->set == 0x1) { 934 D3(vswp, "%s: adding multicast address 0x%llx for " 935 "port %ld", __func__, addr, port->p_instance); 936 if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 937 /* 938 * Update the list of multicast 939 * addresses contained within the 940 * port structure to include this new 941 * one. 942 */ 943 mcst_p = kmem_zalloc(sizeof (mcst_addr_t), 944 KM_NOSLEEP); 945 if (mcst_p == NULL) { 946 DERR(vswp, "%s: unable to alloc mem", 947 __func__); 948 (void) vsw_del_mcst(vswp, 949 VSW_VNETPORT, addr, port); 950 return (1); 951 } 952 953 mcst_p->nextp = NULL; 954 mcst_p->addr = addr; 955 ether_copy(&mcst_pkt->mca[i], &mcst_p->mca); 956 957 /* 958 * Program the address into HW. If the addr 959 * has already been programmed then the MAC 960 * just increments a ref counter (which is 961 * used when the address is being deleted) 962 */ 963 mutex_enter(&vswp->mac_lock); 964 if (vswp->mh != NULL) { 965 if (mac_multicst_add(vswp->mh, 966 (uchar_t *)&mcst_pkt->mca[i])) { 967 mutex_exit(&vswp->mac_lock); 968 cmn_err(CE_WARN, "!vsw%d: " 969 "unable to add multicast " 970 "address: %s\n", 971 vswp->instance, 972 ether_sprintf((void *) 973 &mcst_p->mca)); 974 (void) vsw_del_mcst(vswp, 975 VSW_VNETPORT, addr, port); 976 kmem_free(mcst_p, 977 sizeof (*mcst_p)); 978 return (1); 979 } 980 mcst_p->mac_added = B_TRUE; 981 } 982 mutex_exit(&vswp->mac_lock); 983 984 mutex_enter(&port->mca_lock); 985 mcst_p->nextp = port->mcap; 986 port->mcap = mcst_p; 987 mutex_exit(&port->mca_lock); 988 989 } else { 990 DERR(vswp, "%s: error adding multicast " 991 "address 0x%llx for port %ld", 992 __func__, addr, port->p_instance); 993 return (1); 994 } 995 } else { 996 /* 997 * Delete an entry from the multicast hash 998 * table and update the address list 999 * appropriately. 1000 */ 1001 if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 1002 D3(vswp, "%s: deleting multicast address " 1003 "0x%llx for port %ld", __func__, addr, 1004 port->p_instance); 1005 1006 mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr); 1007 ASSERT(mcst_p != NULL); 1008 1009 /* 1010 * Remove the address from HW. The address 1011 * will actually only be removed once the ref 1012 * count within the MAC layer has dropped to 1013 * zero. I.e. we can safely call this fn even 1014 * if other ports are interested in this 1015 * address. 1016 */ 1017 mutex_enter(&vswp->mac_lock); 1018 if (vswp->mh != NULL && mcst_p->mac_added) { 1019 if (mac_multicst_remove(vswp->mh, 1020 (uchar_t *)&mcst_pkt->mca[i])) { 1021 mutex_exit(&vswp->mac_lock); 1022 cmn_err(CE_WARN, "!vsw%d: " 1023 "unable to remove mcast " 1024 "address: %s\n", 1025 vswp->instance, 1026 ether_sprintf((void *) 1027 &mcst_p->mca)); 1028 kmem_free(mcst_p, 1029 sizeof (*mcst_p)); 1030 return (1); 1031 } 1032 mcst_p->mac_added = B_FALSE; 1033 } 1034 mutex_exit(&vswp->mac_lock); 1035 kmem_free(mcst_p, sizeof (*mcst_p)); 1036 1037 } else { 1038 DERR(vswp, "%s: error deleting multicast " 1039 "addr 0x%llx for port %ld", 1040 __func__, addr, port->p_instance); 1041 return (1); 1042 } 1043 } 1044 } 1045 D1(vswp, "%s: exit", __func__); 1046 return (0); 1047 } 1048 1049 /* 1050 * Add a new multicast entry. 1051 * 1052 * Search hash table based on address. If match found then 1053 * update associated val (which is chain of ports), otherwise 1054 * create new key/val (addr/port) pair and insert into table. 1055 */ 1056 int 1057 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 1058 { 1059 int dup = 0; 1060 int rv = 0; 1061 mfdb_ent_t *ment = NULL; 1062 mfdb_ent_t *tmp_ent = NULL; 1063 mfdb_ent_t *new_ent = NULL; 1064 void *tgt = NULL; 1065 1066 if (devtype == VSW_VNETPORT) { 1067 /* 1068 * Being invoked from a vnet. 1069 */ 1070 ASSERT(arg != NULL); 1071 tgt = arg; 1072 D2(NULL, "%s: port %d : address 0x%llx", __func__, 1073 ((vsw_port_t *)arg)->p_instance, addr); 1074 } else { 1075 /* 1076 * We are being invoked via the m_multicst mac entry 1077 * point. 1078 */ 1079 D2(NULL, "%s: address 0x%llx", __func__, addr); 1080 tgt = (void *)vswp; 1081 } 1082 1083 WRITE_ENTER(&vswp->mfdbrw); 1084 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 1085 (mod_hash_val_t *)&ment) != 0) { 1086 1087 /* address not currently in table */ 1088 ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 1089 ment->d_addr = (void *)tgt; 1090 ment->d_type = devtype; 1091 ment->nextp = NULL; 1092 1093 if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr, 1094 (mod_hash_val_t)ment) != 0) { 1095 DERR(vswp, "%s: hash table insertion failed", __func__); 1096 kmem_free(ment, sizeof (mfdb_ent_t)); 1097 rv = 1; 1098 } else { 1099 D2(vswp, "%s: added initial entry for 0x%llx to " 1100 "table", __func__, addr); 1101 } 1102 } else { 1103 /* 1104 * Address in table. Check to see if specified port 1105 * is already associated with the address. If not add 1106 * it now. 1107 */ 1108 tmp_ent = ment; 1109 while (tmp_ent != NULL) { 1110 if (tmp_ent->d_addr == (void *)tgt) { 1111 if (devtype == VSW_VNETPORT) { 1112 DERR(vswp, "%s: duplicate port entry " 1113 "found for portid %ld and key " 1114 "0x%llx", __func__, 1115 ((vsw_port_t *)arg)->p_instance, 1116 addr); 1117 } else { 1118 DERR(vswp, "%s: duplicate entry found" 1119 "for key 0x%llx", __func__, addr); 1120 } 1121 rv = 1; 1122 dup = 1; 1123 break; 1124 } 1125 tmp_ent = tmp_ent->nextp; 1126 } 1127 1128 /* 1129 * Port not on list so add it to end now. 1130 */ 1131 if (0 == dup) { 1132 D2(vswp, "%s: added entry for 0x%llx to table", 1133 __func__, addr); 1134 new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 1135 new_ent->d_addr = (void *)tgt; 1136 new_ent->d_type = devtype; 1137 new_ent->nextp = NULL; 1138 1139 tmp_ent = ment; 1140 while (tmp_ent->nextp != NULL) 1141 tmp_ent = tmp_ent->nextp; 1142 1143 tmp_ent->nextp = new_ent; 1144 } 1145 } 1146 1147 RW_EXIT(&vswp->mfdbrw); 1148 return (rv); 1149 } 1150 1151 /* 1152 * Remove a multicast entry from the hashtable. 1153 * 1154 * Search hash table based on address. If match found, scan 1155 * list of ports associated with address. If specified port 1156 * found remove it from list. 1157 */ 1158 int 1159 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 1160 { 1161 mfdb_ent_t *ment = NULL; 1162 mfdb_ent_t *curr_p, *prev_p; 1163 void *tgt = NULL; 1164 1165 D1(vswp, "%s: enter", __func__); 1166 1167 if (devtype == VSW_VNETPORT) { 1168 tgt = (vsw_port_t *)arg; 1169 D2(vswp, "%s: removing port %d from mFDB for address" 1170 " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr); 1171 } else { 1172 D2(vswp, "%s: removing entry", __func__); 1173 tgt = (void *)vswp; 1174 } 1175 1176 WRITE_ENTER(&vswp->mfdbrw); 1177 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 1178 (mod_hash_val_t *)&ment) != 0) { 1179 D2(vswp, "%s: address 0x%llx not in table", __func__, addr); 1180 RW_EXIT(&vswp->mfdbrw); 1181 return (1); 1182 } 1183 1184 prev_p = curr_p = ment; 1185 1186 while (curr_p != NULL) { 1187 if (curr_p->d_addr == (void *)tgt) { 1188 if (devtype == VSW_VNETPORT) { 1189 D2(vswp, "%s: port %d found", __func__, 1190 ((vsw_port_t *)tgt)->p_instance); 1191 } else { 1192 D2(vswp, "%s: instance found", __func__); 1193 } 1194 1195 if (prev_p == curr_p) { 1196 /* 1197 * head of list, if no other element is in 1198 * list then destroy this entry, otherwise 1199 * just replace it with updated value. 1200 */ 1201 ment = curr_p->nextp; 1202 if (ment == NULL) { 1203 (void) mod_hash_destroy(vswp->mfdb, 1204 (mod_hash_val_t)addr); 1205 } else { 1206 (void) mod_hash_replace(vswp->mfdb, 1207 (mod_hash_key_t)addr, 1208 (mod_hash_val_t)ment); 1209 } 1210 } else { 1211 /* 1212 * Not head of list, no need to do 1213 * replacement, just adjust list pointers. 1214 */ 1215 prev_p->nextp = curr_p->nextp; 1216 } 1217 break; 1218 } 1219 1220 prev_p = curr_p; 1221 curr_p = curr_p->nextp; 1222 } 1223 1224 RW_EXIT(&vswp->mfdbrw); 1225 1226 D1(vswp, "%s: exit", __func__); 1227 1228 if (curr_p == NULL) 1229 return (1); 1230 kmem_free(curr_p, sizeof (mfdb_ent_t)); 1231 return (0); 1232 } 1233 1234 /* 1235 * Port is being deleted, but has registered an interest in one 1236 * or more multicast groups. Using the list of addresses maintained 1237 * within the port structure find the appropriate entry in the hash 1238 * table and remove this port from the list of interested ports. 1239 */ 1240 void 1241 vsw_del_mcst_port(vsw_port_t *port) 1242 { 1243 mcst_addr_t *mcap = NULL; 1244 vsw_t *vswp = port->p_vswp; 1245 1246 D1(vswp, "%s: enter", __func__); 1247 1248 mutex_enter(&port->mca_lock); 1249 1250 while ((mcap = port->mcap) != NULL) { 1251 1252 port->mcap = mcap->nextp; 1253 1254 mutex_exit(&port->mca_lock); 1255 1256 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 1257 mcap->addr, port); 1258 1259 /* 1260 * Remove the address from HW. The address 1261 * will actually only be removed once the ref 1262 * count within the MAC layer has dropped to 1263 * zero. I.e. we can safely call this fn even 1264 * if other ports are interested in this 1265 * address. 1266 */ 1267 mutex_enter(&vswp->mac_lock); 1268 if (vswp->mh != NULL && mcap->mac_added) { 1269 (void) mac_multicst_remove(vswp->mh, 1270 (uchar_t *)&mcap->mca); 1271 } 1272 mutex_exit(&vswp->mac_lock); 1273 1274 kmem_free(mcap, sizeof (*mcap)); 1275 1276 mutex_enter(&port->mca_lock); 1277 1278 } 1279 1280 mutex_exit(&port->mca_lock); 1281 1282 D1(vswp, "%s: exit", __func__); 1283 } 1284 1285 /* 1286 * This vsw instance is detaching, but has registered an interest in one 1287 * or more multicast groups. Using the list of addresses maintained 1288 * within the vsw structure find the appropriate entry in the hash 1289 * table and remove this instance from the list of interested ports. 1290 */ 1291 void 1292 vsw_del_mcst_vsw(vsw_t *vswp) 1293 { 1294 mcst_addr_t *next_p = NULL; 1295 1296 D1(vswp, "%s: enter", __func__); 1297 1298 mutex_enter(&vswp->mca_lock); 1299 1300 while (vswp->mcap != NULL) { 1301 DERR(vswp, "%s: deleting addr 0x%llx", 1302 __func__, vswp->mcap->addr); 1303 (void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL); 1304 1305 next_p = vswp->mcap->nextp; 1306 kmem_free(vswp->mcap, sizeof (mcst_addr_t)); 1307 vswp->mcap = next_p; 1308 } 1309 1310 vswp->mcap = NULL; 1311 mutex_exit(&vswp->mca_lock); 1312 1313 D1(vswp, "%s: exit", __func__); 1314 } 1315 1316 static uint32_t 1317 vsw_get_same_dest_list(struct ether_header *ehp, 1318 mblk_t **rhead, mblk_t **rtail, mblk_t **mpp) 1319 { 1320 uint32_t count = 0; 1321 mblk_t *bp; 1322 mblk_t *nbp; 1323 mblk_t *head = NULL; 1324 mblk_t *tail = NULL; 1325 mblk_t *prev = NULL; 1326 struct ether_header *behp; 1327 1328 /* process the chain of packets */ 1329 bp = *mpp; 1330 while (bp) { 1331 nbp = bp->b_next; 1332 behp = (struct ether_header *)bp->b_rptr; 1333 bp->b_prev = NULL; 1334 if (ether_cmp(&ehp->ether_dhost, &behp->ether_dhost) == 0) { 1335 if (prev == NULL) { 1336 *mpp = nbp; 1337 } else { 1338 prev->b_next = nbp; 1339 } 1340 bp->b_next = NULL; 1341 if (head == NULL) { 1342 head = tail = bp; 1343 } else { 1344 tail->b_next = bp; 1345 tail = bp; 1346 } 1347 count++; 1348 } else { 1349 prev = bp; 1350 } 1351 bp = nbp; 1352 } 1353 *rhead = head; 1354 *rtail = tail; 1355 DTRACE_PROBE1(vsw_same_dest, int, count); 1356 return (count); 1357 } 1358 1359 static mblk_t * 1360 vsw_dupmsgchain(mblk_t *mp) 1361 { 1362 mblk_t *nmp = NULL; 1363 mblk_t **nmpp = &nmp; 1364 1365 for (; mp != NULL; mp = mp->b_next) { 1366 if ((*nmpp = dupmsg(mp)) == NULL) { 1367 freemsgchain(nmp); 1368 return (NULL); 1369 } 1370 1371 nmpp = &((*nmpp)->b_next); 1372 } 1373 1374 return (nmp); 1375 } 1376