1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/strsun.h> 28 #include <sys/sdt.h> 29 #include <sys/mac.h> 30 #include <sys/mac_impl.h> 31 #include <sys/mac_client_impl.h> 32 #include <sys/dls.h> 33 #include <sys/dls_impl.h> 34 #include <sys/mac_soft_ring.h> 35 #include <sys/ethernet.h> 36 #include <sys/vlan.h> 37 #include <inet/ip.h> 38 #include <inet/ip6.h> 39 #include <netinet/tcp.h> 40 #include <netinet/udp.h> 41 #include <netinet/sctp.h> 42 43 /* global flow table, will be a per exclusive-zone table later */ 44 static mod_hash_t *flow_hash; 45 static krwlock_t flow_tab_lock; 46 47 static kmem_cache_t *flow_cache; 48 static kmem_cache_t *flow_tab_cache; 49 static flow_ops_t flow_l2_ops; 50 51 typedef struct { 52 const char *fs_name; 53 uint_t fs_offset; 54 } flow_stats_info_t; 55 56 #define FS_OFF(f) (offsetof(flow_stats_t, f)) 57 static flow_stats_info_t flow_stats_list[] = { 58 {"rbytes", FS_OFF(fs_rbytes)}, 59 {"ipackets", FS_OFF(fs_ipackets)}, 60 {"ierrors", FS_OFF(fs_ierrors)}, 61 {"obytes", FS_OFF(fs_obytes)}, 62 {"opackets", 
FS_OFF(fs_opackets)}, 63 {"oerrors", FS_OFF(fs_oerrors)} 64 }; 65 #define FS_SIZE (sizeof (flow_stats_list) / sizeof (flow_stats_info_t)) 66 67 /* 68 * Checks whether a flow mask is legal. 69 */ 70 static flow_tab_info_t *mac_flow_tab_info_get(flow_mask_t); 71 72 static void 73 flow_stat_init(kstat_named_t *knp) 74 { 75 int i; 76 77 for (i = 0; i < FS_SIZE; i++, knp++) { 78 kstat_named_init(knp, flow_stats_list[i].fs_name, 79 KSTAT_DATA_UINT64); 80 } 81 } 82 83 static int 84 flow_stat_update(kstat_t *ksp, int rw) 85 { 86 flow_entry_t *fep = ksp->ks_private; 87 flow_stats_t *fsp = &fep->fe_flowstats; 88 kstat_named_t *knp = ksp->ks_data; 89 uint64_t *statp; 90 zoneid_t zid; 91 int i; 92 93 if (rw != KSTAT_READ) 94 return (EACCES); 95 96 zid = getzoneid(); 97 if (zid != GLOBAL_ZONEID && zid != fep->fe_zoneid) { 98 for (i = 0; i < FS_SIZE; i++, knp++) 99 knp->value.ui64 = 0; 100 101 return (0); 102 } 103 104 for (i = 0; i < FS_SIZE; i++, knp++) { 105 statp = (uint64_t *) 106 ((uchar_t *)fsp + flow_stats_list[i].fs_offset); 107 108 knp->value.ui64 = *statp; 109 } 110 return (0); 111 } 112 113 static void 114 flow_stat_create(flow_entry_t *fep) 115 { 116 kstat_t *ksp; 117 kstat_named_t *knp; 118 uint_t nstats = FS_SIZE; 119 120 ksp = kstat_create("unix", 0, (char *)fep->fe_flow_name, "flow", 121 KSTAT_TYPE_NAMED, nstats, 0); 122 if (ksp == NULL) 123 return; 124 125 ksp->ks_update = flow_stat_update; 126 ksp->ks_private = fep; 127 fep->fe_ksp = ksp; 128 129 knp = (kstat_named_t *)ksp->ks_data; 130 flow_stat_init(knp); 131 kstat_install(ksp); 132 } 133 134 void 135 flow_stat_destroy(flow_entry_t *fep) 136 { 137 if (fep->fe_ksp != NULL) { 138 kstat_delete(fep->fe_ksp); 139 fep->fe_ksp = NULL; 140 } 141 } 142 143 /* 144 * Initialize the flow table 145 */ 146 void 147 mac_flow_init() 148 { 149 flow_cache = kmem_cache_create("flow_entry_cache", 150 sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 151 flow_tab_cache = kmem_cache_create("flow_tab_cache", 152 sizeof 
(flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 153 flow_hash = mod_hash_create_extended("flow_hash", 154 100, mod_hash_null_keydtor, mod_hash_null_valdtor, 155 mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); 156 rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL); 157 } 158 159 /* 160 * Cleanup and release the flow table 161 */ 162 void 163 mac_flow_fini() 164 { 165 kmem_cache_destroy(flow_cache); 166 kmem_cache_destroy(flow_tab_cache); 167 mod_hash_destroy_hash(flow_hash); 168 rw_destroy(&flow_tab_lock); 169 } 170 171 /* 172 * mac_create_flow(): create a flow_entry_t. 173 */ 174 int 175 mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name, 176 void *client_cookie, uint_t type, flow_entry_t **flentp) 177 { 178 flow_entry_t *flent = *flentp; 179 int err = 0; 180 181 if (mrp != NULL) { 182 err = mac_validate_props(mrp); 183 if (err != 0) 184 return (err); 185 } 186 187 if (flent == NULL) { 188 flent = kmem_cache_alloc(flow_cache, KM_SLEEP); 189 bzero(flent, sizeof (*flent)); 190 mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL); 191 cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL); 192 193 /* Initialize the receiver function to a safe routine */ 194 flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; 195 flent->fe_index = -1; 196 } 197 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAME); 198 199 /* This is an initial flow, will be configured later */ 200 if (fd == NULL) { 201 *flentp = flent; 202 return (0); 203 } 204 205 flent->fe_client_cookie = client_cookie; 206 flent->fe_type = type; 207 208 /* 209 * As flow creation is only allowed in global zone, this will 210 * always set fe_zoneid to GLOBAL_ZONEID, and dls_add_flow() will 211 * later set the right value. 212 */ 213 flent->fe_zoneid = getzoneid(); 214 215 /* Save flow desc */ 216 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); 217 218 if (mrp != NULL) { 219 /* 220 * We have already set fe_resource_props for a Link. 
221 */ 222 if (type & FLOW_USER) { 223 bcopy(mrp, &flent->fe_resource_props, 224 sizeof (mac_resource_props_t)); 225 } 226 /* 227 * The effective resource list should reflect the priority 228 * that we set implicitly. 229 */ 230 if (!(mrp->mrp_mask & MRP_PRIORITY)) 231 mrp->mrp_mask |= MRP_PRIORITY; 232 if (type & FLOW_USER) 233 mrp->mrp_priority = MPL_SUBFLOW_DEFAULT; 234 else 235 mrp->mrp_priority = MPL_LINK_DEFAULT; 236 bcopy(mrp, &flent->fe_effective_props, 237 sizeof (mac_resource_props_t)); 238 } 239 flow_stat_create(flent); 240 241 *flentp = flent; 242 return (0); 243 } 244 245 /* 246 * Validate flow entry and add it to a flow table. 247 */ 248 int 249 mac_flow_add(flow_tab_t *ft, flow_entry_t *flent) 250 { 251 flow_entry_t **headp, **p; 252 flow_ops_t *ops = &ft->ft_ops; 253 flow_mask_t mask; 254 uint32_t index; 255 int err; 256 257 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 258 259 /* 260 * Check for invalid bits in mask. 261 */ 262 mask = flent->fe_flow_desc.fd_mask; 263 if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0) 264 return (EOPNOTSUPP); 265 266 /* 267 * Validate flent. 268 */ 269 if ((err = ops->fo_accept_fe(ft, flent)) != 0) { 270 DTRACE_PROBE3(accept_failed, flow_tab_t *, ft, 271 flow_entry_t *, flent, int, err); 272 return (err); 273 } 274 275 /* 276 * Flent is valid. now calculate hash and insert it 277 * into hash table. 278 */ 279 index = ops->fo_hash_fe(ft, flent); 280 281 /* 282 * We do not need a lock up until now because we were 283 * not accessing the flow table. 284 */ 285 rw_enter(&ft->ft_lock, RW_WRITER); 286 headp = &ft->ft_table[index]; 287 288 /* 289 * Check for duplicate flow. 
290 */ 291 for (p = headp; *p != NULL; p = &(*p)->fe_next) { 292 if ((*p)->fe_flow_desc.fd_mask != 293 flent->fe_flow_desc.fd_mask) 294 continue; 295 296 if (ft->ft_ops.fo_match_fe(ft, *p, flent)) { 297 rw_exit(&ft->ft_lock); 298 DTRACE_PROBE3(dup_flow, flow_tab_t *, ft, 299 flow_entry_t *, flent, int, err); 300 return (EALREADY); 301 } 302 } 303 304 /* 305 * Insert flow to hash list. 306 */ 307 err = ops->fo_insert_fe(ft, headp, flent); 308 if (err != 0) { 309 rw_exit(&ft->ft_lock); 310 DTRACE_PROBE3(insert_failed, flow_tab_t *, ft, 311 flow_entry_t *, flent, int, err); 312 return (err); 313 } 314 315 /* 316 * Save the hash index so it can be used by mac_flow_remove(). 317 */ 318 flent->fe_index = (int)index; 319 320 /* 321 * Save the flow tab back reference. 322 */ 323 flent->fe_flow_tab = ft; 324 FLOW_MARK(flent, FE_FLOW_TAB); 325 ft->ft_flow_count++; 326 rw_exit(&ft->ft_lock); 327 return (0); 328 } 329 330 /* 331 * Remove a flow from a mac client's subflow table 332 */ 333 void 334 mac_flow_rem_subflow(flow_entry_t *flent) 335 { 336 flow_tab_t *ft = flent->fe_flow_tab; 337 mac_client_impl_t *mcip = ft->ft_mcip; 338 339 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 340 341 mac_flow_remove(ft, flent, B_FALSE); 342 if (flent->fe_mcip == NULL) { 343 /* 344 * The interface is not yet plumbed and mac_client_flow_add 345 * was not done. 346 */ 347 if (FLOW_TAB_EMPTY(ft)) { 348 mac_flow_tab_destroy(ft); 349 mcip->mci_subflow_tab = NULL; 350 } 351 return; 352 } 353 mac_flow_wait(flent, FLOW_DRIVER_UPCALL); 354 mac_link_flow_clean((mac_client_handle_t)mcip, flent); 355 } 356 357 /* 358 * Add a flow to a mac client's subflow table and instantiate the flow 359 * in the mac by creating the associated SRSs etc. 
360 */ 361 int 362 mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent, 363 boolean_t instantiate_flow) 364 { 365 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 366 flow_tab_info_t *ftinfo; 367 flow_mask_t mask; 368 flow_tab_t *ft; 369 int err; 370 boolean_t ft_created = B_FALSE; 371 372 ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); 373 374 /* 375 * If the subflow table exists already just add the new subflow 376 * to the existing table, else we create a new subflow table below. 377 */ 378 ft = mcip->mci_subflow_tab; 379 if (ft == NULL) { 380 mask = flent->fe_flow_desc.fd_mask; 381 /* 382 * Try to create a new table and then add the subflow to the 383 * newly created subflow table 384 */ 385 if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) 386 return (EOPNOTSUPP); 387 388 mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size, 389 mcip->mci_mip, &ft); 390 ft_created = B_TRUE; 391 } 392 393 err = mac_flow_add(ft, flent); 394 if (err != 0) { 395 if (ft_created) 396 mac_flow_tab_destroy(ft); 397 return (err); 398 } 399 400 if (instantiate_flow) { 401 /* Now activate the flow by creating its SRSs */ 402 ASSERT(MCIP_DATAPATH_SETUP(mcip)); 403 err = mac_link_flow_init((mac_client_handle_t)mcip, flent); 404 if (err != 0) { 405 mac_flow_remove(ft, flent, B_FALSE); 406 if (ft_created) 407 mac_flow_tab_destroy(ft); 408 return (err); 409 } 410 } else { 411 FLOW_MARK(flent, FE_UF_NO_DATAPATH); 412 } 413 if (ft_created) { 414 ASSERT(mcip->mci_subflow_tab == NULL); 415 ft->ft_mcip = mcip; 416 mcip->mci_subflow_tab = ft; 417 if (instantiate_flow) 418 mac_client_update_classifier(mcip, B_TRUE); 419 } 420 return (0); 421 } 422 423 /* 424 * Remove flow entry from flow table. 
425 */ 426 void 427 mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp) 428 { 429 flow_entry_t **fp; 430 431 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 432 if (!(flent->fe_flags & FE_FLOW_TAB)) 433 return; 434 435 rw_enter(&ft->ft_lock, RW_WRITER); 436 /* 437 * If this is a permanent removal from the flow table, mark it 438 * CONDEMNED to prevent future references. If this is a temporary 439 * removal from the table, say to update the flow descriptor then 440 * we don't mark it CONDEMNED 441 */ 442 if (!temp) 443 FLOW_MARK(flent, FE_CONDEMNED); 444 /* 445 * Locate the specified flent. 446 */ 447 fp = &ft->ft_table[flent->fe_index]; 448 while (*fp != flent) 449 fp = &(*fp)->fe_next; 450 451 /* 452 * The flent must exist. Otherwise it's a bug. 453 */ 454 ASSERT(fp != NULL); 455 *fp = flent->fe_next; 456 flent->fe_next = NULL; 457 458 /* 459 * Reset fe_index to -1 so any attempt to call mac_flow_remove() 460 * on a flent that is supposed to be in the table (FE_FLOW_TAB) 461 * will panic. 462 */ 463 flent->fe_index = -1; 464 FLOW_UNMARK(flent, FE_FLOW_TAB); 465 ft->ft_flow_count--; 466 rw_exit(&ft->ft_lock); 467 } 468 469 /* 470 * This is the flow lookup routine used by the mac sw classifier engine. 471 */ 472 int 473 mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp) 474 { 475 flow_state_t s; 476 flow_entry_t *flent; 477 flow_ops_t *ops = &ft->ft_ops; 478 boolean_t retried = B_FALSE; 479 int i, err; 480 481 s.fs_flags = flags; 482 s.fs_mp = mp; 483 retry: 484 485 /* 486 * Walk the list of predeclared accept functions. 487 * Each of these would accumulate enough state to allow the next 488 * accept routine to make progress. 489 */ 490 for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) { 491 if ((err = (ops->fo_accept[i])(ft, &s)) != 0) { 492 /* 493 * ENOBUFS indicates that the mp could be too short 494 * and may need a pullup. 
495 */ 496 if (err != ENOBUFS || retried) 497 return (err); 498 499 /* 500 * Don't modify the mblk if there are references to it. 501 * Also, there is no point pulling up if b_cont is NULL. 502 */ 503 if (DB_REF(mp) > 1 || mp->b_cont == NULL || 504 pullupmsg(mp, -1) == 0) 505 return (EINVAL); 506 507 retried = B_TRUE; 508 DTRACE_PROBE2(need_pullup, flow_tab_t *, ft, 509 flow_state_t *, &s); 510 goto retry; 511 } 512 } 513 514 /* 515 * The packet is considered sane. We may now attempt to 516 * find the corresponding flent. 517 */ 518 rw_enter(&ft->ft_lock, RW_READER); 519 flent = ft->ft_table[ops->fo_hash(ft, &s)]; 520 for (; flent != NULL; flent = flent->fe_next) { 521 if (flent->fe_match(ft, flent, &s)) { 522 FLOW_TRY_REFHOLD(flent, err); 523 if (err != 0) 524 continue; 525 *flentp = flent; 526 rw_exit(&ft->ft_lock); 527 return (0); 528 } 529 } 530 rw_exit(&ft->ft_lock); 531 return (ENOENT); 532 } 533 534 /* 535 * Walk flow table. 536 * The caller is assumed to have proper perimeter protection. 537 */ 538 int 539 mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *), 540 void *arg) 541 { 542 int err, i, cnt = 0; 543 flow_entry_t *flent; 544 545 if (ft == NULL) 546 return (0); 547 548 for (i = 0; i < ft->ft_size; i++) { 549 for (flent = ft->ft_table[i]; flent != NULL; 550 flent = flent->fe_next) { 551 cnt++; 552 err = (*fn)(flent, arg); 553 if (err != 0) 554 return (err); 555 } 556 } 557 VERIFY(cnt == ft->ft_flow_count); 558 return (0); 559 } 560 561 /* 562 * Same as the above except a mutex is used for protection here. 563 */ 564 int 565 mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *), 566 void *arg) 567 { 568 int err; 569 570 if (ft == NULL) 571 return (0); 572 573 rw_enter(&ft->ft_lock, RW_WRITER); 574 err = mac_flow_walk_nolock(ft, fn, arg); 575 rw_exit(&ft->ft_lock); 576 return (err); 577 } 578 579 static boolean_t mac_flow_clean(flow_entry_t *); 580 581 /* 582 * Destroy a flow entry. 
Called when the last reference on a flow is released. 583 */ 584 void 585 mac_flow_destroy(flow_entry_t *flent) 586 { 587 ASSERT(flent->fe_refcnt == 0); 588 589 if ((flent->fe_type & FLOW_USER) != 0) { 590 ASSERT(mac_flow_clean(flent)); 591 } else { 592 mac_flow_cleanup(flent); 593 } 594 595 mutex_destroy(&flent->fe_lock); 596 cv_destroy(&flent->fe_cv); 597 flow_stat_destroy(flent); 598 kmem_cache_free(flow_cache, flent); 599 } 600 601 /* 602 * XXX eric 603 * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and 604 * mac_link_flow_modify() should really be moved/reworked into the 605 * two functions below. This would consolidate all the mac property 606 * checking in one place. I'm leaving this alone for now since it's 607 * out of scope of the new flows work. 608 */ 609 /* ARGSUSED */ 610 uint32_t 611 mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp) 612 { 613 uint32_t changed_mask = 0; 614 mac_resource_props_t *fmrp = &flent->fe_effective_props; 615 int i; 616 617 if ((mrp->mrp_mask & MRP_MAXBW) != 0 && 618 (fmrp->mrp_maxbw != mrp->mrp_maxbw)) { 619 changed_mask |= MRP_MAXBW; 620 fmrp->mrp_maxbw = mrp->mrp_maxbw; 621 if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) { 622 fmrp->mrp_mask &= ~MRP_MAXBW; 623 } else { 624 fmrp->mrp_mask |= MRP_MAXBW; 625 } 626 } 627 628 if ((mrp->mrp_mask & MRP_PRIORITY) != 0) { 629 if (fmrp->mrp_priority != mrp->mrp_priority) 630 changed_mask |= MRP_PRIORITY; 631 if (mrp->mrp_priority == MPL_RESET) { 632 fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT; 633 fmrp->mrp_mask &= ~MRP_PRIORITY; 634 } else { 635 fmrp->mrp_priority = mrp->mrp_priority; 636 fmrp->mrp_mask |= MRP_PRIORITY; 637 } 638 } 639 640 /* modify fanout */ 641 if ((mrp->mrp_mask & MRP_CPUS) != 0) { 642 if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) && 643 (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) { 644 for (i = 0; i < mrp->mrp_ncpus; i++) { 645 if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i]) 646 break; 647 } 648 if (i == mrp->mrp_ncpus) { 649 /* 650 * The new set of 
cpus passed is exactly 651 * the same as the existing set. 652 */ 653 return (changed_mask); 654 } 655 } 656 changed_mask |= MRP_CPUS; 657 MAC_COPY_CPUS(mrp, fmrp); 658 } 659 return (changed_mask); 660 } 661 662 void 663 mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp) 664 { 665 uint32_t changed_mask; 666 mac_client_impl_t *mcip = flent->fe_mcip; 667 mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip); 668 669 ASSERT(flent != NULL); 670 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 671 672 rw_enter(&ft->ft_lock, RW_WRITER); 673 674 /* Update the cached values inside the subflow entry */ 675 changed_mask = mac_flow_modify_props(flent, mrp); 676 rw_exit(&ft->ft_lock); 677 /* 678 * Push the changed parameters to the scheduling code in the 679 * SRS's, to take effect right away. 680 */ 681 if (changed_mask & MRP_MAXBW) { 682 mac_srs_update_bwlimit(flent, mrp); 683 /* 684 * If bandwidth is changed, we may have to change 685 * the number of soft ring to be used for fanout. 686 * Call mac_flow_update_fanout() if MAC_BIND_CPU 687 * is not set and there is no user supplied cpu 688 * info. This applies only to link at this time. 689 */ 690 if (!(flent->fe_type & FLOW_USER) && 691 !(changed_mask & MRP_CPUS) && 692 !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) { 693 mac_fanout_setup(mcip, flent, mcip_mrp, 694 mac_rx_deliver, mcip, NULL); 695 } 696 } 697 if (mrp->mrp_mask & MRP_PRIORITY) 698 mac_flow_update_priority(mcip, flent); 699 700 if (changed_mask & MRP_CPUS) 701 mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL); 702 } 703 704 /* 705 * This function waits for a certain condition to be met and is generally 706 * used before a destructive or quiescing operation. 
707 */ 708 void 709 mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event) 710 { 711 mutex_enter(&flent->fe_lock); 712 flent->fe_flags |= FE_WAITER; 713 714 switch (event) { 715 case FLOW_DRIVER_UPCALL: 716 /* 717 * We want to make sure the driver upcalls have finished before 718 * we signal the Rx SRS worker to quit. 719 */ 720 while (flent->fe_refcnt != 1) 721 cv_wait(&flent->fe_cv, &flent->fe_lock); 722 break; 723 724 case FLOW_USER_REF: 725 /* 726 * Wait for the fe_user_refcnt to drop to 0. The flow has 727 * been removed from the global flow hash. 728 */ 729 ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH)); 730 while (flent->fe_user_refcnt != 0) 731 cv_wait(&flent->fe_cv, &flent->fe_lock); 732 break; 733 734 default: 735 ASSERT(0); 736 } 737 738 flent->fe_flags &= ~FE_WAITER; 739 mutex_exit(&flent->fe_lock); 740 } 741 742 static boolean_t 743 mac_flow_clean(flow_entry_t *flent) 744 { 745 ASSERT(flent->fe_next == NULL); 746 ASSERT(flent->fe_tx_srs == NULL); 747 ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL); 748 ASSERT(flent->fe_mbg == NULL); 749 750 return (B_TRUE); 751 } 752 753 void 754 mac_flow_cleanup(flow_entry_t *flent) 755 { 756 if ((flent->fe_type & FLOW_USER) == 0) { 757 ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) || 758 (flent->fe_mbg != NULL && flent->fe_mcip == NULL)); 759 ASSERT(flent->fe_refcnt == 0); 760 } else { 761 ASSERT(flent->fe_refcnt == 1); 762 } 763 764 if (flent->fe_mbg != NULL) { 765 ASSERT(flent->fe_tx_srs == NULL); 766 /* This is a multicast or broadcast flow entry */ 767 mac_bcast_grp_free(flent->fe_mbg); 768 flent->fe_mbg = NULL; 769 } 770 771 if (flent->fe_tx_srs != NULL) { 772 ASSERT(flent->fe_mbg == NULL); 773 mac_srs_free(flent->fe_tx_srs); 774 flent->fe_tx_srs = NULL; 775 } 776 777 /* 778 * In the normal case fe_rx_srs_cnt is 1. However in the error case 779 * when mac_unicast_add fails we may not have set up any SRS 780 * in which case fe_rx_srs_cnt will be zero. 
781 */ 782 if (flent->fe_rx_srs_cnt != 0) { 783 ASSERT(flent->fe_rx_srs_cnt == 1); 784 mac_srs_free(flent->fe_rx_srs[0]); 785 flent->fe_rx_srs[0] = NULL; 786 flent->fe_rx_srs_cnt = 0; 787 } 788 ASSERT(flent->fe_rx_srs[0] == NULL); 789 } 790 791 void 792 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd) 793 { 794 /* 795 * Grab the fe_lock to see a self-consistent fe_flow_desc. 796 * Updates to the fe_flow_desc happen under the fe_lock 797 * after removing the flent from the flow table 798 */ 799 mutex_enter(&flent->fe_lock); 800 bcopy(&flent->fe_flow_desc, fd, sizeof (*fd)); 801 mutex_exit(&flent->fe_lock); 802 } 803 804 /* 805 * Update a field of a flow entry. The mac perimeter ensures that 806 * this is the only thread doing a modify operation on this mac end point. 807 * So the flow table can't change or disappear. The ft_lock protects access 808 * to the flow entry, and holding the lock ensures that there isn't any thread 809 * accessing the flow entry or attempting a flow table lookup. However 810 * data threads that are using the flow entry based on the old descriptor 811 * will continue to use the flow entry. If strong coherence is required 812 * then the flow will have to be quiesced before the descriptor can be 813 * changed. 814 */ 815 void 816 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd) 817 { 818 flow_tab_t *ft = flent->fe_flow_tab; 819 flow_desc_t old_desc; 820 int err; 821 822 if (ft == NULL) { 823 /* 824 * The flow hasn't yet been inserted into the table, 825 * so only the caller knows about this flow, however for 826 * uniformity we grab the fe_lock here. 827 */ 828 mutex_enter(&flent->fe_lock); 829 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); 830 mutex_exit(&flent->fe_lock); 831 } 832 833 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 834 835 /* 836 * Need to remove the flow entry from the table and reinsert it, 837 * into a potentially diference hash line. The hash depends on 838 * the new descriptor fields. 
However access to fe_desc itself 839 * is always under the fe_lock. This helps log and stat functions 840 * see a self-consistent fe_flow_desc. 841 */ 842 mac_flow_remove(ft, flent, B_TRUE); 843 old_desc = flent->fe_flow_desc; 844 845 mutex_enter(&flent->fe_lock); 846 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); 847 mutex_exit(&flent->fe_lock); 848 849 if (mac_flow_add(ft, flent) != 0) { 850 /* 851 * The add failed say due to an invalid flow descriptor. 852 * Undo the update 853 */ 854 flent->fe_flow_desc = old_desc; 855 err = mac_flow_add(ft, flent); 856 ASSERT(err == 0); 857 } 858 } 859 860 void 861 mac_flow_set_name(flow_entry_t *flent, const char *name) 862 { 863 flow_tab_t *ft = flent->fe_flow_tab; 864 865 if (ft == NULL) { 866 /* 867 * The flow hasn't yet been inserted into the table, 868 * so only the caller knows about this flow 869 */ 870 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAME); 871 } else { 872 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 873 } 874 875 mutex_enter(&flent->fe_lock); 876 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAME); 877 mutex_exit(&flent->fe_lock); 878 } 879 880 /* 881 * Return the client-private cookie that was associated with 882 * the flow when it was created. 883 */ 884 void * 885 mac_flow_get_client_cookie(flow_entry_t *flent) 886 { 887 return (flent->fe_client_cookie); 888 } 889 890 /* 891 * Forward declarations. 892 */ 893 static uint32_t flow_l2_hash(flow_tab_t *, flow_state_t *); 894 static int flow_l2_accept(flow_tab_t *, flow_state_t *); 895 static uint32_t flow_ether_hash(flow_tab_t *, flow_state_t *); 896 static int flow_ether_accept(flow_tab_t *, flow_state_t *); 897 898 /* 899 * Create flow table. 
900 */ 901 void 902 mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size, 903 mac_impl_t *mip, flow_tab_t **ftp) 904 { 905 flow_tab_t *ft; 906 flow_ops_t *new_ops; 907 908 ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP); 909 bzero(ft, sizeof (*ft)); 910 911 ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP); 912 913 /* 914 * We make a copy of the ops vector instead of just pointing to it 915 * because we might want to customize the ops vector on a per table 916 * basis (e.g. for optimization). 917 */ 918 new_ops = &ft->ft_ops; 919 bcopy(ops, new_ops, sizeof (*ops)); 920 ft->ft_mask = mask; 921 ft->ft_size = size; 922 ft->ft_mip = mip; 923 924 /* 925 * Optimization for DL_ETHER media. 926 */ 927 if (mip->mi_info.mi_nativemedia == DL_ETHER) { 928 if (new_ops->fo_hash == flow_l2_hash) 929 new_ops->fo_hash = flow_ether_hash; 930 931 if (new_ops->fo_accept[0] == flow_l2_accept) 932 new_ops->fo_accept[0] = flow_ether_accept; 933 934 } 935 *ftp = ft; 936 } 937 938 void 939 mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp) 940 { 941 mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID, 942 1024, mip, ftp); 943 } 944 945 /* 946 * Destroy flow table. 
947 */ 948 void 949 mac_flow_tab_destroy(flow_tab_t *ft) 950 { 951 if (ft == NULL) 952 return; 953 954 ASSERT(ft->ft_flow_count == 0); 955 kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *)); 956 bzero(ft, sizeof (*ft)); 957 kmem_cache_free(flow_tab_cache, ft); 958 } 959 960 /* 961 * Add a new flow entry to the global flow hash table 962 */ 963 int 964 mac_flow_hash_add(flow_entry_t *flent) 965 { 966 int err; 967 968 rw_enter(&flow_tab_lock, RW_WRITER); 969 err = mod_hash_insert(flow_hash, 970 (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent); 971 if (err != 0) { 972 rw_exit(&flow_tab_lock); 973 return (EEXIST); 974 } 975 /* Mark as inserted into the global flow hash table */ 976 FLOW_MARK(flent, FE_G_FLOW_HASH); 977 rw_exit(&flow_tab_lock); 978 return (err); 979 } 980 981 /* 982 * Remove a flow entry from the global flow hash table 983 */ 984 void 985 mac_flow_hash_remove(flow_entry_t *flent) 986 { 987 mod_hash_val_t val; 988 989 rw_enter(&flow_tab_lock, RW_WRITER); 990 VERIFY(mod_hash_remove(flow_hash, 991 (mod_hash_key_t)flent->fe_flow_name, &val) == 0); 992 993 /* Clear the mark that says inserted into the global flow hash table */ 994 FLOW_UNMARK(flent, FE_G_FLOW_HASH); 995 rw_exit(&flow_tab_lock); 996 } 997 998 /* 999 * Retrieve a flow entry from the global flow hash table. 1000 */ 1001 int 1002 mac_flow_lookup_byname(char *name, flow_entry_t **flentp) 1003 { 1004 int err; 1005 flow_entry_t *flent; 1006 1007 rw_enter(&flow_tab_lock, RW_READER); 1008 err = mod_hash_find(flow_hash, (mod_hash_key_t)name, 1009 (mod_hash_val_t *)&flent); 1010 if (err != 0) { 1011 rw_exit(&flow_tab_lock); 1012 return (ENOENT); 1013 } 1014 ASSERT(flent != NULL); 1015 FLOW_USER_REFHOLD(flent); 1016 rw_exit(&flow_tab_lock); 1017 1018 *flentp = flent; 1019 return (0); 1020 } 1021 1022 /* 1023 * Initialize or release mac client flows by walking the subflow table. 1024 * These are typically invoked during plumb/unplumb of links. 
1025 */ 1026 1027 static int 1028 mac_link_init_flows_cb(flow_entry_t *flent, void *arg) 1029 { 1030 mac_client_impl_t *mcip = arg; 1031 1032 if (mac_link_flow_init(arg, flent) != 0) { 1033 cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'", 1034 flent->fe_flow_name, mcip->mci_name); 1035 } else { 1036 FLOW_UNMARK(flent, FE_UF_NO_DATAPATH); 1037 } 1038 return (0); 1039 } 1040 1041 void 1042 mac_link_init_flows(mac_client_handle_t mch) 1043 { 1044 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1045 1046 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab, 1047 mac_link_init_flows_cb, mcip); 1048 /* 1049 * If mac client had subflow(s) configured before plumb, change 1050 * function to mac_rx_srs_subflow_process and in case of hardware 1051 * classification, disable polling. 1052 */ 1053 mac_client_update_classifier(mcip, B_TRUE); 1054 1055 } 1056 1057 boolean_t 1058 mac_link_has_flows(mac_client_handle_t mch) 1059 { 1060 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1061 1062 if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab)) 1063 return (B_TRUE); 1064 1065 return (B_FALSE); 1066 } 1067 1068 static int 1069 mac_link_release_flows_cb(flow_entry_t *flent, void *arg) 1070 { 1071 FLOW_MARK(flent, FE_UF_NO_DATAPATH); 1072 mac_flow_wait(flent, FLOW_DRIVER_UPCALL); 1073 mac_link_flow_clean(arg, flent); 1074 return (0); 1075 } 1076 1077 void 1078 mac_link_release_flows(mac_client_handle_t mch) 1079 { 1080 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1081 1082 /* 1083 * Change the mci_flent callback back to mac_rx_srs_process() 1084 * because flows are about to be deactivated. 
 */
	mac_client_update_classifier(mcip, B_FALSE);
	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_release_flows_cb, mcip);
}

/*
 * Renames a flow. If the flow currently has kstats, they are torn down
 * and recreated so the kstat name tracks the new flow name.
 */
void
mac_rename_flow(flow_entry_t *fep, const char *new_name)
{
	mac_flow_set_name(fep, new_name);
	if (fep->fe_ksp != NULL) {
		/* kstats are keyed by flow name; rebuild under new name */
		flow_stat_destroy(fep);
		flow_stat_create(fep);
	}
}

/*
 * mac_link_flow_init()
 * Internal flow interface used for allocating SRSs and related
 * data structures. Not meant to be used by mac clients.
 * Returns 0 on success, or the error from mac_datapath_setup().
 */
int
mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_impl_t		*mip = mcip->mci_mip;
	int			err;

	ASSERT(mch != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
		return (err);

	/*
	 * A non-NULL fe_mcip marks the sub-flow as fully initialized;
	 * mac_link_flow_clean() checks this before tearing down.
	 */
	sub_flow->fe_mcip = mcip;

	return (0);
}

/*
 * mac_link_flow_add()
 * Used by flowadm(1m) or kernel mac clients for creating flows.
 * Creates the flow entry, inserts it in the global flow hash and in the
 * link's subflow table. Returns EEXIST if the name is already taken,
 * ENOTSUP for anchor VNICs, or the error from any failed step.
 */
int
mac_link_flow_add(datalink_id_t linkid, char *flow_name,
    flow_desc_t *flow_desc, mac_resource_props_t *mrp)
{
	flow_entry_t		*flent = NULL;
	int			err;
	dls_dl_handle_t		dlh;
	dls_link_t		*dlp;
	boolean_t		link_held = B_FALSE;
	boolean_t		hash_added = B_FALSE;
	mac_perim_handle_t	mph;

	/* Reject duplicate flow names up front */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err == 0) {
		FLOW_USER_REFRELE(flent);
		return (EEXIST);
	}

	/*
	 * First create a flow entry given the description provided
	 * by the caller.
	 */
	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
	    FLOW_USER | FLOW_OTHER, &flent);

	if (err != 0)
		return (err);

	/*
	 * We've got a local variable referencing this flow now, so we need
	 * to hold it. We'll release this flow before returning.
	 * All failures until we return will undo any action that may internally
	 * held the flow, so the last REFRELE will assure a clean freeing
	 * of resources.
	 */
	FLOW_REFHOLD(flent);

	flent->fe_link_id = linkid;
	FLOW_MARK(flent, FE_INCIPIENT);

	/* Perimeter must be entered before taking any further references */
	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0) {
		FLOW_FINAL_REFRELE(flent);
		return (err);
	}

	/*
	 * dls will eventually be merged with mac so it's ok
	 * to call dls' internal functions.
	 */
	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
	if (err != 0)
		goto bail;

	link_held = B_TRUE;

	/*
	 * Add the flow to the global flow table, this table will be per
	 * exclusive zone so each zone can have its own flow namespace.
	 * RFE 6625651 will fix this.
	 *
	 */
	if ((err = mac_flow_hash_add(flent)) != 0)
		goto bail;

	hash_added = B_TRUE;

	/*
	 * do not allow flows to be configured on an anchor VNIC
	 */
	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
		err = ENOTSUP;
		goto bail;
	}

	/*
	 * Save the zoneid of the underlying link in the flow entry,
	 * this is needed to prevent non-global zone from getting
	 * statistics information of global zone.
	 */
	flent->fe_zoneid = dlp->dl_zid;

	/*
	 * Add the subflow to the subflow table. Also instantiate the flow
	 * in the mac if there is an active DLS user. The dl_mah is set when
	 * dls_active_set() is called, typically during interface plumb.
	 */
	err = mac_flow_add_subflow(dlp->dl_mch, flent, dlp->dl_mah != NULL);
	if (err != 0)
		goto bail;

	FLOW_UNMARK(flent, FE_INCIPIENT);
	dls_devnet_rele_link(dlh, dlp);
	mac_perim_exit(mph);
	return (0);

bail:
	/* Unwind in reverse order of acquisition */
	if (hash_added)
		mac_flow_hash_remove(flent);

	if (link_held)
		dls_devnet_rele_link(dlh, dlp);

	/*
	 * Wait for any transient global flow hash refs to clear
	 * and then release the creation reference on the flow
	 */
	mac_flow_wait(flent, FLOW_USER_REF);
	FLOW_FINAL_REFRELE(flent);
	mac_perim_exit(mph);
	return (err);
}

/*
 * mac_link_flow_clean()
 * Internal flow interface used for freeing SRSs and related
 * data structures. Not meant to be used by mac clients.
 */
void
mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_impl_t		*mip = mcip->mci_mip;
	boolean_t		last_subflow;

	ASSERT(mch != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/*
	 * This sub flow entry may fail to be fully initialized by
	 * mac_link_flow_init(). If so, simply return.
	 */
	if (sub_flow->fe_mcip == NULL)
		return;

	last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
	/*
	 * Tear down the data path
	 */
	mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
	sub_flow->fe_mcip = NULL;

	/*
	 * Delete the SRSs associated with this subflow. If this is being
	 * driven by flowadm(1M) then the subflow will be deleted by
	 * dls_rem_flow. However if this is a result of the interface being
	 * unplumbed then the subflow itself won't be deleted.
	 */
	mac_flow_cleanup(sub_flow);

	/*
	 * If all the subflows are gone, renable some of the stuff
	 * we disabled when adding a subflow, polling etc.
	 */
	if (last_subflow) {
		/*
		 * The subflow table itself is not protected by any locks or
		 * refcnts. Hence quiesce the client upfront before clearing
		 * mci_subflow_tab.
		 */
		mac_client_quiesce(mcip);
		mac_client_update_classifier(mcip, B_FALSE);
		mac_flow_tab_destroy(mcip->mci_subflow_tab);
		mcip->mci_subflow_tab = NULL;
		mac_client_restart(mcip);
	}
}

/*
 * mac_link_flow_remove()
 * Used by flowadm(1m) or kernel mac clients for removing flows.
 * Returns the error from either flow lookup if the named flow has
 * already disappeared.
 */
int
mac_link_flow_remove(char *flow_name)
{
	flow_entry_t	*flent;
	mac_perim_handle_t	mph;
	int		err;
	datalink_id_t	linkid;

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	linkid = flent->fe_link_id;
	FLOW_USER_REFRELE(flent);

	/*
	 * The perim must be acquired before acquiring any other references
	 * to maintain the lock and perimeter hierarchy. Please note the
	 * FLOW_REFRELE above.
	 */
	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	/*
	 * Note the second lookup of the flow, because a concurrent thread
	 * may have removed it already while we were waiting to enter the
	 * link's perimeter.
	 */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}
	FLOW_USER_REFRELE(flent);

	/*
	 * Remove the flow from the subflow table and deactivate the flow
	 * by quiescing and removings its SRSs
	 */
	mac_flow_rem_subflow(flent);

	/*
	 * Finally, remove the flow from the global table.
	 */
	mac_flow_hash_remove(flent);

	/*
	 * Wait for any transient global flow hash refs to clear
	 * and then release the creation reference on the flow
	 */
	mac_flow_wait(flent, FLOW_USER_REF);
	FLOW_FINAL_REFRELE(flent);

	mac_perim_exit(mph);

	return (0);
}

/*
 * mac_link_flow_modify()
 * Modifies the properties of a flow identified by its name.
 * Validates 'mrp' first, then applies it either through the attached
 * MAC client (live flow) or to the cached properties only.
 */
int
mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
{
	flow_entry_t		*flent;
	mac_client_impl_t	*mcip;
	int			err = 0;
	mac_perim_handle_t	mph;
	datalink_id_t		linkid;
	flow_tab_t		*flow_tab;

	err = mac_validate_props(mrp);
	if (err != 0)
		return (err);

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	linkid = flent->fe_link_id;
	FLOW_USER_REFRELE(flent);

	/*
	 * The perim must be acquired before acquiring any other references
	 * to maintain the lock and perimeter hierarchy. Please note the
	 * FLOW_REFRELE above.
	 */
	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	/*
	 * Note the second lookup of the flow, because a concurrent thread
	 * may have removed it already while we were waiting to enter the
	 * link's perimeter.
	 */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}
	FLOW_USER_REFRELE(flent);

	/*
	 * If this flow is attached to a MAC client, then pass the request
	 * along to the client.
	 * Otherwise, just update the cached values.
	 */
	mcip = flent->fe_mcip;
	mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
	if (mcip != NULL) {
		if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
			err = ENOENT;
		} else {
			mac_flow_modify(flow_tab, flent, mrp);
		}
	} else {
		(void) mac_flow_modify_props(flent, mrp);
	}

	/* NOTE(review): no goto targets this label; it appears vestigial */
done:
	mac_perim_exit(mph);
	return (err);
}


/*
 * State structure and misc functions used by mac_link_flow_walk().
 */
typedef struct {
	/* caller-supplied callback invoked once per flow */
	int	(*ws_func)(mac_flowinfo_t *, void *);
	/* opaque argument passed through to ws_func */
	void	*ws_arg;
} flow_walk_state_t;

/*
 * Copies the externally visible attributes of a flow entry into the
 * mac_flowinfo_t snapshot handed to walk callbacks.
 */
static void
mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent)
{
	(void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name, MAXNAMELEN);
	finfop->fi_link_id = flent->fe_link_id;
	finfop->fi_flow_desc = flent->fe_flow_desc;
	finfop->fi_resource_props = flent->fe_resource_props;
}

/*
 * Per-flow callback for mac_link_flow_walk(): snapshots the flow and
 * forwards to the user's function. Walk stops if it returns non-zero.
 */
static int
mac_link_flow_walk_cb(flow_entry_t *flent, void *arg)
{
	flow_walk_state_t	*statep = arg;
	mac_flowinfo_t		finfo;

	mac_link_flowinfo_copy(&finfo, flent);
	return (statep->ws_func(&finfo, statep->ws_arg));
}

/*
 * mac_link_flow_walk()
 * Invokes callback 'func' for all flows belonging to the specified link.
 */
int
mac_link_flow_walk(datalink_id_t linkid,
    int (*func)(mac_flowinfo_t *, void *), void *arg)
{
	mac_client_impl_t	*mcip;
	mac_perim_handle_t	mph;
	flow_walk_state_t	state;
	dls_dl_handle_t		dlh;
	dls_link_t		*dlp;
	int			err;

	/* Enter the link's perimeter, then hold the link underneath it */
	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}

	mcip = (mac_client_impl_t *)dlp->dl_mch;
	state.ws_func = func;
	state.ws_arg = arg;

	err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_flow_walk_cb, &state);

	dls_devnet_rele_link(dlh, dlp);
	mac_perim_exit(mph);
	return (err);
}

/*
 * mac_link_flow_info()
 * Retrieves information about a specific flow.
 * Returns the lookup error if the named flow does not exist.
 */
int
mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
{
	flow_entry_t	*flent;
	int		err;

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	mac_link_flowinfo_copy(finfo, flent);
	FLOW_USER_REFRELE(flent);
	return (0);
}

/*
 * Hash the low three octets of a MAC address xor'ed with the VLAN id,
 * modulo the table size 's'.
 */
#define	HASH_MAC_VID(a, v, s) \
	((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s))

/* True if the first mblk of the packet ends before 'end' */
#define	PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end))

/*
 * Layer 2 packet-to-flow match: VLAN id and destination MAC must both
 * match the flow descriptor.
 */
/* ARGSUSED */
static boolean_t
flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l2info_t		*l2 = &s->fs_l2info;
	flow_desc_t		*fd = &flent->fe_flow_desc;

	return (l2->l2_vid == fd->fd_vid &&
	    bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
}

/*
 * Layer 2 hash function.
 * Must be paired with flow_l2_accept() within a set of flow_ops
 * because it assumes the dest address is already extracted.
1531 */ 1532 static uint32_t 1533 flow_l2_hash(flow_tab_t *ft, flow_state_t *s) 1534 { 1535 flow_l2info_t *l2 = &s->fs_l2info; 1536 1537 return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size)); 1538 } 1539 1540 /* 1541 * This is the generic layer 2 accept function. 1542 * It makes use of mac_header_info() to extract the header length, 1543 * sap, vlan ID and destination address. 1544 */ 1545 static int 1546 flow_l2_accept(flow_tab_t *ft, flow_state_t *s) 1547 { 1548 boolean_t is_ether; 1549 flow_l2info_t *l2 = &s->fs_l2info; 1550 mac_header_info_t mhi; 1551 int err; 1552 1553 is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER); 1554 if ((err = mac_header_info((mac_handle_t)ft->ft_mip, 1555 s->fs_mp, &mhi)) != 0) { 1556 if (err == EINVAL) 1557 err = ENOBUFS; 1558 1559 return (err); 1560 } 1561 1562 l2->l2_start = s->fs_mp->b_rptr; 1563 l2->l2_daddr = (uint8_t *)mhi.mhi_daddr; 1564 1565 if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN && 1566 ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) { 1567 struct ether_vlan_header *evhp = 1568 (struct ether_vlan_header *)l2->l2_start; 1569 1570 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp))) 1571 return (ENOBUFS); 1572 1573 l2->l2_sap = ntohs(evhp->ether_type); 1574 l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci)); 1575 l2->l2_hdrsize = sizeof (*evhp); 1576 } else { 1577 l2->l2_sap = mhi.mhi_bindsap; 1578 l2->l2_vid = 0; 1579 l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize; 1580 } 1581 return (0); 1582 } 1583 1584 /* 1585 * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/ 1586 * accept(). The notable difference is that dest address is now extracted 1587 * by hash() rather than by accept(). This saves a few memory references 1588 * for flow tables that do not care about mac addresses. 
 */
static uint32_t
flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t			*l2 = &s->fs_l2info;
	struct ether_vlan_header	*evhp;

	/* extract the dest address here (see comment above) */
	evhp = (struct ether_vlan_header *)l2->l2_start;
	l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
	return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
}

/*
 * Ethernet-specific accept: parses the (possibly VLAN-tagged) ethernet
 * header directly rather than going through mac_header_info().
 */
/* ARGSUSED */
static int
flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t			*l2 = &s->fs_l2info;
	struct ether_vlan_header	*evhp;
	uint16_t			sap;

	evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
	l2->l2_start = (uchar_t *)evhp;

	if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
		return (ENOBUFS);

	if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
		/* tagged frame: the larger VLAN header must fit too */
		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
			return (ENOBUFS);

		l2->l2_sap = ntohs(evhp->ether_type);
		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
		l2->l2_hdrsize = sizeof (struct ether_vlan_header);
	} else {
		l2->l2_sap = sap;
		l2->l2_vid = 0;
		l2->l2_hdrsize = sizeof (struct ether_header);
	}
	return (0);
}

/*
 * Validates a layer 2 flow entry.
 * Requires a non-zero destination MAC of at least ETHERADDRL bytes;
 * a VLAN id, if present, must be non-zero and on ethernet media.
 */
static int
flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	int		i;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	/*
	 * Dest address is mandatory.
	 */
	if ((fd->fd_mask & FLOW_LINK_DST) == 0)
		return (EINVAL);

	/* reject an all-zero destination address */
	for (i = 0; i < fd->fd_mac_len; i++) {
		if (fd->fd_dst_mac[i] != 0)
			break;
	}
	if (i == fd->fd_mac_len || fd->fd_mac_len < ETHERADDRL)
		return (EINVAL);

	if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
		/*
		 * VLAN flows are only supported over ethernet macs.
		 */
		if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
			return (EINVAL);

		if (fd->fd_vid == 0)
			return (EINVAL);

	}
	flent->fe_match = flow_l2_match;
	return (0);
}

/*
 * Calculates hash index of flow entry.
 */
static uint32_t
flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
	return (HASH_MAC_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
}

/*
 * This is used for duplicate flow checking.
 */
/* ARGSUSED */
static boolean_t
flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
	return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
	    fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
}

/*
 * Generic flow entry insertion function.
 * Used by flow tables that do not have ordering requirements.
 * New entries are pushed at the head of the list.
 */
/* ARGSUSED */
static int
flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
    flow_entry_t *flent)
{
	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	if (*headp != NULL) {
		ASSERT(flent->fe_next == NULL);
		flent->fe_next = *headp;
	}
	*headp = flent;
	return (0);
}

/*
 * IP version independent DSField matching function.
 */
/* ARGSUSED */
static boolean_t
flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	switch (l3info->l3_version) {
	case IPV4_VERSION: {
		ipha_t	*ipha = (ipha_t *)l3info->l3_start;

		/* compare the masked TOS byte against the flow's dsfield */
		return ((ipha->ipha_type_of_service &
		    fd->fd_dsfield_mask) == fd->fd_dsfield);
	}
	case IPV6_VERSION: {
		ip6_t	*ip6h = (ip6_t *)l3info->l3_start;

		return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
		    fd->fd_dsfield_mask) == fd->fd_dsfield);
	}
	default:
		return (B_FALSE);
	}
}

/*
 * IP v4 and v6 address matching.
 * The netmask only needs to be applied on the packet but not on the
 * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets.
 */

/* ARGSUSED */
static boolean_t
flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;
	ipha_t		*ipha = (ipha_t *)l3info->l3_start;
	in_addr_t	addr;

	/* l3_dst_or_src was set by the paired hash function */
	addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
		return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
		    V4_PART_OF_V6(fd->fd_local_addr));
	}
	return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
	    V4_PART_OF_V6(fd->fd_remote_addr));
}

/* ARGSUSED */
static boolean_t
flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;
	ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
	in6_addr_t	*addrp;

	addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
		return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
		    fd->fd_local_addr));
	}
	return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
}

/*
 * Protocol-only matching (e.g. all TCP, all UDP).
 */
/* ARGSUSED */
static boolean_t
flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (l3info->l3_protocol == fd->fd_protocol);
}

/*
 * Packet hash for IP flow tables. As a side effect records whether the
 * match functions should look at the packet's destination or source
 * address, based on table type and packet direction.
 */
static uint32_t
flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_mask_t	mask = ft->ft_mask;

	if ((mask & FLOW_IP_LOCAL) != 0) {
		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
	} else if ((mask & FLOW_IP_REMOTE) != 0) {
		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
	} else if ((mask & FLOW_IP_DSFIELD) != 0) {
		/*
		 * DSField flents are arranged as a single list.
		 */
		return (0);
	}
	/*
	 * IP addr flents are hashed into two lists, v4 or v6.
	 */
	ASSERT(ft->ft_size >= 2);
	return ((l3info->l3_version == IPV4_VERSION) ?
	    0 : 1);
}

/*
 * Hash an inbound/outbound packet by its IP protocol number.
 */
static uint32_t
flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;

	return (l3info->l3_protocol % ft->ft_size);
}

/*
 * Layer 3 accept: locates the IP header from the layer 2 state and
 * extracts header size, protocol, version and fragmentation status.
 */
/* ARGSUSED */
static int
flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t	*l2info = &s->fs_l2info;
	flow_l3info_t	*l3info = &s->fs_l3info;
	uint16_t	sap = l2info->l2_sap;
	uchar_t		*l3_start;

	l3info->l3_start = l3_start = l2info->l2_start + l2info->l2_hdrsize;
	/* the IP header must be 32-bit aligned to be dereferenced below */
	if (!OK_32PTR(l3_start))
		return (EINVAL);

	switch (sap) {
	case ETHERTYPE_IP: {
		ipha_t	*ipha = (ipha_t *)l3_start;

		if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
			return (ENOBUFS);

		l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
		l3info->l3_protocol = ipha->ipha_protocol;
		l3info->l3_version = IPV4_VERSION;
		l3info->l3_fragmented =
		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
		break;
	}
	case ETHERTYPE_IPV6: {
		ip6_t		*ip6h = (ip6_t *)l3_start;
		uint16_t	ip6_hdrlen;
		uint8_t		nexthdr;

		/* walks any extension headers to find the upper-layer proto */
		if (!mac_ip_hdr_length_v6(s->fs_mp, ip6h, &ip6_hdrlen,
		    &nexthdr)) {
			return (ENOBUFS);
		}
		l3info->l3_hdrsize = ip6_hdrlen;
		l3info->l3_protocol = nexthdr;
		l3info->l3_version = IPV6_VERSION;
		l3info->l3_fragmented = B_FALSE;
		break;
	}
	default:
		return (EINVAL);
	}
	return (0);
}

/*
 * Validates a protocol-only flow entry: only a fixed set of IP
 * protocols is supported.
 */
/* ARGSUSED */
static int
flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	switch (fd->fd_protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		flent->fe_match = flow_ip_proto_match;
		return (0);
	default:
		return (EINVAL);
	}
}

/* ARGSUSED */
static int
flow_ip_accept_fe(flow_tab_t *ft,
    flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;
	flow_mask_t	mask;
	uint8_t		version;
	in6_addr_t	*addr, *netmask;

	/*
	 * DSField does not require a IP version.
	 */
	if (fd->fd_mask == FLOW_IP_DSFIELD) {
		if (fd->fd_dsfield_mask == 0)
			return (EINVAL);

		flent->fe_match = flow_ip_dsfield_match;
		return (0);
	}

	/*
	 * IP addresses must come with a version to avoid ambiguity.
	 */
	if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
		return (EINVAL);

	version = fd->fd_ipversion;
	if (version != IPV4_VERSION && version != IPV6_VERSION)
		return (EINVAL);

	mask = fd->fd_mask & ~FLOW_IP_VERSION;
	switch (mask) {
	case FLOW_IP_LOCAL:
		addr = &fd->fd_local_addr;
		netmask = &fd->fd_local_netmask;
		break;
	case FLOW_IP_REMOTE:
		addr = &fd->fd_remote_addr;
		netmask = &fd->fd_remote_netmask;
		break;
	default:
		return (EINVAL);
	}

	/*
	 * Apply netmask onto specified address.
	 * Note: this premasks the descriptor in place, which is what the
	 * match functions above rely on.
	 */
	V6_MASK_COPY(*addr, *netmask, *addr);
	if (version == IPV4_VERSION) {
		ipaddr_t	v4addr = V4_PART_OF_V6((*addr));
		ipaddr_t	v4mask = V4_PART_OF_V6((*netmask));

		if (v4addr == 0 || v4mask == 0)
			return (EINVAL);
		flent->fe_match = flow_ip_v4_match;
	} else {
		if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
		    IN6_IS_ADDR_UNSPECIFIED(netmask))
			return (EINVAL);
		flent->fe_match = flow_ip_v6_match;
	}
	return (0);
}

/*
 * Flow-entry hash by protocol number; mirrors flow_ip_proto_hash().
 */
static uint32_t
flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (fd->fd_protocol % ft->ft_size);
}

/*
 * Flow-entry hash for IP tables; mirrors flow_ip_hash().
 */
static uint32_t
flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	/*
	 * DSField flents are arranged as a single list.
	 */
	if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
		return (0);

	/*
	 * IP addr flents are hashed into two lists, v4 or v6.
	 */
	ASSERT(ft->ft_size >= 2);
	return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
}

/*
 * Duplicate checking for protocol-only flow entries.
 */
/* ARGSUSED */
static boolean_t
flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	return (fd1->fd_protocol == fd2->fd_protocol);
}

/*
 * Duplicate checking for IP address/dsfield flow entries.
 */
/* ARGSUSED */
static boolean_t
flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
	in6_addr_t	*a1, *m1, *a2, *m2;

	ASSERT(fd1->fd_mask == fd2->fd_mask);
	if (fd1->fd_mask == FLOW_IP_DSFIELD) {
		return (fd1->fd_dsfield == fd2->fd_dsfield &&
		    fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
	}

	/*
	 * flow_ip_accept_fe() already validated the version.
	 */
	ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
	if (fd1->fd_ipversion != fd2->fd_ipversion)
		return (B_FALSE);

	switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
	case FLOW_IP_LOCAL:
		a1 = &fd1->fd_local_addr;
		m1 = &fd1->fd_local_netmask;
		a2 = &fd2->fd_local_addr;
		m2 = &fd2->fd_local_netmask;
		break;
	case FLOW_IP_REMOTE:
		a1 = &fd1->fd_remote_addr;
		m1 = &fd1->fd_remote_netmask;
		a2 = &fd2->fd_remote_addr;
		m2 = &fd2->fd_remote_netmask;
		break;
	default:
		/*
		 * This is unreachable given the checks in
		 * flow_ip_accept_fe.
		 */
		return (B_FALSE);
	}

	if (fd1->fd_ipversion == IPV4_VERSION) {
		return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
		    V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));

	} else {
		return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
		    IN6_ARE_ADDR_EQUAL(m1, m2));
	}
}

/*
 * Convert an IPv6 netmask to its prefix length.
 * Scans the mask's 32-bit words from least to most significant,
 * subtracting trailing zero bits from IPV6_ABITS. Assumes a
 * contiguous mask.
 */
static int
flow_ip_mask2plen(in6_addr_t *v6mask)
{
	int	bits;
	int	plen = IPV6_ABITS;
	int	i;

	for (i = 3; i >= 0; i--) {
		if (v6mask->s6_addr32[i] == 0) {
			plen -= 32;
			continue;
		}
		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
		if (bits == 0)
			break;
		plen -= bits;
	}
	return (plen);
}

/*
 * Ordered insertion for IP flow lists: entries are kept in descending
 * prefix-length order so the most specific subnet matches first.
 */
/* ARGSUSED */
static int
flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
    flow_entry_t *flent)
{
	flow_entry_t	**p = headp;
	flow_desc_t	*fd0, *fd;
	in6_addr_t	*m0, *m;
	int		plen0, plen;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * No special ordering needed for dsfield.
	 */
	fd0 = &flent->fe_flow_desc;
	if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
		if (*p != NULL) {
			ASSERT(flent->fe_next == NULL);
			flent->fe_next = *p;
		}
		*p = flent;
		return (0);
	}

	/*
	 * IP address flows are arranged in descending prefix length order.
	 */
	m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
	    &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
	plen0 = flow_ip_mask2plen(m0);
	ASSERT(plen0 != 0);

	for (; *p != NULL; p = &(*p)->fe_next) {
		fd = &(*p)->fe_flow_desc;

		/*
		 * Normally a dsfield flent shouldn't end up on the same
		 * list as an IP address because flow tables are (for now)
		 * disjoint. If we decide to support both IP and dsfield
		 * in the same table in the future, this check will allow
		 * for that.
		 */
		if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
			continue;

		/*
		 * We also allow for the mixing of local and remote address
		 * flents within one list.
		 */
		m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
		    &fd->fd_local_netmask : &fd->fd_remote_netmask;
		plen = flow_ip_mask2plen(m);

		if (plen <= plen0)
			break;
	}
	if (*p != NULL) {
		ASSERT(flent->fe_next == NULL);
		flent->fe_next = *p;
	}
	*p = flent;
	return (0);
}

/*
 * Transport layer protocol and port matching functions.
 */

/* ARGSUSED */
static boolean_t
flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	/* l4_hash_port was selected by flow_transport_hash() */
	return (fd->fd_protocol == l3info->l3_protocol &&
	    fd->fd_local_port == l4info->l4_hash_port);
}

/* ARGSUSED */
static boolean_t
flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (fd->fd_protocol == l3info->l3_protocol &&
	    fd->fd_remote_port == l4info->l4_hash_port);
}

/*
 * Transport hash function.
 * Since we only support either local or remote port flows,
 * we only need to extract one of the ports to be used for
 * matching.
 */
static uint32_t
flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	uint8_t		proto = l3info->l3_protocol;
	boolean_t	dst_or_src;

	/* pick dest vs. source port based on table type and direction */
	if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
		dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
	} else {
		dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
	}

	/* remember the chosen port for the match functions */
	l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
	    l4info->l4_src_port;

	return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
}

/*
 * Unlike other accept() functions above, we do not need to get the header
 * size because this is our highest layer so far. If we want to do support
 * other higher layer protocols, we would need to save the l4_hdrsize
 * in the code below.
 */

/* ARGSUSED */
static int
flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	uint8_t		proto = l3info->l3_protocol;
	uchar_t		*l4_start;

	l4info->l4_start = l4_start = l3info->l3_start + l3info->l3_hdrsize;
	if (!OK_32PTR(l4_start))
		return (EINVAL);

	/* fragments carry no transport header beyond the first fragment */
	if (l3info->l3_fragmented == B_TRUE)
		return (EINVAL);

	switch (proto) {
	case IPPROTO_TCP: {
		struct tcphdr	*tcph = (struct tcphdr *)l4_start;

		if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
			return (ENOBUFS);

		/* ports are kept in network byte order */
		l4info->l4_src_port = tcph->th_sport;
		l4info->l4_dst_port = tcph->th_dport;
		break;
	}
	case IPPROTO_UDP: {
		struct udphdr	*udph = (struct udphdr *)l4_start;

		if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
			return (ENOBUFS);

		l4info->l4_src_port = udph->uh_sport;
		l4info->l4_dst_port = udph->uh_dport;
		break;
	}
	case IPPROTO_SCTP: {
		sctp_hdr_t	*sctph = (sctp_hdr_t *)l4_start;

		if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
			return (ENOBUFS);

		l4info->l4_src_port = sctph->sh_sport;
		l4info->l4_dst_port = sctph->sh_dport;
		break;
	}
	default:
		return (EINVAL);
	}

	return (0);
}

/*
 * Validates transport flow entry.
 * The protocol field must be present.
 */

/* ARGSUSED */
static int
flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;
	flow_mask_t	mask = fd->fd_mask;

	if ((mask & FLOW_IP_PROTOCOL) == 0)
		return (EINVAL);

	switch (fd->fd_protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
		break;
	default:
		return (EINVAL);
	}

	switch (mask & ~FLOW_IP_PROTOCOL) {
	case FLOW_ULP_PORT_LOCAL:
		if (fd->fd_local_port == 0)
			return (EINVAL);

		flent->fe_match = flow_transport_lport_match;
		break;
	case FLOW_ULP_PORT_REMOTE:
		if (fd->fd_remote_port == 0)
			return (EINVAL);

		flent->fe_match = flow_transport_rport_match;
		break;
	case 0:
		/*
		 * transport-only flows conflicts with our table type.
		 */
		return (EOPNOTSUPP);
	default:
		return (EINVAL);
	}

	return (0);
}

/*
 * Flow-entry hash for transport tables; mirrors flow_transport_hash().
 */
static uint32_t
flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;
	uint16_t	port = 0;

	port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
	    fd->fd_local_port : fd->fd_remote_port;

	return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
}

/*
 * Duplicate checking for transport flow entries: same protocol and
 * same (local or remote, per table type) port.
 */
/* ARGSUSED */
static boolean_t
flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	if (fd1->fd_protocol != fd2->fd_protocol)
		return (B_FALSE);

	if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
		return (fd1->fd_local_port == fd2->fd_local_port);

	return (fd1->fd_remote_port == fd2->fd_remote_port);
}

/*
 * Per-table-type operation vectors. The positional initializers are:
 * accept_fe, hash_fe, match_fe, insert_fe, packet hash, then the
 * chain of per-layer packet accept functions.
 */
static flow_ops_t flow_l2_ops = {
	flow_l2_accept_fe,
	flow_l2_hash_fe,
	flow_l2_match_fe,
	flow_generic_insert_fe,
	flow_l2_hash,
	{flow_l2_accept}
};

static flow_ops_t flow_ip_ops = {
	flow_ip_accept_fe,
	flow_ip_hash_fe,
	flow_ip_match_fe,
	flow_ip_insert_fe,
	flow_ip_hash,
	{flow_l2_accept, flow_ip_accept}
};

static flow_ops_t flow_ip_proto_ops = {
	flow_ip_proto_accept_fe,
	flow_ip_proto_hash_fe,
	flow_ip_proto_match_fe,
	flow_generic_insert_fe,
	flow_ip_proto_hash,
	{flow_l2_accept, flow_ip_accept}
};

static flow_ops_t flow_transport_ops = {
	flow_transport_accept_fe,
	flow_transport_hash_fe,
	flow_transport_match_fe,
	flow_generic_insert_fe,
	flow_transport_hash,
	{flow_l2_accept, flow_ip_accept, flow_transport_accept}
};

/*
 * The set of legal flow masks, each paired with its operation vector
 * and hash table size.
 */
static flow_tab_info_t flow_tab_info_list[] = {
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
	{&flow_ip_ops, FLOW_IP_DSFIELD, 1},
	{&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024}
};

#define	FLOW_MAX_TAB_INFO \
	((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t))

/*
 * Returns the table info whose mask exactly equals the given flow mask,
 * or NULL if the mask is not a legal combination.
 */
static flow_tab_info_t *
mac_flow_tab_info_get(flow_mask_t mask)
{
	int	i;

	for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
		if (mask == flow_tab_info_list[i].fti_mask)
			return (&flow_tab_info_list[i]);
	}
	return (NULL);
}