/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/strsun.h>
#include <sys/sdt.h>
#include <sys/mac.h>
#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/dls.h>
#include <sys/dls_impl.h>
#include <sys/mac_soft_ring.h>
#include <sys/ethernet.h>
#include <sys/vlan.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>

/* global flow table, will be a per exclusive-zone table later */
static mod_hash_t	*flow_hash;
static krwlock_t	flow_tab_lock;

/* kmem caches for flow entries and flow tables */
static kmem_cache_t	*flow_cache;
static kmem_cache_t	*flow_tab_cache;
static flow_ops_t	flow_l2_ops;

/*
 * Maps a kstat counter name to the offset of the corresponding
 * counter within flow_stats_t.
 */
typedef struct {
	const char	*fs_name;
	uint_t		fs_offset;
} flow_stats_info_t;

#define	FS_OFF(f)	(offsetof(flow_stats_t, f))
static flow_stats_info_t flow_stats_list[] = {
	{"rbytes",	FS_OFF(fs_rbytes)},
	{"ipackets",	FS_OFF(fs_ipackets)},
	{"ierrors",	FS_OFF(fs_ierrors)},
	{"obytes",	FS_OFF(fs_obytes)},
	{"opackets",	FS_OFF(fs_opackets)},
	{"oerrors",	FS_OFF(fs_oerrors)}
};
#define	FS_SIZE	(sizeof (flow_stats_list) / sizeof (flow_stats_info_t))

/*
 * Checks whether a flow mask is legal.  Returns the flow_tab_info_t
 * matching the mask, or NULL if the mask is not supported (see use in
 * mac_flow_add_subflow()).
 */
static flow_tab_info_t	*mac_flow_tab_info_get(flow_mask_t);

/*
 * Initialize a flow's named kstats, one entry per counter in
 * flow_stats_list, all of type uint64.
 */
static void
flow_stat_init(kstat_named_t *knp)
{
	int	i;

	for (i = 0; i < FS_SIZE; i++, knp++) {
		kstat_named_init(knp, flow_stats_list[i].fs_name,
		    KSTAT_DATA_UINT64);
	}
}

/*
 * kstat update callback for a flow.  Copies the current counters from
 * the flow entry into the kstat data area.  A non-global zone may only
 * see the stats of flows that belong to it; other flows read as zero.
 */
static int
flow_stat_update(kstat_t *ksp, int rw)
{
	flow_entry_t	*fep = ksp->ks_private;
	flow_stats_t	*fsp = &fep->fe_flowstats;
	kstat_named_t	*knp = ksp->ks_data;
	uint64_t	*statp;
	zoneid_t	zid;
	int		i;

	if (rw != KSTAT_READ)
		return (EACCES);

	zid = getzoneid();
	if (zid != GLOBAL_ZONEID && zid != fep->fe_zoneid) {
		/* hide another zone's counters: report all zeros */
		for (i = 0; i < FS_SIZE; i++, knp++)
			knp->value.ui64 = 0;

		return (0);
	}

	for (i = 0; i < FS_SIZE; i++, knp++) {
		/* locate the counter via its recorded offset */
		statp = (uint64_t *)
		    ((uchar_t *)fsp + flow_stats_list[i].fs_offset);

		knp->value.ui64 = *statp;
	}
	return (0);
}

/*
 * Create and install the "flow" kstat for the given flow entry.
 * Failure to create the kstat is not fatal; fe_ksp simply stays NULL.
 */
static void
flow_stat_create(flow_entry_t *fep)
{
	kstat_t		*ksp;
	kstat_named_t	*knp;
	uint_t		nstats = FS_SIZE;

	ksp = kstat_create("unix", 0, (char *)fep->fe_flow_name, "flow",
	    KSTAT_TYPE_NAMED, nstats, 0);
	if (ksp == NULL)
		return;

	ksp->ks_update = flow_stat_update;
	ksp->ks_private = fep;
	fep->fe_ksp = ksp;

	knp = (kstat_named_t *)ksp->ks_data;
	flow_stat_init(knp);
	kstat_install(ksp);
}

/*
 * Delete a flow's kstat, if one was successfully created.
 */
void
flow_stat_destroy(flow_entry_t *fep)
{
	if (fep->fe_ksp != NULL) {
		kstat_delete(fep->fe_ksp);
		fep->fe_ksp = NULL;
	}
}

/*
 * Initialize the flow table
 */
void
mac_flow_init()
{
	flow_cache = kmem_cache_create("flow_entry_cache",
	    sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	flow_tab_cache = kmem_cache_create("flow_tab_cache",
	    sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	flow_hash = mod_hash_create_extended("flow_hash",
	    100, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
	rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
}

/*
 * Cleanup and release the flow table
 */
void
mac_flow_fini()
{
	kmem_cache_destroy(flow_cache);
	kmem_cache_destroy(flow_tab_cache);
	mod_hash_destroy_hash(flow_hash);
	rw_destroy(&flow_tab_lock);
}

/*
 * mac_flow_create(): create a flow_entry_t.
 *
 * If *flentp is NULL a new entry is allocated and (minimally)
 * initialized; otherwise the caller's entry is (re)configured.
 * With fd == NULL only the name is set and the rest is deferred.
 * On success the (possibly newly allocated) entry is returned via
 * *flentp.
 */
int
mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
    void *client_cookie, uint_t type, flow_entry_t **flentp)
{
	flow_entry_t	*flent = *flentp;
	int		err = 0;

	if (mrp != NULL) {
		err = mac_validate_props(mrp);
		if (err != 0)
			return (err);
	}

	if (flent == NULL) {
		flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
		bzero(flent, sizeof (*flent));
		mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);

		/* Initialize the receiver function to a safe routine */
		flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
		flent->fe_index = -1;
	}
	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);

	/* This is an initial flow, will be configured later */
	if (fd == NULL) {
		*flentp = flent;
		return (0);
	}

	flent->fe_client_cookie = client_cookie;
	flent->fe_type = type;

	/*
	 * As flow creation is only allowed in global zone, this will
	 * always set fe_zoneid to GLOBAL_ZONEID, and dls_add_flow() will
	 * later set the right value.
	 */
	flent->fe_zoneid = getzoneid();

	/* Save flow desc */
	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));

	if (mrp != NULL) {
		/*
		 * We have already set fe_resource_props for a Link.
		 */
		if (type & FLOW_USER) {
			bcopy(mrp, &flent->fe_resource_props,
			    sizeof (mac_resource_props_t));
		}
		/*
		 * The effective resource list should reflect the priority
		 * that we set implicitly.
		 */
		if (!(mrp->mrp_mask & MRP_PRIORITY))
			mrp->mrp_mask |= MRP_PRIORITY;
		if (type & FLOW_USER)
			mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
		else
			mrp->mrp_priority = MPL_LINK_DEFAULT;
		bcopy(mrp, &flent->fe_effective_props,
		    sizeof (mac_resource_props_t));
	}
	flow_stat_create(flent);

	*flentp = flent;
	return (0);
}

/*
 * Validate flow entry and add it to a flow table.
 *
 * Returns EOPNOTSUPP if the descriptor's mask does not match the
 * table, EALREADY on a duplicate flow, or the error from the table's
 * accept/insert ops.  On success the entry is hashed into the table,
 * marked FE_FLOW_TAB and remembers its bucket in fe_index.
 */
int
mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_entry_t	**headp, **p;
	flow_ops_t	*ops = &ft->ft_ops;
	flow_mask_t	mask;
	uint32_t	index;
	int		err;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * Check for invalid bits in mask.
	 */
	mask = flent->fe_flow_desc.fd_mask;
	if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
		return (EOPNOTSUPP);

	/*
	 * Validate flent.
	 */
	if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
		DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
		    flow_entry_t *, flent, int, err);
		return (err);
	}

	/*
	 * Flent is valid. now calculate hash and insert it
	 * into hash table.
	 */
	index = ops->fo_hash_fe(ft, flent);

	/*
	 * We do not need a lock up until now because we were
	 * not accessing the flow table.
	 */
	rw_enter(&ft->ft_lock, RW_WRITER);
	headp = &ft->ft_table[index];

	/*
	 * Check for duplicate flow.
	 */
	for (p = headp; *p != NULL; p = &(*p)->fe_next) {
		if ((*p)->fe_flow_desc.fd_mask !=
		    flent->fe_flow_desc.fd_mask)
			continue;

		if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
			rw_exit(&ft->ft_lock);
			/* NOTE(review): err is not yet set on this path */
			DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
			    flow_entry_t *, flent, int, err);
			return (EALREADY);
		}
	}

	/*
	 * Insert flow to hash list.
	 */
	err = ops->fo_insert_fe(ft, headp, flent);
	if (err != 0) {
		rw_exit(&ft->ft_lock);
		DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
		    flow_entry_t *, flent, int, err);
		return (err);
	}

	/*
	 * Save the hash index so it can be used by mac_flow_remove().
	 */
	flent->fe_index = (int)index;

	/*
	 * Save the flow tab back reference.
	 */
	flent->fe_flow_tab = ft;
	FLOW_MARK(flent, FE_FLOW_TAB);
	ft->ft_flow_count++;
	rw_exit(&ft->ft_lock);
	return (0);
}

/*
 * Remove a flow from a mac client's subflow table
 */
void
mac_flow_rem_subflow(flow_entry_t *flent)
{
	flow_tab_t		*ft = flent->fe_flow_tab;
	mac_client_impl_t	*mcip = ft->ft_mcip;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	mac_flow_remove(ft, flent, B_FALSE);
	if (flent->fe_mcip == NULL) {
		/*
		 * The interface is not yet plumbed and mac_client_flow_add
		 * was not done.
		 */
		if (FLOW_TAB_EMPTY(ft)) {
			mac_flow_tab_destroy(ft);
			mcip->mci_subflow_tab = NULL;
		}
		return;
	}
	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
	mac_link_flow_clean((mac_client_handle_t)mcip, flent);
}

/*
 * Add a flow to a mac client's subflow table and instantiate the flow
 * in the mac by creating the associated SRSs etc.
 */
int
mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
    boolean_t instantiate_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	flow_tab_info_t		*ftinfo;
	flow_mask_t		mask;
	flow_tab_t		*ft;
	int			err;
	boolean_t		ft_created = B_FALSE;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	/*
	 * If the subflow table exists already just add the new subflow
	 * to the existing table, else we create a new subflow table below.
	 */
	ft = mcip->mci_subflow_tab;
	if (ft == NULL) {
		mask = flent->fe_flow_desc.fd_mask;
		/*
		 * Try to create a new table and then add the subflow to the
		 * newly created subflow table
		 */
		if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL)
			return (EOPNOTSUPP);

		mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
		    mcip->mci_mip, &ft);
		ft_created = B_TRUE;
	}

	err = mac_flow_add(ft, flent);
	if (err != 0) {
		/* undo the table creation if we were the ones to create it */
		if (ft_created)
			mac_flow_tab_destroy(ft);
		return (err);
	}

	if (instantiate_flow) {
		/* Now activate the flow by creating its SRSs */
		ASSERT(MCIP_DATAPATH_SETUP(mcip));
		err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
		if (err != 0) {
			mac_flow_remove(ft, flent, B_FALSE);
			if (ft_created)
				mac_flow_tab_destroy(ft);
			return (err);
		}
	} else {
		/* no datapath yet; flag it so init can happen at plumb time */
		FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	}
	if (ft_created) {
		ASSERT(mcip->mci_subflow_tab == NULL);
		ft->ft_mcip = mcip;
		mcip->mci_subflow_tab = ft;
		if (instantiate_flow)
			mac_client_update_classifier(mcip, B_TRUE);
	}
	return (0);
}

/*
 * Remove flow entry from flow table.
 */
void
mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
{
	flow_entry_t	**fp;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
	if (!(flent->fe_flags & FE_FLOW_TAB))
		return;

	rw_enter(&ft->ft_lock, RW_WRITER);
	/*
	 * If this is a permanent removal from the flow table, mark it
	 * CONDEMNED to prevent future references. If this is a temporary
	 * removal from the table, say to update the flow descriptor then
	 * we don't mark it CONDEMNED
	 */
	if (!temp)
		FLOW_MARK(flent, FE_CONDEMNED);
	/*
	 * Locate the specified flent.
	 */
	fp = &ft->ft_table[flent->fe_index];
	while (*fp != flent)
		fp = &(*fp)->fe_next;

	/*
	 * The flent must exist. Otherwise it's a bug.
	 * NOTE(review): fp is an address-of and can never be NULL here;
	 * a missing flent would fault in the loop above first.
	 */
	ASSERT(fp != NULL);
	*fp = flent->fe_next;
	flent->fe_next = NULL;

	/*
	 * Reset fe_index to -1 so any attempt to call mac_flow_remove()
	 * on a flent that is supposed to be in the table (FE_FLOW_TAB)
	 * will panic.
	 */
	flent->fe_index = -1;
	FLOW_UNMARK(flent, FE_FLOW_TAB);
	ft->ft_flow_count--;
	rw_exit(&ft->ft_lock);
}

/*
 * This is the flow lookup routine used by the mac sw classifier engine.
 * On success a reference is held on the returned flow entry via
 * FLOW_TRY_REFHOLD.  Returns ENOENT if no flow matches, EINVAL if a
 * needed pullup cannot be done, or the accept routine's error.
 */
int
mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
{
	flow_state_t	s;
	flow_entry_t	*flent;
	flow_ops_t	*ops = &ft->ft_ops;
	boolean_t	retried = B_FALSE;
	int		i, err;

	s.fs_flags = flags;
	s.fs_mp = mp;
retry:

	/*
	 * Walk the list of predeclared accept functions.
	 * Each of these would accumulate enough state to allow the next
	 * accept routine to make progress.
	 */
	for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
		if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
			/*
			 * ENOBUFS indicates that the mp could be too short
			 * and may need a pullup.
			 */
			if (err != ENOBUFS || retried)
				return (err);

			/*
			 * Don't modify the mblk if there are references to it.
			 * Also, there is no point pulling up if b_cont is NULL.
			 */
			if (DB_REF(mp) > 1 || mp->b_cont == NULL ||
			    pullupmsg(mp, -1) == 0)
				return (EINVAL);

			retried = B_TRUE;
			DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
			    flow_state_t *, &s);
			goto retry;
		}
	}

	/*
	 * The packet is considered sane. We may now attempt to
	 * find the corresponding flent.
	 */
	rw_enter(&ft->ft_lock, RW_READER);
	flent = ft->ft_table[ops->fo_hash(ft, &s)];
	for (; flent != NULL; flent = flent->fe_next) {
		if (flent->fe_match(ft, flent, &s)) {
			FLOW_TRY_REFHOLD(flent, err);
			if (err != 0)
				continue;
			*flentp = flent;
			rw_exit(&ft->ft_lock);
			return (0);
		}
	}
	rw_exit(&ft->ft_lock);
	return (ENOENT);
}

/*
 * Walk flow table.
 * The caller is assumed to have proper perimeter protection.
 * The walk aborts with the callback's error on the first non-zero
 * return.
 */
int
mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
    void *arg)
{
	int		err, i, cnt = 0;
	flow_entry_t	*flent;

	if (ft == NULL)
		return (0);

	for (i = 0; i < ft->ft_size; i++) {
		for (flent = ft->ft_table[i]; flent != NULL;
		    flent = flent->fe_next) {
			cnt++;
			err = (*fn)(flent, arg);
			if (err != 0)
				return (err);
		}
	}
	VERIFY(cnt == ft->ft_flow_count);
	return (0);
}

/*
 * Same as the above except the table's ft_lock is taken (as writer)
 * for protection here.
 */
int
mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
    void *arg)
{
	int		err;

	if (ft == NULL)
		return (0);

	rw_enter(&ft->ft_lock, RW_WRITER);
	err = mac_flow_walk_nolock(ft, fn, arg);
	rw_exit(&ft->ft_lock);
	return (err);
}

static boolean_t mac_flow_clean(flow_entry_t *);

/*
 * Destroy a flow entry.
 * Called when the last reference on a flow is released.
 */
void
mac_flow_destroy(flow_entry_t *flent)
{
	ASSERT(flent->fe_refcnt == 0);

	if ((flent->fe_type & FLOW_USER) != 0) {
		/* user flows must already be clean; just assert that */
		ASSERT(mac_flow_clean(flent));
	} else {
		mac_flow_cleanup(flent);
	}

	mutex_destroy(&flent->fe_lock);
	cv_destroy(&flent->fe_cv);
	flow_stat_destroy(flent);
	kmem_cache_free(flow_cache, flent);
}

/*
 * XXX eric
 * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
 * mac_link_flow_modify() should really be moved/reworked into the
 * two functions below. This would consolidate all the mac property
 * checking in one place. I'm leaving this alone for now since it's
 * out of scope of the new flows work.
 */

/*
 * Fold the caller-supplied resource properties into the flow's
 * effective properties and return a bitmask (MRP_*) of the properties
 * that actually changed.
 */
/* ARGSUSED */
uint32_t
mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
{
	uint32_t		changed_mask = 0;
	mac_resource_props_t	*fmrp = &flent->fe_effective_props;
	int			i;

	if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
	    (fmrp->mrp_maxbw != mrp->mrp_maxbw)) {
		changed_mask |= MRP_MAXBW;
		fmrp->mrp_maxbw = mrp->mrp_maxbw;
		if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
			fmrp->mrp_mask &= ~MRP_MAXBW;
		} else {
			fmrp->mrp_mask |= MRP_MAXBW;
		}
	}

	if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
		if (fmrp->mrp_priority != mrp->mrp_priority)
			changed_mask |= MRP_PRIORITY;
		if (mrp->mrp_priority == MPL_RESET) {
			fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
			fmrp->mrp_mask &= ~MRP_PRIORITY;
		} else {
			fmrp->mrp_priority = mrp->mrp_priority;
			fmrp->mrp_mask |= MRP_PRIORITY;
		}
	}

	/* modify fanout */
	if ((mrp->mrp_mask & MRP_CPUS) != 0) {
		if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
		    (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
			for (i = 0; i < mrp->mrp_ncpus; i++) {
				if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
					break;
			}
			if (i == mrp->mrp_ncpus) {
				/*
				 * The new set of cpus passed is exactly
				 * the same as the existing set.
				 */
				return (changed_mask);
			}
		}
		changed_mask |= MRP_CPUS;
		MAC_COPY_CPUS(mrp, fmrp);
	}
	return (changed_mask);
}

/*
 * Apply new resource properties to a flow and push the changes into
 * the flow's SRSs so they take effect immediately.
 */
void
mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
{
	uint32_t		changed_mask;
	mac_client_impl_t	*mcip = flent->fe_mcip;
	mac_resource_props_t	*mcip_mrp = MCIP_RESOURCE_PROPS(mcip);

	ASSERT(flent != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	rw_enter(&ft->ft_lock, RW_WRITER);

	/* Update the cached values inside the subflow entry */
	changed_mask = mac_flow_modify_props(flent, mrp);
	rw_exit(&ft->ft_lock);
	/*
	 * Push the changed parameters to the scheduling code in the
	 * SRS's, to take effect right away.
	 */
	if (changed_mask & MRP_MAXBW) {
		mac_srs_update_bwlimit(flent, mrp);
		/*
		 * If bandwidth is changed, we may have to change
		 * the number of soft ring to be used for fanout.
		 * Call mac_flow_update_fanout() if MAC_BIND_CPU
		 * is not set and there is no user supplied cpu
		 * info. This applies only to link at this time.
		 */
		if (!(flent->fe_type & FLOW_USER) &&
		    !(changed_mask & MRP_CPUS) &&
		    !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
			mac_fanout_setup(mcip, flent, mcip_mrp,
			    mac_rx_deliver, mcip, NULL);
		}
	}
	if (mrp->mrp_mask & MRP_PRIORITY)
		mac_flow_update_priority(mcip, flent);

	if (changed_mask & MRP_CPUS)
		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL);
}

/*
 * This function waits for a certain condition to be met and is generally
 * used before a destructive or quiescing operation.
 */
void
mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
{
	mutex_enter(&flent->fe_lock);
	flent->fe_flags |= FE_WAITER;

	switch (event) {
	case FLOW_DRIVER_UPCALL:
		/*
		 * We want to make sure the driver upcalls have finished before
		 * we signal the Rx SRS worker to quit.
		 */
		while (flent->fe_refcnt != 1)
			cv_wait(&flent->fe_cv, &flent->fe_lock);
		break;

	case FLOW_USER_REF:
		/*
		 * Wait for the fe_user_refcnt to drop to 0. The flow has
		 * been removed from the global flow hash.
		 */
		ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH));
		while (flent->fe_user_refcnt != 0)
			cv_wait(&flent->fe_cv, &flent->fe_lock);
		break;

	default:
		ASSERT(0);
	}

	flent->fe_flags &= ~FE_WAITER;
	mutex_exit(&flent->fe_lock);
}

/*
 * Sanity checks run on a FLOW_USER flow before it is freed: asserts
 * that all of its datapath resources have already been released.
 * Always returns B_TRUE (only called under ASSERT in DEBUG kernels).
 */
static boolean_t
mac_flow_clean(flow_entry_t *flent)
{
	ASSERT(flent->fe_next == NULL);
	ASSERT(flent->fe_tx_srs == NULL);
	ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL);
	ASSERT(flent->fe_mbg == NULL);

	return (B_TRUE);
}

/*
 * Release the datapath resources still attached to a flow entry:
 * the broadcast group (if any), the Tx SRS and the Rx SRS.
 */
void
mac_flow_cleanup(flow_entry_t *flent)
{
	if ((flent->fe_type & FLOW_USER) == 0) {
		ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) ||
		    (flent->fe_mbg != NULL && flent->fe_mcip == NULL));
		ASSERT(flent->fe_refcnt == 0);
	} else {
		ASSERT(flent->fe_refcnt == 1);
	}

	if (flent->fe_mbg != NULL) {
		ASSERT(flent->fe_tx_srs == NULL);
		/* This is a multicast or broadcast flow entry */
		mac_bcast_grp_free(flent->fe_mbg);
		flent->fe_mbg = NULL;
	}

	if (flent->fe_tx_srs != NULL) {
		ASSERT(flent->fe_mbg == NULL);
		mac_srs_free(flent->fe_tx_srs);
		flent->fe_tx_srs = NULL;
	}

	/*
	 * In the normal case fe_rx_srs_cnt is 1. However in the error case
	 * when mac_unicast_add fails we may not have set up any SRS
	 * in which case fe_rx_srs_cnt will be zero.
781 */ 782 if (flent->fe_rx_srs_cnt != 0) { 783 ASSERT(flent->fe_rx_srs_cnt == 1); 784 mac_srs_free(flent->fe_rx_srs[0]); 785 flent->fe_rx_srs[0] = NULL; 786 flent->fe_rx_srs_cnt = 0; 787 } 788 ASSERT(flent->fe_rx_srs[0] == NULL); 789 } 790 791 void 792 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd) 793 { 794 /* 795 * Grab the fe_lock to see a self-consistent fe_flow_desc. 796 * Updates to the fe_flow_desc happen under the fe_lock 797 * after removing the flent from the flow table 798 */ 799 mutex_enter(&flent->fe_lock); 800 bcopy(&flent->fe_flow_desc, fd, sizeof (*fd)); 801 mutex_exit(&flent->fe_lock); 802 } 803 804 /* 805 * Update a field of a flow entry. The mac perimeter ensures that 806 * this is the only thread doing a modify operation on this mac end point. 807 * So the flow table can't change or disappear. The ft_lock protects access 808 * to the flow entry, and holding the lock ensures that there isn't any thread 809 * accessing the flow entry or attempting a flow table lookup. However 810 * data threads that are using the flow entry based on the old descriptor 811 * will continue to use the flow entry. If strong coherence is required 812 * then the flow will have to be quiesced before the descriptor can be 813 * changed. 814 */ 815 void 816 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd) 817 { 818 flow_tab_t *ft = flent->fe_flow_tab; 819 flow_desc_t old_desc; 820 int err; 821 822 if (ft == NULL) { 823 /* 824 * The flow hasn't yet been inserted into the table, 825 * so only the caller knows about this flow, however for 826 * uniformity we grab the fe_lock here. 827 */ 828 mutex_enter(&flent->fe_lock); 829 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); 830 mutex_exit(&flent->fe_lock); 831 } 832 833 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 834 835 /* 836 * Need to remove the flow entry from the table and reinsert it, 837 * into a potentially diference hash line. The hash depends on 838 * the new descriptor fields. 
However access to fe_desc itself 839 * is always under the fe_lock. This helps log and stat functions 840 * see a self-consistent fe_flow_desc. 841 */ 842 mac_flow_remove(ft, flent, B_TRUE); 843 old_desc = flent->fe_flow_desc; 844 845 mutex_enter(&flent->fe_lock); 846 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); 847 mutex_exit(&flent->fe_lock); 848 849 if (mac_flow_add(ft, flent) != 0) { 850 /* 851 * The add failed say due to an invalid flow descriptor. 852 * Undo the update 853 */ 854 flent->fe_flow_desc = old_desc; 855 err = mac_flow_add(ft, flent); 856 ASSERT(err == 0); 857 } 858 } 859 860 void 861 mac_flow_set_name(flow_entry_t *flent, const char *name) 862 { 863 flow_tab_t *ft = flent->fe_flow_tab; 864 865 if (ft == NULL) { 866 /* 867 * The flow hasn't yet been inserted into the table, 868 * so only the caller knows about this flow 869 */ 870 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); 871 } else { 872 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 873 } 874 875 mutex_enter(&flent->fe_lock); 876 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); 877 mutex_exit(&flent->fe_lock); 878 } 879 880 /* 881 * Return the client-private cookie that was associated with 882 * the flow when it was created. 883 */ 884 void * 885 mac_flow_get_client_cookie(flow_entry_t *flent) 886 { 887 return (flent->fe_client_cookie); 888 } 889 890 /* 891 * Forward declarations. 892 */ 893 static uint32_t flow_l2_hash(flow_tab_t *, flow_state_t *); 894 static int flow_l2_accept(flow_tab_t *, flow_state_t *); 895 static uint32_t flow_ether_hash(flow_tab_t *, flow_state_t *); 896 static int flow_ether_accept(flow_tab_t *, flow_state_t *); 897 898 /* 899 * Create flow table. 
 */
void
mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
    mac_impl_t *mip, flow_tab_t **ftp)
{
	flow_tab_t	*ft;
	flow_ops_t	*new_ops;

	ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
	bzero(ft, sizeof (*ft));

	/* hash array of 'size' bucket heads */
	ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);

	/*
	 * We make a copy of the ops vector instead of just pointing to it
	 * because we might want to customize the ops vector on a per table
	 * basis (e.g. for optimization).
	 */
	new_ops = &ft->ft_ops;
	bcopy(ops, new_ops, sizeof (*ops));
	ft->ft_mask = mask;
	ft->ft_size = size;
	ft->ft_mip = mip;

	/*
	 * Optimization for DL_ETHER media.
	 */
	if (mip->mi_info.mi_nativemedia == DL_ETHER) {
		if (new_ops->fo_hash == flow_l2_hash)
			new_ops->fo_hash = flow_ether_hash;

		if (new_ops->fo_accept[0] == flow_l2_accept)
			new_ops->fo_accept[0] = flow_ether_accept;

	}
	*ftp = ft;
}

/*
 * Convenience wrapper: create the standard L2 flow table keyed on
 * destination MAC address and VLAN id.
 */
void
mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
{
	mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
	    1024, mip, ftp);
}

/*
 * Destroy flow table.
 */
void
mac_flow_tab_destroy(flow_tab_t *ft)
{
	if (ft == NULL)
		return;

	/* the table must be empty before it can be torn down */
	ASSERT(ft->ft_flow_count == 0);
	kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
	bzero(ft, sizeof (*ft));
	kmem_cache_free(flow_tab_cache, ft);
}

/*
 * Add a new flow entry to the global flow hash table
 */
int
mac_flow_hash_add(flow_entry_t *flent)
{
	int	err;

	rw_enter(&flow_tab_lock, RW_WRITER);
	err = mod_hash_insert(flow_hash,
	    (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
	if (err != 0) {
		/* a flow with the same name already exists */
		rw_exit(&flow_tab_lock);
		return (EEXIST);
	}
	/* Mark as inserted into the global flow hash table */
	FLOW_MARK(flent, FE_G_FLOW_HASH);
	rw_exit(&flow_tab_lock);
	return (err);
}

/*
 * Remove a flow entry from the global flow hash table
 */
void
mac_flow_hash_remove(flow_entry_t *flent)
{
	mod_hash_val_t	val;

	rw_enter(&flow_tab_lock, RW_WRITER);
	VERIFY(mod_hash_remove(flow_hash,
	    (mod_hash_key_t)flent->fe_flow_name, &val) == 0);

	/* Clear the mark that says inserted into the global flow hash table */
	FLOW_UNMARK(flent, FE_G_FLOW_HASH);
	rw_exit(&flow_tab_lock);
}

/*
 * Retrieve a flow entry from the global flow hash table.
 * On success a user reference is held on the returned entry; the
 * caller must release it with FLOW_USER_REFRELE().
 */
int
mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
{
	int		err;
	flow_entry_t	*flent;

	rw_enter(&flow_tab_lock, RW_READER);
	err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
	    (mod_hash_val_t *)&flent);
	if (err != 0) {
		rw_exit(&flow_tab_lock);
		return (ENOENT);
	}
	ASSERT(flent != NULL);
	FLOW_USER_REFHOLD(flent);
	rw_exit(&flow_tab_lock);

	*flentp = flent;
	return (0);
}

/*
 * Initialize or release mac client flows by walking the subflow table.
 * These are typically invoked during plumb/unplumb of links.
 */

/*
 * Per-flow callback for mac_link_init_flows(): bring up the flow's
 * datapath; on failure just warn and keep walking.
 */
static int
mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
{
	mac_client_impl_t	*mcip = arg;

	if (mac_link_flow_init(arg, flent) != 0) {
		cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
		    flent->fe_flow_name, mcip->mci_name);
	} else {
		FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
	}
	return (0);
}

/*
 * Instantiate all of a client's configured subflows (link plumb path).
 */
void
mac_link_init_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_init_flows_cb, mcip);
	/*
	 * If mac client had subflow(s) configured before plumb, change
	 * function to mac_rx_srs_subflow_process and in case of hardware
	 * classification, disable polling.
	 */
	mac_client_update_classifier(mcip, B_TRUE);

}

/*
 * Returns B_TRUE if the client has at least one configured subflow.
 */
boolean_t
mac_link_has_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * Per-flow callback for mac_link_release_flows(): quiesce the flow's
 * datapath and tear it down.
 */
static int
mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
{
	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
	mac_link_flow_clean(arg, flent);
	return (0);
}

/*
 * Tear down all of a client's subflow datapaths (link unplumb path).
 */
void
mac_link_release_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	/*
	 * Change the mci_flent callback back to mac_rx_srs_process()
	 * because flows are about to be deactivated.
	 */
	mac_client_update_classifier(mcip, B_FALSE);
	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_release_flows_cb, mcip);
}

/*
 * Rename a flow and, if it has a kstat, re-create the kstat so it is
 * published under the new name.
 */
void
mac_rename_flow(flow_entry_t *fep, const char *new_name)
{
	mac_flow_set_name(fep, new_name);
	if (fep->fe_ksp != NULL) {
		flow_stat_destroy(fep);
		flow_stat_create(fep);
	}
}

/*
 * mac_link_flow_init()
 * Internal flow interface used for allocating SRSs and related
 * data structures. Not meant to be used by mac clients.
 */
int
mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_impl_t		*mip = mcip->mci_mip;
	int			err;

	ASSERT(mch != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
		return (err);

	/* a non-NULL fe_mcip marks the flow as fully initialized */
	sub_flow->fe_mcip = mcip;

	return (0);
}

/*
 * mac_link_flow_add()
 * Used by flowadm(1m) or kernel mac clients for creating flows.
 */
int
mac_link_flow_add(datalink_id_t linkid, char *flow_name,
    flow_desc_t *flow_desc, mac_resource_props_t *mrp)
{
	flow_entry_t		*flent = NULL;
	int			err;
	dls_dl_handle_t		dlh;
	dls_link_t		*dlp;
	boolean_t		link_held = B_FALSE;
	boolean_t		hash_added = B_FALSE;
	mac_perim_handle_t	mph;

	/* fail fast if a flow of this name already exists */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err == 0) {
		FLOW_USER_REFRELE(flent);
		return (EEXIST);
	}

	/*
	 * First create a flow entry given the description provided
	 * by the caller.
	 */
	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
	    FLOW_USER | FLOW_OTHER, &flent);

	if (err != 0)
		return (err);

	/*
	 * We've got a local variable referencing this flow now, so we need
	 * to hold it. We'll release this flow before returning.
	 * All failures until we return will undo any action that may internally
	 * held the flow, so the last REFRELE will assure a clean freeing
	 * of resources.
	 */
	FLOW_REFHOLD(flent);

	flent->fe_link_id = linkid;
	FLOW_MARK(flent, FE_INCIPIENT);

	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0) {
		FLOW_FINAL_REFRELE(flent);
		return (err);
	}

	/*
	 * dls will eventually be merged with mac so it's ok
	 * to call dls' internal functions.
	 */
	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
	if (err != 0)
		goto bail;

	link_held = B_TRUE;

	/*
	 * Add the flow to the global flow table, this table will be per
	 * exclusive zone so each zone can have its own flow namespace.
	 * RFE 6625651 will fix this.
	 *
	 */
	if ((err = mac_flow_hash_add(flent)) != 0)
		goto bail;

	hash_added = B_TRUE;

	/*
	 * do not allow flows to be configured on an anchor VNIC
	 */
	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
		err = ENOTSUP;
		goto bail;
	}

	/*
	 * Save the zoneid of the underlying link in the flow entry,
	 * this is needed to prevent non-global zone from getting
	 * statistics information of global zone.
	 */
	flent->fe_zoneid = dlp->dl_zid;

	/*
	 * Add the subflow to the subflow table. Also instantiate the flow
	 * in the mac if there is an active DLS user. The dl_mah is set when
	 * dls_active_set() is called, typically during interface plumb.
	 */
	err = mac_flow_add_subflow(dlp->dl_mch, flent, dlp->dl_mah != NULL);
	if (err != 0)
		goto bail;

	FLOW_UNMARK(flent, FE_INCIPIENT);
	dls_devnet_rele_link(dlh, dlp);
	mac_perim_exit(mph);
	return (0);

bail:
	if (hash_added)
		mac_flow_hash_remove(flent);

	if (link_held)
		dls_devnet_rele_link(dlh, dlp);

	/*
	 * Wait for any transient global flow hash refs to clear
	 * and then release the creation reference on the flow
	 */
	mac_flow_wait(flent, FLOW_USER_REF);
	FLOW_FINAL_REFRELE(flent);
	mac_perim_exit(mph);
	return (err);
}

/*
 * mac_link_flow_clean()
 * Internal flow interface used for freeing SRSs and related
 * data structures. Not meant to be used by mac clients.
 */
void
mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_impl_t		*mip = mcip->mci_mip;
	boolean_t		last_subflow;

	ASSERT(mch != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/*
	 * This sub flow entry may fail to be fully initialized by
	 * mac_link_flow_init(). If so, simply return.
	 */
	if (sub_flow->fe_mcip == NULL)
		return;

	last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
	/*
	 * Tear down the data path
	 */
	mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
	sub_flow->fe_mcip = NULL;

	/*
	 * Delete the SRSs associated with this subflow. If this is being
	 * driven by flowadm(1M) then the subflow will be deleted by
	 * dls_rem_flow. However if this is a result of the interface being
	 * unplumbed then the subflow itself won't be deleted.
	 */
	mac_flow_cleanup(sub_flow);

	/*
	 * If all the subflows are gone, renable some of the stuff
	 * we disabled when adding a subflow, polling etc.
1281 */ 1282 if (last_subflow) { 1283 /* 1284 * The subflow table itself is not protected by any locks or 1285 * refcnts. Hence quiesce the client upfront before clearing 1286 * mci_subflow_tab. 1287 */ 1288 mac_client_quiesce(mcip); 1289 mac_client_update_classifier(mcip, B_FALSE); 1290 mac_flow_tab_destroy(mcip->mci_subflow_tab); 1291 mcip->mci_subflow_tab = NULL; 1292 mac_client_restart(mcip); 1293 } 1294 } 1295 1296 /* 1297 * mac_link_flow_remove() 1298 * Used by flowadm(1m) or kernel mac clients for removing flows. 1299 */ 1300 int 1301 mac_link_flow_remove(char *flow_name) 1302 { 1303 flow_entry_t *flent; 1304 mac_perim_handle_t mph; 1305 int err; 1306 datalink_id_t linkid; 1307 1308 err = mac_flow_lookup_byname(flow_name, &flent); 1309 if (err != 0) 1310 return (err); 1311 1312 linkid = flent->fe_link_id; 1313 FLOW_USER_REFRELE(flent); 1314 1315 /* 1316 * The perim must be acquired before acquiring any other references 1317 * to maintain the lock and perimeter hierarchy. Please note the 1318 * FLOW_REFRELE above. 1319 */ 1320 err = mac_perim_enter_by_linkid(linkid, &mph); 1321 if (err != 0) 1322 return (err); 1323 1324 /* 1325 * Note the second lookup of the flow, because a concurrent thread 1326 * may have removed it already while we were waiting to enter the 1327 * link's perimeter. 1328 */ 1329 err = mac_flow_lookup_byname(flow_name, &flent); 1330 if (err != 0) { 1331 mac_perim_exit(mph); 1332 return (err); 1333 } 1334 FLOW_USER_REFRELE(flent); 1335 1336 /* 1337 * Remove the flow from the subflow table and deactivate the flow 1338 * by quiescing and removings its SRSs 1339 */ 1340 mac_flow_rem_subflow(flent); 1341 1342 /* 1343 * Finally, remove the flow from the global table. 
1344 */ 1345 mac_flow_hash_remove(flent); 1346 1347 /* 1348 * Wait for any transient global flow hash refs to clear 1349 * and then release the creation reference on the flow 1350 */ 1351 mac_flow_wait(flent, FLOW_USER_REF); 1352 FLOW_FINAL_REFRELE(flent); 1353 1354 mac_perim_exit(mph); 1355 1356 return (0); 1357 } 1358 1359 /* 1360 * mac_link_flow_modify() 1361 * Modifies the properties of a flow identified by its name. 1362 */ 1363 int 1364 mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp) 1365 { 1366 flow_entry_t *flent; 1367 mac_client_impl_t *mcip; 1368 int err = 0; 1369 mac_perim_handle_t mph; 1370 datalink_id_t linkid; 1371 flow_tab_t *flow_tab; 1372 1373 err = mac_validate_props(mrp); 1374 if (err != 0) 1375 return (err); 1376 1377 err = mac_flow_lookup_byname(flow_name, &flent); 1378 if (err != 0) 1379 return (err); 1380 1381 linkid = flent->fe_link_id; 1382 FLOW_USER_REFRELE(flent); 1383 1384 /* 1385 * The perim must be acquired before acquiring any other references 1386 * to maintain the lock and perimeter hierarchy. Please note the 1387 * FLOW_REFRELE above. 1388 */ 1389 err = mac_perim_enter_by_linkid(linkid, &mph); 1390 if (err != 0) 1391 return (err); 1392 1393 /* 1394 * Note the second lookup of the flow, because a concurrent thread 1395 * may have removed it already while we were waiting to enter the 1396 * link's perimeter. 1397 */ 1398 err = mac_flow_lookup_byname(flow_name, &flent); 1399 if (err != 0) { 1400 mac_perim_exit(mph); 1401 return (err); 1402 } 1403 FLOW_USER_REFRELE(flent); 1404 1405 /* 1406 * If this flow is attached to a MAC client, then pass the request 1407 * along to the client. 1408 * Otherwise, just update the cached values. 
1409 */ 1410 mcip = flent->fe_mcip; 1411 mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE); 1412 if (mcip != NULL) { 1413 if ((flow_tab = mcip->mci_subflow_tab) == NULL) { 1414 err = ENOENT; 1415 } else { 1416 mac_flow_modify(flow_tab, flent, mrp); 1417 } 1418 } else { 1419 (void) mac_flow_modify_props(flent, mrp); 1420 } 1421 1422 done: 1423 mac_perim_exit(mph); 1424 return (err); 1425 } 1426 1427 1428 /* 1429 * State structure and misc functions used by mac_link_flow_walk(). 1430 */ 1431 typedef struct { 1432 int (*ws_func)(mac_flowinfo_t *, void *); 1433 void *ws_arg; 1434 } flow_walk_state_t; 1435 1436 static void 1437 mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent) 1438 { 1439 (void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name, 1440 MAXFLOWNAMELEN); 1441 finfop->fi_link_id = flent->fe_link_id; 1442 finfop->fi_flow_desc = flent->fe_flow_desc; 1443 finfop->fi_resource_props = flent->fe_resource_props; 1444 } 1445 1446 static int 1447 mac_link_flow_walk_cb(flow_entry_t *flent, void *arg) 1448 { 1449 flow_walk_state_t *statep = arg; 1450 mac_flowinfo_t finfo; 1451 1452 mac_link_flowinfo_copy(&finfo, flent); 1453 return (statep->ws_func(&finfo, statep->ws_arg)); 1454 } 1455 1456 /* 1457 * mac_link_flow_walk() 1458 * Invokes callback 'func' for all flows belonging to the specified link. 
 */
int
mac_link_flow_walk(datalink_id_t linkid,
    int (*func)(mac_flowinfo_t *, void *), void *arg)
{
	mac_client_impl_t	*mcip;
	mac_perim_handle_t	mph;
	flow_walk_state_t	state;
	dls_dl_handle_t		dlh;
	dls_link_t		*dlp;
	int			err;

	/* Enter the link's perimeter before taking the dls link hold. */
	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}

	mcip = (mac_client_impl_t *)dlp->dl_mch;
	state.ws_func = func;
	state.ws_arg = arg;

	err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_flow_walk_cb, &state);

	dls_devnet_rele_link(dlh, dlp);
	mac_perim_exit(mph);
	return (err);
}

/*
 * mac_link_flow_info()
 * Retrieves information about a specific flow.
 * Looks the flow up by name, copies its public fields into 'finfo'
 * and drops the lookup reference before returning.
 */
int
mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
{
	flow_entry_t	*flent;
	int		err;

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	mac_link_flowinfo_copy(finfo, flent);
	FLOW_USER_REFRELE(flent);
	return (0);
}

/*
 * Hash the low three bytes of the MAC address xor'ed with the VLAN id,
 * reduced modulo the table size 's'.
 */
#define	HASH_MAC_VID(a, v, s) \
	((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s))

/* True if the first mblk of the packet does not reach 'end'. */
#define	PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end))

/*
 * Match a packet (VLAN id + destination MAC extracted into fs_l2info)
 * against a layer 2 flow descriptor.
 */
/* ARGSUSED */
static boolean_t
flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l2info_t	*l2 = &s->fs_l2info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (l2->l2_vid == fd->fd_vid &&
	    bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
}

/*
 * Layer 2 hash function.
 * Must be paired with flow_l2_accept() within a set of flow_ops
 * because it assumes the dest address is already extracted.
 */
static uint32_t
flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t	*l2 = &s->fs_l2info;

	return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
}

/*
 * This is the generic layer 2 accept function.
 * It makes use of mac_header_info() to extract the header length,
 * sap, vlan ID and destination address.
 */
static int
flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
{
	boolean_t		is_ether;
	flow_l2info_t		*l2 = &s->fs_l2info;
	mac_header_info_t	mhi;
	int			err;

	is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
	if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
	    s->fs_mp, &mhi)) != 0) {
		/* Map a truncated/garbled header to ENOBUFS for callers. */
		if (err == EINVAL)
			err = ENOBUFS;

		return (err);
	}

	l2->l2_start = s->fs_mp->b_rptr;
	l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;

	/*
	 * For a VLAN-tagged ethernet frame (unless the caller asked us to
	 * ignore the tag) take the sap and vid from the 802.1Q header;
	 * bounds-check the mblk before dereferencing the larger header.
	 */
	if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
		struct ether_vlan_header *evhp =
		    (struct ether_vlan_header *)l2->l2_start;

		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
			return (ENOBUFS);

		l2->l2_sap = ntohs(evhp->ether_type);
		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
		l2->l2_hdrsize = sizeof (*evhp);
	} else {
		l2->l2_sap = mhi.mhi_bindsap;
		l2->l2_vid = 0;
		l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
	}
	return (0);
}

/*
 * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
 * accept(). The notable difference is that dest address is now extracted
 * by hash() rather than by accept(). This saves a few memory references
 * for flow tables that do not care about mac addresses.
 */
static uint32_t
flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t			*l2 = &s->fs_l2info;
	struct ether_vlan_header	*evhp;

	/* Extract the dest MAC here instead of in accept() (see above). */
	evhp = (struct ether_vlan_header *)l2->l2_start;
	l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
	return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
}

/*
 * Ethernet-specific accept: parses the (possibly 802.1Q tagged) header
 * directly rather than going through mac_header_info().
 */
/* ARGSUSED */
static int
flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t			*l2 = &s->fs_l2info;
	struct ether_vlan_header	*evhp;
	uint16_t			sap;

	evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
	l2->l2_start = (uchar_t *)evhp;

	/* Must hold at least an untagged ethernet header. */
	if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
		return (ENOBUFS);

	/*
	 * Tagged frame: re-check the length against the larger VLAN
	 * header before reading the inner type/tci fields.
	 */
	if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
			return (ENOBUFS);

		l2->l2_sap = ntohs(evhp->ether_type);
		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
		l2->l2_hdrsize = sizeof (struct ether_vlan_header);
	} else {
		l2->l2_sap = sap;
		l2->l2_vid = 0;
		l2->l2_hdrsize = sizeof (struct ether_header);
	}
	return (0);
}

/*
 * Validates a layer 2 flow entry.
 * Requires a non-zero destination MAC of at least ETHERADDRL bytes;
 * a VLAN id is only legal on ethernet media and must be non-zero.
 */
static int
flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	int		i;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	/*
	 * Dest address is mandatory.
	 */
	if ((fd->fd_mask & FLOW_LINK_DST) == 0)
		return (EINVAL);

	/* Reject an all-zero or too-short MAC address. */
	for (i = 0; i < fd->fd_mac_len; i++) {
		if (fd->fd_dst_mac[i] != 0)
			break;
	}
	if (i == fd->fd_mac_len || fd->fd_mac_len < ETHERADDRL)
		return (EINVAL);

	if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
		/*
		 * VLAN flows are only supported over ethernet macs.
		 */
		if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
			return (EINVAL);

		if (fd->fd_vid == 0)
			return (EINVAL);

	}
	flent->fe_match = flow_l2_match;
	return (0);
}

/*
 * Calculates hash index of flow entry.
 */
static uint32_t
flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	/* If no VLAN was specified, fd_vid must have been left at 0. */
	ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
	return (HASH_MAC_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
}

/*
 * This is used for duplicate flow checking.
 */
/* ARGSUSED */
static boolean_t
flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
	return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
	    fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
}

/*
 * Generic flow entry insertion function.
 * Used by flow tables that do not have ordering requirements.
 * New entries are pushed on the front of the list.
 */
/* ARGSUSED */
static int
flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
    flow_entry_t *flent)
{
	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	if (*headp != NULL) {
		ASSERT(flent->fe_next == NULL);
		flent->fe_next = *headp;
	}
	*headp = flent;
	return (0);
}

/*
 * IP version independent DSField matching
 */
/* ARGSUSED */
static boolean_t
flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	/* Compare the masked TOS (v4) or traffic class (v6) byte. */
	switch (l3info->l3_version) {
	case IPV4_VERSION: {
		ipha_t		*ipha = (ipha_t *)l3info->l3_start;

		return ((ipha->ipha_type_of_service &
		    fd->fd_dsfield_mask) == fd->fd_dsfield);
	}
	case IPV6_VERSION: {
		ip6_t		*ip6h = (ip6_t *)l3info->l3_start;

		return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
		    fd->fd_dsfield_mask) == fd->fd_dsfield);
	}
	default:
		return (B_FALSE);
	}
}

/*
 * IP v4 and v6 address matching.
 * The netmask only needs to be applied on the packet but not on the
 * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets.
 */

/* ARGSUSED */
static boolean_t
flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;
	ipha_t		*ipha = (ipha_t *)l3info->l3_start;
	in_addr_t	addr;

	/* l3_dst_or_src was set by flow_ip_hash() from the direction. */
	addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
		return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
		    V4_PART_OF_V6(fd->fd_local_addr));
	}
	return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
	    V4_PART_OF_V6(fd->fd_remote_addr));
}

/* ARGSUSED */
static boolean_t
flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;
	ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
	in6_addr_t	*addrp;

	addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
		return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
		    fd->fd_local_addr));
	}
	return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
}

/* Match on the IP protocol number only. */
/* ARGSUSED */
static boolean_t
flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (l3info->l3_protocol == fd->fd_protocol);
}

/*
 * Packet-side hash for IP flow tables. As a side effect it records in
 * l3_dst_or_src which packet address (dst or src) the match functions
 * should compare, based on flow direction.
 */
static uint32_t
flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_mask_t	mask = ft->ft_mask;

	if ((mask & FLOW_IP_LOCAL) != 0) {
		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
	} else if ((mask & FLOW_IP_REMOTE) != 0) {
		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
	} else if ((mask & FLOW_IP_DSFIELD) != 0) {
		/*
		 * DSField flents are arranged as a single list.
		 */
		return (0);
	}
	/*
	 * IP addr flents are hashed into two lists, v4 or v6.
	 */
	ASSERT(ft->ft_size >= 2);
	return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
}

/* Packet-side hash for protocol-only flow tables. */
static uint32_t
flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;

	return (l3info->l3_protocol % ft->ft_size);
}

/*
 * Layer 3 accept: locates the IP header after the layer 2 header,
 * extracts header size, protocol, version and fragmentation state.
 */
/* ARGSUSED */
static int
flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t	*l2info = &s->fs_l2info;
	flow_l3info_t	*l3info = &s->fs_l3info;
	uint16_t	sap = l2info->l2_sap;
	uchar_t		*l3_start;

	l3info->l3_start = l3_start = l2info->l2_start + l2info->l2_hdrsize;
	/* Reject misaligned headers rather than risk unaligned loads. */
	if (!OK_32PTR(l3_start))
		return (EINVAL);

	switch (sap) {
	case ETHERTYPE_IP: {
		ipha_t	*ipha = (ipha_t *)l3_start;

		if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
			return (ENOBUFS);

		l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
		l3info->l3_protocol = ipha->ipha_protocol;
		l3info->l3_version = IPV4_VERSION;
		l3info->l3_fragmented =
		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
		break;
	}
	case ETHERTYPE_IPV6: {
		ip6_t		*ip6h = (ip6_t *)l3_start;
		uint16_t	ip6_hdrlen;
		uint8_t		nexthdr;

		/* Walks extension headers; fails if the mblk is short. */
		if (!mac_ip_hdr_length_v6(s->fs_mp, ip6h, &ip6_hdrlen,
		    &nexthdr)) {
			return (ENOBUFS);
		}
		l3info->l3_hdrsize = ip6_hdrlen;
		l3info->l3_protocol = nexthdr;
		l3info->l3_version = IPV6_VERSION;
		l3info->l3_fragmented = B_FALSE;
		break;
	}
	default:
		return (EINVAL);
	}
	return (0);
}

/*
 * Validates a protocol-only flow entry; only a fixed set of transport
 * and ICMP protocols is accepted.
 */
/* ARGSUSED */
static int
flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	switch (fd->fd_protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		flent->fe_match = flow_ip_proto_match;
		return (0);
	default:
		return (EINVAL);
	}
}

/*
 * Validates an IP address or DSField flow entry and selects the
 * appropriate match function. Note that the supplied address is
 * pre-masked in place with its netmask.
 */
/* ARGSUSED */
static int
flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;
	flow_mask_t	mask;
	uint8_t		version;
	in6_addr_t	*addr, *netmask;

	/*
	 * DSField does not require a IP version.
	 */
	if (fd->fd_mask == FLOW_IP_DSFIELD) {
		if (fd->fd_dsfield_mask == 0)
			return (EINVAL);

		flent->fe_match = flow_ip_dsfield_match;
		return (0);
	}

	/*
	 * IP addresses must come with a version to avoid ambiguity.
	 */
	if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
		return (EINVAL);

	version = fd->fd_ipversion;
	if (version != IPV4_VERSION && version != IPV6_VERSION)
		return (EINVAL);

	/* Exactly one of local/remote address may be specified. */
	mask = fd->fd_mask & ~FLOW_IP_VERSION;
	switch (mask) {
	case FLOW_IP_LOCAL:
		addr = &fd->fd_local_addr;
		netmask = &fd->fd_local_netmask;
		break;
	case FLOW_IP_REMOTE:
		addr = &fd->fd_remote_addr;
		netmask = &fd->fd_remote_netmask;
		break;
	default:
		return (EINVAL);
	}

	/*
	 * Apply netmask onto specified address.
	 */
	V6_MASK_COPY(*addr, *netmask, *addr);
	if (version == IPV4_VERSION) {
		ipaddr_t	v4addr = V4_PART_OF_V6((*addr));
		ipaddr_t	v4mask = V4_PART_OF_V6((*netmask));

		if (v4addr == 0 || v4mask == 0)
			return (EINVAL);
		flent->fe_match = flow_ip_v4_match;
	} else {
		if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
		    IN6_IS_ADDR_UNSPECIFIED(netmask))
			return (EINVAL);
		flent->fe_match = flow_ip_v6_match;
	}
	return (0);
}

/* Entry-side hash for protocol-only flow tables. */
static uint32_t
flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (fd->fd_protocol % ft->ft_size);
}

/* Entry-side hash for IP flow tables; mirrors flow_ip_hash(). */
static uint32_t
flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	/*
	 * DSField flents are arranged as a single list.
	 */
	if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
		return (0);

	/*
	 * IP addr flents are hashed into two lists, v4 or v6.
	 */
	ASSERT(ft->ft_size >= 2);
	return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
}

/* Duplicate check for protocol-only flow entries. */
/* ARGSUSED */
static boolean_t
flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	return (fd1->fd_protocol == fd2->fd_protocol);
}

/* Duplicate check for IP address / DSField flow entries. */
/* ARGSUSED */
static boolean_t
flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
	in6_addr_t	*a1, *m1, *a2, *m2;

	ASSERT(fd1->fd_mask == fd2->fd_mask);
	if (fd1->fd_mask == FLOW_IP_DSFIELD) {
		return (fd1->fd_dsfield == fd2->fd_dsfield &&
		    fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
	}

	/*
	 * flow_ip_accept_fe() already validated the version.
	 */
	ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
	if (fd1->fd_ipversion != fd2->fd_ipversion)
		return (B_FALSE);

	switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
	case FLOW_IP_LOCAL:
		a1 = &fd1->fd_local_addr;
		m1 = &fd1->fd_local_netmask;
		a2 = &fd2->fd_local_addr;
		m2 = &fd2->fd_local_netmask;
		break;
	case FLOW_IP_REMOTE:
		a1 = &fd1->fd_remote_addr;
		m1 = &fd1->fd_remote_netmask;
		a2 = &fd2->fd_remote_addr;
		m2 = &fd2->fd_remote_netmask;
		break;
	default:
		/*
		 * This is unreachable given the checks in
		 * flow_ip_accept_fe().
		 */
		return (B_FALSE);
	}

	if (fd1->fd_ipversion == IPV4_VERSION) {
		return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
		    V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));

	} else {
		return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
		    IN6_ARE_ADDR_EQUAL(m1, m2));
	}
}

/*
 * Convert an IPv6 (or v4-mapped) netmask to its prefix length.
 * Assumes a contiguous mask; scans 32-bit words from the least
 * significant end, subtracting trailing zero bits from IPV6_ABITS.
 */
static int
flow_ip_mask2plen(in6_addr_t *v6mask)
{
	int	bits;
	int	plen = IPV6_ABITS;
	int	i;

	for (i = 3; i >= 0; i--) {
		if (v6mask->s6_addr32[i] == 0) {
			plen -= 32;
			continue;
		}
		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
		if (bits == 0)
			break;
		plen -= bits;
	}
	return (plen);
}

/*
 * Insert an IP flow entry into its hash list. DSField entries are
 * simply prepended; address entries are kept in descending prefix
 * length order so the most specific subnet matches first.
 */
/* ARGSUSED */
static int
flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
    flow_entry_t *flent)
{
	flow_entry_t	**p = headp;
	flow_desc_t	*fd0, *fd;
	in6_addr_t	*m0, *m;
	int		plen0, plen;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * No special ordering needed for dsfield.
	 */
	fd0 = &flent->fe_flow_desc;
	if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
		if (*p != NULL) {
			ASSERT(flent->fe_next == NULL);
			flent->fe_next = *p;
		}
		*p = flent;
		return (0);
	}

	/*
	 * IP address flows are arranged in descending prefix length order.
	 */
	m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
	    &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
	plen0 = flow_ip_mask2plen(m0);
	ASSERT(plen0 != 0);

	for (; *p != NULL; p = &(*p)->fe_next) {
		fd = &(*p)->fe_flow_desc;

		/*
		 * Normally a dsfield flent shouldn't end up on the same
		 * list as an IP address because flow tables are (for now)
		 * disjoint. If we decide to support both IP and dsfield
		 * in the same table in the future, this check will allow
		 * for that.
		 */
		if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
			continue;

		/*
		 * We also allow for the mixing of local and remote address
		 * flents within one list.
		 */
		m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
		    &fd->fd_local_netmask : &fd->fd_remote_netmask;
		plen = flow_ip_mask2plen(m);

		if (plen <= plen0)
			break;
	}
	if (*p != NULL) {
		ASSERT(flent->fe_next == NULL);
		flent->fe_next = *p;
	}
	*p = flent;
	return (0);
}

/*
 * Transport layer protocol and port matching functions.
 */

/* Match protocol + local port (port pre-selected by the hash). */
/* ARGSUSED */
static boolean_t
flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (fd->fd_protocol == l3info->l3_protocol &&
	    fd->fd_local_port == l4info->l4_hash_port);
}

/* Match protocol + remote port (port pre-selected by the hash). */
/* ARGSUSED */
static boolean_t
flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (fd->fd_protocol == l3info->l3_protocol &&
	    fd->fd_remote_port == l4info->l4_hash_port);
}

/*
 * Transport hash function.
 * Since we only support either local or remote port flows,
 * we only need to extract one of the ports to be used for
 * matching.
 */
static uint32_t
flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	uint8_t		proto = l3info->l3_protocol;
	boolean_t	dst_or_src;

	/*
	 * Pick the packet port that corresponds to the flow's port:
	 * for local-port tables the dst port of inbound packets, for
	 * remote-port tables the dst port of outbound packets.
	 */
	if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
		dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
	} else {
		dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
	}

	/* Remember the chosen port for the match functions. */
	l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
	    l4info->l4_src_port;

	return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
}

/*
 * Unlike other accept() functions above, we do not need to get the header
 * size because this is our highest layer so far. If we want to do support
 * other higher layer protocols, we would need to save the l4_hdrsize
 * in the code below.
 */

/* ARGSUSED */
static int
flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	uint8_t		proto = l3info->l3_protocol;
	uchar_t		*l4_start;

	l4info->l4_start = l4_start = l3info->l3_start + l3info->l3_hdrsize;
	/* Reject misaligned headers rather than risk unaligned loads. */
	if (!OK_32PTR(l4_start))
		return (EINVAL);

	/* Non-first fragments carry no transport header; can't classify. */
	if (l3info->l3_fragmented == B_TRUE)
		return (EINVAL);

	/* Ports are stored in network byte order, as in the descriptor. */
	switch (proto) {
	case IPPROTO_TCP: {
		struct tcphdr	*tcph = (struct tcphdr *)l4_start;

		if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
			return (ENOBUFS);

		l4info->l4_src_port = tcph->th_sport;
		l4info->l4_dst_port = tcph->th_dport;
		break;
	}
	case IPPROTO_UDP: {
		struct udphdr	*udph = (struct udphdr *)l4_start;

		if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
			return (ENOBUFS);

		l4info->l4_src_port = udph->uh_sport;
		l4info->l4_dst_port = udph->uh_dport;
		break;
	}
	case IPPROTO_SCTP: {
		sctp_hdr_t	*sctph = (sctp_hdr_t *)l4_start;

		if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
			return (ENOBUFS);

		l4info->l4_src_port = sctph->sh_sport;
		l4info->l4_dst_port = sctph->sh_dport;
		break;
	}
	default:
		return (EINVAL);
	}

	return (0);
}

/*
 * Validates transport flow entry.
 * The protocol field must be present.
 */

/* ARGSUSED */
static int
flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;
	flow_mask_t	mask = fd->fd_mask;

	if ((mask & FLOW_IP_PROTOCOL) == 0)
		return (EINVAL);

	/* Only port-bearing transports are supported here. */
	switch (fd->fd_protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
		break;
	default:
		return (EINVAL);
	}

	/* Exactly one of local/remote port must accompany the protocol. */
	switch (mask & ~FLOW_IP_PROTOCOL) {
	case FLOW_ULP_PORT_LOCAL:
		if (fd->fd_local_port == 0)
			return (EINVAL);

		flent->fe_match = flow_transport_lport_match;
		break;
	case FLOW_ULP_PORT_REMOTE:
		if (fd->fd_remote_port == 0)
			return (EINVAL);

		flent->fe_match = flow_transport_rport_match;
		break;
	case 0:
		/*
		 * transport-only flows conflicts with our table type.
		 */
		return (EOPNOTSUPP);
	default:
		return (EINVAL);
	}

	return (0);
}

/* Entry-side hash; mirrors flow_transport_hash(). */
static uint32_t
flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;
	uint16_t	port = 0;

	port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
	    fd->fd_local_port : fd->fd_remote_port;

	return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
}

/* Duplicate check for transport flow entries. */
/* ARGSUSED */
static boolean_t
flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	if (fd1->fd_protocol != fd2->fd_protocol)
		return (B_FALSE);

	if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
		return (fd1->fd_local_port == fd2->fd_local_port);

	return (fd1->fd_remote_port == fd2->fd_remote_port);
}

/*
 * Per-table-type operation vectors. Initializers are positional:
 * accept_fe, hash_fe, match_fe, insert_fe, packet hash, then the
 * ordered list of per-layer packet accept functions.
 */
static flow_ops_t flow_l2_ops = {
	flow_l2_accept_fe,
	flow_l2_hash_fe,
	flow_l2_match_fe,
	flow_generic_insert_fe,
	flow_l2_hash,
	{flow_l2_accept}
};

static flow_ops_t flow_ip_ops = {
	flow_ip_accept_fe,
	flow_ip_hash_fe,
	flow_ip_match_fe,
	flow_ip_insert_fe,
	flow_ip_hash,
	{flow_l2_accept, flow_ip_accept}
};

static flow_ops_t flow_ip_proto_ops = {
	flow_ip_proto_accept_fe,
	flow_ip_proto_hash_fe,
	flow_ip_proto_match_fe,
	flow_generic_insert_fe,
	flow_ip_proto_hash,
	{flow_l2_accept, flow_ip_accept}
};

static flow_ops_t flow_transport_ops = {
	flow_transport_accept_fe,
	flow_transport_hash_fe,
	flow_transport_match_fe,
	flow_generic_insert_fe,
	flow_transport_hash,
	{flow_l2_accept, flow_ip_accept, flow_transport_accept}
};

/*
 * The set of legal flow masks and, for each, the ops vector and hash
 * table size to use.
 */
static flow_tab_info_t flow_tab_info_list[] = {
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
	{&flow_ip_ops, FLOW_IP_DSFIELD, 1},
	{&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024}
};

#define	FLOW_MAX_TAB_INFO \
	((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t))

/*
 * Checks whether a flow mask is legal; returns the matching table info
 * or NULL if the mask is not an exact match for any supported type.
 */
static flow_tab_info_t *
mac_flow_tab_info_get(flow_mask_t mask)
{
	int	i;

	for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
		if (mask == flow_tab_info_list[i].fti_mask)
			return (&flow_tab_info_list[i]);
	}
	return (NULL);
}