/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/strsun.h>
#include <sys/sdt.h>
#include <sys/mac.h>
#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/dls.h>
#include <sys/dls_impl.h>
#include <sys/mac_soft_ring.h>
#include <sys/ethernet.h>
#include <sys/vlan.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>

/* global flow table, will be a per exclusive-zone table later */
static mod_hash_t *flow_hash;
static krwlock_t flow_tab_lock;

static kmem_cache_t *flow_cache;
static kmem_cache_t *flow_tab_cache;
static flow_ops_t flow_l2_ops;

typedef struct {
	const char *fs_name;
	uint_t fs_offset;
} flow_stats_info_t;

#define	FS_OFF(f)	(offsetof(flow_stats_t, f))
static flow_stats_info_t flow_stats_list[] = {
	{"rbytes", FS_OFF(fs_rbytes)},
	{"ipackets", FS_OFF(fs_ipackets)},
	{"ierrors", FS_OFF(fs_ierrors)},
	{"obytes", FS_OFF(fs_obytes)},
	{"opackets", FS_OFF(fs_opackets)},
	{"oerrors", FS_OFF(fs_oerrors)}
};
#define	FS_SIZE	(sizeof (flow_stats_list) / sizeof (flow_stats_info_t))
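/*
 * For illustration: flow_stat_update() below walks this list and reads
 * each counter through its recorded offset, so the "rbytes" value of a
 * flow is *(uint64_t *)((uchar_t *)&fep->fe_flowstats +
 * FS_OFF(fs_rbytes)). Adding a statistic only takes a new entry here
 * plus the matching field in flow_stats_t.
 */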
/*
 * Checks whether a flow mask is legal.
 */
static flow_tab_info_t *mac_flow_tab_info_get(flow_mask_t);

static void
flow_stat_init(kstat_named_t *knp)
{
	int i;

	for (i = 0; i < FS_SIZE; i++, knp++) {
		kstat_named_init(knp, flow_stats_list[i].fs_name,
		    KSTAT_DATA_UINT64);
	}
}

static int
flow_stat_update(kstat_t *ksp, int rw)
{
	flow_entry_t *fep = ksp->ks_private;
	flow_stats_t *fsp = &fep->fe_flowstats;
	kstat_named_t *knp = ksp->ks_data;
	uint64_t *statp;
	zoneid_t zid;
	int i;

	if (rw != KSTAT_READ)
		return (EACCES);

	zid = getzoneid();
	if (zid != GLOBAL_ZONEID && zid != fep->fe_zoneid) {
		for (i = 0; i < FS_SIZE; i++, knp++)
			knp->value.ui64 = 0;

		return (0);
	}

	for (i = 0; i < FS_SIZE; i++, knp++) {
		statp = (uint64_t *)
		    ((uchar_t *)fsp + flow_stats_list[i].fs_offset);

		knp->value.ui64 = *statp;
	}
	return (0);
}

static void
flow_stat_create(flow_entry_t *fep)
{
	kstat_t *ksp;
	kstat_named_t *knp;
	uint_t nstats = FS_SIZE;

	ksp = kstat_create("unix", 0, (char *)fep->fe_flow_name, "flow",
	    KSTAT_TYPE_NAMED, nstats, 0);
	if (ksp == NULL)
		return;

	ksp->ks_update = flow_stat_update;
	ksp->ks_private = fep;
	fep->fe_ksp = ksp;

	knp = (kstat_named_t *)ksp->ks_data;
	flow_stat_init(knp);
	kstat_install(ksp);
}

void
flow_stat_destroy(flow_entry_t *fep)
{
	if (fep->fe_ksp != NULL) {
		kstat_delete(fep->fe_ksp);
		fep->fe_ksp = NULL;
	}
}

/*
 * Initialize the flow table
 */
void
mac_flow_init()
{
	flow_cache = kmem_cache_create("flow_entry_cache",
	    sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	flow_tab_cache = kmem_cache_create("flow_tab_cache",
	    sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	flow_hash = mod_hash_create_extended("flow_hash",
	    100, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
	rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
}

/*
 * Cleanup and release the flow table
 */
void
mac_flow_fini()
{
	kmem_cache_destroy(flow_cache);
	kmem_cache_destroy(flow_tab_cache);
	mod_hash_destroy_hash(flow_hash);
	rw_destroy(&flow_tab_lock);
}
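/*
 * For reference, a user-defined flow moves through these routines in
 * roughly the following order (see mac_link_flow_add() and
 * mac_link_flow_remove() below for the authoritative sequences):
 *
 *	mac_flow_create()	construct the flow_entry_t
 *	mac_flow_hash_add()	enter it in the global name hash
 *	mac_flow_add_subflow()	insert it in the link's flow table
 *	...
 *	mac_flow_rem_subflow()	remove it from the link's flow table
 *	mac_flow_hash_remove()	remove it from the global name hash
 *	FLOW_FINAL_REFRELE()	free it via mac_flow_destroy()
 */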
/*
 * mac_flow_create(): create a flow_entry_t.
 */
int
mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
    void *client_cookie, uint_t type, flow_entry_t **flentp)
{
	flow_entry_t *flent = *flentp;
	int err = 0;

	if (mrp != NULL) {
		err = mac_validate_props(mrp);
		if (err != 0)
			return (err);
	}

	if (flent == NULL) {
		flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
		bzero(flent, sizeof (*flent));
		mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);

		/* Initialize the receiver function to a safe routine */
		flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
		flent->fe_index = -1;
	}
	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);

	/* This is an initial flow, will be configured later */
	if (fd == NULL) {
		*flentp = flent;
		return (0);
	}

	flent->fe_client_cookie = client_cookie;
	flent->fe_type = type;

	/*
	 * As flow creation is only allowed in the global zone, this will
	 * always set fe_zoneid to GLOBAL_ZONEID, and dls_add_flow() will
	 * later set the right value.
	 */
	flent->fe_zoneid = getzoneid();

	/* Save flow desc */
	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));

	if (mrp != NULL) {
		/*
		 * We have already set fe_resource_props for a Link.
		 */
		if (type & FLOW_USER) {
			bcopy(mrp, &flent->fe_resource_props,
			    sizeof (mac_resource_props_t));
		}
		/*
		 * The effective resource list should reflect the priority
		 * that we set implicitly.
		 */
		if (!(mrp->mrp_mask & MRP_PRIORITY))
			mrp->mrp_mask |= MRP_PRIORITY;
		if (type & FLOW_USER)
			mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
		else
			mrp->mrp_priority = MPL_LINK_DEFAULT;
		bcopy(mrp, &flent->fe_effective_props,
		    sizeof (mac_resource_props_t));
	}
	flow_stat_create(flent);

	*flentp = flent;
	return (0);
}

/*
 * Validate flow entry and add it to a flow table.
 */
int
mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_entry_t **headp, **p;
	flow_ops_t *ops = &ft->ft_ops;
	flow_mask_t mask;
	uint32_t index;
	int err;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * Check for invalid bits in mask.
	 */
	mask = flent->fe_flow_desc.fd_mask;
	if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
		return (EOPNOTSUPP);

	/*
	 * Validate flent.
	 */
	if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
		DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
		    flow_entry_t *, flent, int, err);
		return (err);
	}

	/*
	 * Flent is valid. Now calculate the hash and insert it
	 * into the hash table.
	 */
	index = ops->fo_hash_fe(ft, flent);

	/*
	 * We do not need a lock up until now because we were
	 * not accessing the flow table.
	 */
	rw_enter(&ft->ft_lock, RW_WRITER);
	headp = &ft->ft_table[index];

	/*
	 * Check for duplicate flow.
	 */
	for (p = headp; *p != NULL; p = &(*p)->fe_next) {
		if ((*p)->fe_flow_desc.fd_mask !=
		    flent->fe_flow_desc.fd_mask)
			continue;

		if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
			rw_exit(&ft->ft_lock);
			DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
			    flow_entry_t *, flent, int, err);
			return (EALREADY);
		}
	}

	/*
	 * Insert flow to hash list.
	 */
	err = ops->fo_insert_fe(ft, headp, flent);
	if (err != 0) {
		rw_exit(&ft->ft_lock);
		DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
		    flow_entry_t *, flent, int, err);
		return (err);
	}

	/*
	 * Save the hash index so it can be used by mac_flow_remove().
	 */
	flent->fe_index = (int)index;

	/*
	 * Save the flow tab back reference.
	 */
	flent->fe_flow_tab = ft;
	FLOW_MARK(flent, FE_FLOW_TAB);
	ft->ft_flow_count++;
	rw_exit(&ft->ft_lock);
	return (0);
}
/*
 * Remove a flow from a mac client's subflow table
 */
void
mac_flow_rem_subflow(flow_entry_t *flent)
{
	flow_tab_t *ft = flent->fe_flow_tab;
	mac_client_impl_t *mcip = ft->ft_mcip;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	mac_flow_remove(ft, flent, B_FALSE);
	if (flent->fe_mcip == NULL) {
		/*
		 * The interface is not yet plumbed and mac_client_flow_add
		 * was not done.
		 */
		if (FLOW_TAB_EMPTY(ft)) {
			mac_flow_tab_destroy(ft);
			mcip->mci_subflow_tab = NULL;
		}
		return;
	}
	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
	mac_link_flow_clean((mac_client_handle_t)mcip, flent);
}

/*
 * Add a flow to a mac client's subflow table and instantiate the flow
 * in the mac by creating the associated SRSs etc.
 */
int
mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
    boolean_t instantiate_flow)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	flow_tab_info_t *ftinfo;
	flow_mask_t mask;
	flow_tab_t *ft;
	int err;
	boolean_t ft_created = B_FALSE;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	/*
	 * If the subflow table exists already, just add the new subflow
	 * to the existing table; else we create a new subflow table below.
	 */
	ft = mcip->mci_subflow_tab;
	if (ft == NULL) {
		mask = flent->fe_flow_desc.fd_mask;
		/*
		 * Try to create a new table and then add the subflow to the
		 * newly created subflow table.
		 */
		if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL)
			return (EOPNOTSUPP);

		mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
		    mcip->mci_mip, &ft);
		ft_created = B_TRUE;
	}

	err = mac_flow_add(ft, flent);
	if (err != 0) {
		if (ft_created)
			mac_flow_tab_destroy(ft);
		return (err);
	}

	if (instantiate_flow) {
		/* Now activate the flow by creating its SRSs */
		ASSERT(MCIP_DATAPATH_SETUP(mcip));
		err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
		if (err != 0) {
			mac_flow_remove(ft, flent, B_FALSE);
			if (ft_created)
				mac_flow_tab_destroy(ft);
			return (err);
		}
	} else {
		FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	}
	if (ft_created) {
		ASSERT(mcip->mci_subflow_tab == NULL);
		ft->ft_mcip = mcip;
		mcip->mci_subflow_tab = ft;
		if (instantiate_flow)
			mac_client_update_classifier(mcip, B_TRUE);
	}
	return (0);
}

/*
 * Remove flow entry from flow table.
 */
void
mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
{
	flow_entry_t **fp;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
	if (!(flent->fe_flags & FE_FLOW_TAB))
		return;

	rw_enter(&ft->ft_lock, RW_WRITER);
	/*
	 * If this is a permanent removal from the flow table, mark it
	 * CONDEMNED to prevent future references. If this is a temporary
	 * removal from the table, say to update the flow descriptor, then
	 * we don't mark it CONDEMNED.
	 */
	if (!temp)
		FLOW_MARK(flent, FE_CONDEMNED);
	/*
	 * Locate the specified flent.
	 */
	fp = &ft->ft_table[flent->fe_index];
	while (*fp != flent)
		fp = &(*fp)->fe_next;

	/*
	 * The flent must exist. Otherwise it's a bug.
	 */
	ASSERT(fp != NULL);
	*fp = flent->fe_next;
	flent->fe_next = NULL;

	/*
	 * Reset fe_index to -1 so any attempt to call mac_flow_remove()
	 * on a flent that is supposed to be in the table (FE_FLOW_TAB)
	 * will panic.
	 */
	flent->fe_index = -1;
	FLOW_UNMARK(flent, FE_FLOW_TAB);
	ft->ft_flow_count--;
	rw_exit(&ft->ft_lock);
}

/*
 * This is the flow lookup routine used by the mac sw classifier engine.
 */
int
mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
{
	flow_state_t s;
	flow_entry_t *flent;
	flow_ops_t *ops = &ft->ft_ops;
	boolean_t retried = B_FALSE;
	int i, err;

	s.fs_flags = flags;
retry:
	s.fs_mp = mp;

	/*
	 * Walk the list of predeclared accept functions.
	 * Each of these would accumulate enough state to allow the next
	 * accept routine to make progress.
	 */
	for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
		if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
			mblk_t *last;

			/*
			 * ENOBUFS indicates that the mp could be too short
			 * and may need a pullup.
			 */
			if (err != ENOBUFS || retried)
				return (err);

			/*
			 * The pullup is done on the last processed mblk, not
			 * the starting one. The pullup is not done if the
			 * mblk has references or if b_cont is NULL.
			 */
			last = s.fs_mp;
			if (DB_REF(last) > 1 || last->b_cont == NULL ||
			    pullupmsg(last, -1) == 0)
				return (EINVAL);

			retried = B_TRUE;
			DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
			    flow_state_t *, &s);
			goto retry;
		}
	}

	/*
	 * The packet is considered sane. We may now attempt to
	 * find the corresponding flent.
	 */
	rw_enter(&ft->ft_lock, RW_READER);
	flent = ft->ft_table[ops->fo_hash(ft, &s)];
	for (; flent != NULL; flent = flent->fe_next) {
		if (flent->fe_match(ft, flent, &s)) {
			FLOW_TRY_REFHOLD(flent, err);
			if (err != 0)
				continue;
			*flentp = flent;
			rw_exit(&ft->ft_lock);
			return (0);
		}
	}
	rw_exit(&ft->ft_lock);
	return (ENOENT);
}
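/*
 * As an example of the accept chain above, a flow table keyed on
 * transport ports would typically stack
 * { flow_l2_accept, flow_ip_accept, flow_transport_accept } (see the
 * functions below); each stage fills in fs_l2info/fs_l3info/fs_l4info
 * for the next stage and for the final fe_match call.
 */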
/*
 * Walk flow table.
 * The caller is assumed to have proper perimeter protection.
 */
int
mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
    void *arg)
{
	int err, i, cnt = 0;
	flow_entry_t *flent;

	if (ft == NULL)
		return (0);

	for (i = 0; i < ft->ft_size; i++) {
		for (flent = ft->ft_table[i]; flent != NULL;
		    flent = flent->fe_next) {
			cnt++;
			err = (*fn)(flent, arg);
			if (err != 0)
				return (err);
		}
	}
	VERIFY(cnt == ft->ft_flow_count);
	return (0);
}

/*
 * Same as the above, except the flow table's rwlock is taken as writer
 * here for protection.
 */
int
mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
    void *arg)
{
	int err;

	if (ft == NULL)
		return (0);

	rw_enter(&ft->ft_lock, RW_WRITER);
	err = mac_flow_walk_nolock(ft, fn, arg);
	rw_exit(&ft->ft_lock);
	return (err);
}

static boolean_t mac_flow_clean(flow_entry_t *);

/*
 * Destroy a flow entry. Called when the last reference on a flow is released.
 */
void
mac_flow_destroy(flow_entry_t *flent)
{
	ASSERT(flent->fe_refcnt == 0);

	if ((flent->fe_type & FLOW_USER) != 0) {
		ASSERT(mac_flow_clean(flent));
	} else {
		mac_flow_cleanup(flent);
	}

	mutex_destroy(&flent->fe_lock);
	cv_destroy(&flent->fe_cv);
	flow_stat_destroy(flent);
	kmem_cache_free(flow_cache, flent);
}

/*
 * XXX eric
 * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
 * mac_link_flow_modify() should really be moved/reworked into the
 * two functions below. This would consolidate all the mac property
 * checking in one place. I'm leaving this alone for now since it's
 * out of scope of the new flows work.
 */
/* ARGSUSED */
uint32_t
mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
{
	uint32_t changed_mask = 0;
	mac_resource_props_t *fmrp = &flent->fe_effective_props;
	int i;

	if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
	    (fmrp->mrp_maxbw != mrp->mrp_maxbw)) {
		changed_mask |= MRP_MAXBW;
		fmrp->mrp_maxbw = mrp->mrp_maxbw;
		if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
			fmrp->mrp_mask &= ~MRP_MAXBW;
		} else {
			fmrp->mrp_mask |= MRP_MAXBW;
		}
	}

	if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
		if (fmrp->mrp_priority != mrp->mrp_priority)
			changed_mask |= MRP_PRIORITY;
		if (mrp->mrp_priority == MPL_RESET) {
			fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
			fmrp->mrp_mask &= ~MRP_PRIORITY;
		} else {
			fmrp->mrp_priority = mrp->mrp_priority;
			fmrp->mrp_mask |= MRP_PRIORITY;
		}
	}

	/* modify fanout */
	if ((mrp->mrp_mask & MRP_CPUS) != 0) {
		if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
		    (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
			for (i = 0; i < mrp->mrp_ncpus; i++) {
				if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
					break;
			}
			if (i == mrp->mrp_ncpus) {
				/*
				 * The new set of cpus passed is exactly
				 * the same as the existing set.
				 */
				return (changed_mask);
			}
		}
		changed_mask |= MRP_CPUS;
		MAC_COPY_CPUS(mrp, fmrp);
	}
	return (changed_mask);
}
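/*
 * Note the reset semantics above: setting mrp_maxbw to
 * MRP_MAXBW_RESETVAL, for example, still reports MRP_MAXBW in the
 * returned changed_mask (so callers push the change down to the SRSs)
 * while clearing the bit from the effective mask, since no limit
 * remains in force.
 */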
void
mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
{
	uint32_t changed_mask;
	mac_client_impl_t *mcip = flent->fe_mcip;
	mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip);

	ASSERT(flent != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	rw_enter(&ft->ft_lock, RW_WRITER);

	/* Update the cached values inside the subflow entry */
	changed_mask = mac_flow_modify_props(flent, mrp);
	rw_exit(&ft->ft_lock);
	/*
	 * Push the changed parameters to the scheduling code in the
	 * SRSs, to take effect right away.
	 */
	if (changed_mask & MRP_MAXBW) {
		mac_srs_update_bwlimit(flent, mrp);
		/*
		 * If bandwidth is changed, we may have to change
		 * the number of soft rings to be used for fanout.
		 * Call mac_fanout_setup() if MAC_BIND_CPU
		 * is not set and there is no user supplied cpu
		 * info. This applies only to link at this time.
		 */
		if (!(flent->fe_type & FLOW_USER) &&
		    !(changed_mask & MRP_CPUS) &&
		    !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
			mac_fanout_setup(mcip, flent, mcip_mrp,
			    mac_rx_deliver, mcip, NULL);
		}
	}
	if (mrp->mrp_mask & MRP_PRIORITY)
		mac_flow_update_priority(mcip, flent);

	if (changed_mask & MRP_CPUS)
		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL);
}

/*
 * This function waits for a certain condition to be met and is generally
 * used before a destructive or quiescing operation.
 */
void
mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
{
	mutex_enter(&flent->fe_lock);
	flent->fe_flags |= FE_WAITER;

	switch (event) {
	case FLOW_DRIVER_UPCALL:
		/*
		 * We want to make sure the driver upcalls have finished before
		 * we signal the Rx SRS worker to quit.
		 */
		while (flent->fe_refcnt != 1)
			cv_wait(&flent->fe_cv, &flent->fe_lock);
		break;

	case FLOW_USER_REF:
		/*
		 * Wait for the fe_user_refcnt to drop to 0. The flow has
		 * been removed from the global flow hash.
		 */
		ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH));
		while (flent->fe_user_refcnt != 0)
			cv_wait(&flent->fe_cv, &flent->fe_lock);
		break;

	default:
		ASSERT(0);
	}

	flent->fe_flags &= ~FE_WAITER;
	mutex_exit(&flent->fe_lock);
}

static boolean_t
mac_flow_clean(flow_entry_t *flent)
{
	ASSERT(flent->fe_next == NULL);
	ASSERT(flent->fe_tx_srs == NULL);
	ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL);
	ASSERT(flent->fe_mbg == NULL);

	return (B_TRUE);
}

void
mac_flow_cleanup(flow_entry_t *flent)
{
	if ((flent->fe_type & FLOW_USER) == 0) {
		ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) ||
		    (flent->fe_mbg != NULL && flent->fe_mcip == NULL));
		ASSERT(flent->fe_refcnt == 0);
	} else {
		ASSERT(flent->fe_refcnt == 1);
	}

	if (flent->fe_mbg != NULL) {
		ASSERT(flent->fe_tx_srs == NULL);
		/* This is a multicast or broadcast flow entry */
		mac_bcast_grp_free(flent->fe_mbg);
		flent->fe_mbg = NULL;
	}

	if (flent->fe_tx_srs != NULL) {
		ASSERT(flent->fe_mbg == NULL);
		mac_srs_free(flent->fe_tx_srs);
		flent->fe_tx_srs = NULL;
	}

	/*
	 * In the normal case fe_rx_srs_cnt is 1. However, in the error case
	 * when mac_unicast_add() fails we may not have set up any SRS, in
	 * which case fe_rx_srs_cnt will be zero.
	 */
	if (flent->fe_rx_srs_cnt != 0) {
		ASSERT(flent->fe_rx_srs_cnt == 1);
		mac_srs_free(flent->fe_rx_srs[0]);
		flent->fe_rx_srs[0] = NULL;
		flent->fe_rx_srs_cnt = 0;
	}
	ASSERT(flent->fe_rx_srs[0] == NULL);
}

void
mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
{
	/*
	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
	 * Updates to the fe_flow_desc happen under the fe_lock
	 * after removing the flent from the flow table.
	 */
	mutex_enter(&flent->fe_lock);
	bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
	mutex_exit(&flent->fe_lock);
}
/*
 * Update a field of a flow entry. The mac perimeter ensures that
 * this is the only thread doing a modify operation on this mac end point.
 * So the flow table can't change or disappear. The ft_lock protects access
 * to the flow entry, and holding the lock ensures that there isn't any thread
 * accessing the flow entry or attempting a flow table lookup. However,
 * data threads that are using the flow entry based on the old descriptor
 * will continue to use the flow entry. If strong coherence is required
 * then the flow will have to be quiesced before the descriptor can be
 * changed.
 */
void
mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
{
	flow_tab_t *ft = flent->fe_flow_tab;
	flow_desc_t old_desc;
	int err;

	if (ft == NULL) {
		/*
		 * The flow hasn't yet been inserted into the table,
		 * so only the caller knows about this flow; however, for
		 * uniformity we grab the fe_lock here.
		 */
		mutex_enter(&flent->fe_lock);
		bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
		mutex_exit(&flent->fe_lock);
		return;
	}

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * Need to remove the flow entry from the table and reinsert it,
	 * into a potentially different hash line. The hash depends on
	 * the new descriptor fields. However, access to fe_flow_desc itself
	 * is always under the fe_lock. This helps log and stat functions
	 * see a self-consistent fe_flow_desc.
	 */
	mac_flow_remove(ft, flent, B_TRUE);
	old_desc = flent->fe_flow_desc;

	mutex_enter(&flent->fe_lock);
	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
	mutex_exit(&flent->fe_lock);

	if (mac_flow_add(ft, flent) != 0) {
		/*
		 * The add failed, say due to an invalid flow descriptor.
		 * Undo the update.
		 */
		flent->fe_flow_desc = old_desc;
		err = mac_flow_add(ft, flent);
		ASSERT(err == 0);
	}
}

void
mac_flow_set_name(flow_entry_t *flent, const char *name)
{
	flow_tab_t *ft = flent->fe_flow_tab;

	if (ft == NULL) {
		/*
		 * The flow hasn't yet been inserted into the table,
		 * so only the caller knows about this flow
		 */
		(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
	} else {
		ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
	}

	mutex_enter(&flent->fe_lock);
	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
	mutex_exit(&flent->fe_lock);
}

/*
 * Return the client-private cookie that was associated with
 * the flow when it was created.
 */
void *
mac_flow_get_client_cookie(flow_entry_t *flent)
{
	return (flent->fe_client_cookie);
}

/*
 * Forward declarations.
 */
static uint32_t flow_l2_hash(flow_tab_t *, flow_state_t *);
static int flow_l2_accept(flow_tab_t *, flow_state_t *);
static uint32_t flow_ether_hash(flow_tab_t *, flow_state_t *);
static int flow_ether_accept(flow_tab_t *, flow_state_t *);

/*
 * Create flow table.
 */
void
mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
    mac_impl_t *mip, flow_tab_t **ftp)
{
	flow_tab_t *ft;
	flow_ops_t *new_ops;

	ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
	bzero(ft, sizeof (*ft));

	ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);

	/*
	 * We make a copy of the ops vector instead of just pointing to it
	 * because we might want to customize the ops vector on a per table
	 * basis (e.g. for optimization).
	 */
	new_ops = &ft->ft_ops;
	bcopy(ops, new_ops, sizeof (*ops));
	ft->ft_mask = mask;
	ft->ft_size = size;
	ft->ft_mip = mip;

	/*
	 * Optimization for DL_ETHER media.
	 */
	if (mip->mi_info.mi_nativemedia == DL_ETHER) {
		if (new_ops->fo_hash == flow_l2_hash)
			new_ops->fo_hash = flow_ether_hash;

		if (new_ops->fo_accept[0] == flow_l2_accept)
			new_ops->fo_accept[0] = flow_ether_accept;
	}
	*ftp = ft;
}

void
mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
{
	mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
	    1024, mip, ftp);
}
/*
 * Destroy flow table.
 */
void
mac_flow_tab_destroy(flow_tab_t *ft)
{
	if (ft == NULL)
		return;

	ASSERT(ft->ft_flow_count == 0);
	kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
	bzero(ft, sizeof (*ft));
	kmem_cache_free(flow_tab_cache, ft);
}

/*
 * Add a new flow entry to the global flow hash table
 */
int
mac_flow_hash_add(flow_entry_t *flent)
{
	int err;

	rw_enter(&flow_tab_lock, RW_WRITER);
	err = mod_hash_insert(flow_hash,
	    (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
	if (err != 0) {
		rw_exit(&flow_tab_lock);
		return (EEXIST);
	}
	/* Mark as inserted into the global flow hash table */
	FLOW_MARK(flent, FE_G_FLOW_HASH);
	rw_exit(&flow_tab_lock);
	return (err);
}

/*
 * Remove a flow entry from the global flow hash table
 */
void
mac_flow_hash_remove(flow_entry_t *flent)
{
	mod_hash_val_t val;

	rw_enter(&flow_tab_lock, RW_WRITER);
	VERIFY(mod_hash_remove(flow_hash,
	    (mod_hash_key_t)flent->fe_flow_name, &val) == 0);

	/* Clear the mark that says inserted into the global flow hash table */
	FLOW_UNMARK(flent, FE_G_FLOW_HASH);
	rw_exit(&flow_tab_lock);
}

/*
 * Retrieve a flow entry from the global flow hash table.
 */
int
mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
{
	int err;
	flow_entry_t *flent;

	rw_enter(&flow_tab_lock, RW_READER);
	err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
	    (mod_hash_val_t *)&flent);
	if (err != 0) {
		rw_exit(&flow_tab_lock);
		return (ENOENT);
	}
	ASSERT(flent != NULL);
	FLOW_USER_REFHOLD(flent);
	rw_exit(&flow_tab_lock);

	*flentp = flent;
	return (0);
}
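/*
 * The entry is returned with a user reference held, which the caller
 * must drop. A minimal sketch, using a hypothetical flow name (cf. the
 * real callers such as mac_link_flow_info() below):
 *
 *	flow_entry_t *flent;
 *
 *	if (mac_flow_lookup_byname("myflow", &flent) == 0) {
 *		... inspect flent ...
 *		FLOW_USER_REFRELE(flent);
 *	}
 */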
/*
 * Initialize or release mac client flows by walking the subflow table.
 * These are typically invoked during plumb/unplumb of links.
 */

static int
mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
{
	mac_client_impl_t *mcip = arg;

	if (mac_link_flow_init(arg, flent) != 0) {
		cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
		    flent->fe_flow_name, mcip->mci_name);
	} else {
		FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
	}
	return (0);
}

void
mac_link_init_flows(mac_client_handle_t mch)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;

	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_init_flows_cb, mcip);
	/*
	 * If the mac client had subflow(s) configured before the plumb,
	 * change the callback to mac_rx_srs_subflow_process and, in the
	 * case of hardware classification, disable polling.
	 */
	mac_client_update_classifier(mcip, B_TRUE);
}

boolean_t
mac_link_has_flows(mac_client_handle_t mch)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;

	if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
		return (B_TRUE);

	return (B_FALSE);
}

static int
mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
{
	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
	mac_link_flow_clean(arg, flent);
	return (0);
}

void
mac_link_release_flows(mac_client_handle_t mch)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;

	/*
	 * Change the mci_flent callback back to mac_rx_srs_process()
	 * because flows are about to be deactivated.
	 */
	mac_client_update_classifier(mcip, B_FALSE);
	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_release_flows_cb, mcip);
}

void
mac_rename_flow(flow_entry_t *fep, const char *new_name)
{
	mac_flow_set_name(fep, new_name);
	if (fep->fe_ksp != NULL) {
		flow_stat_destroy(fep);
		flow_stat_create(fep);
	}
}

/*
 * mac_link_flow_init()
 * Internal flow interface used for allocating SRSs and related
 * data structures. Not meant to be used by mac clients.
 */
int
mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = mcip->mci_mip;
	int err;

	ASSERT(mch != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
		return (err);

	sub_flow->fe_mcip = mcip;

	return (0);
}
/*
 * mac_link_flow_add()
 * Used by flowadm(1M) or kernel mac clients for creating flows.
 */
int
mac_link_flow_add(datalink_id_t linkid, char *flow_name,
    flow_desc_t *flow_desc, mac_resource_props_t *mrp)
{
	flow_entry_t *flent = NULL;
	int err;
	dls_dl_handle_t dlh;
	dls_link_t *dlp;
	boolean_t link_held = B_FALSE;
	boolean_t hash_added = B_FALSE;
	mac_perim_handle_t mph;

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err == 0) {
		FLOW_USER_REFRELE(flent);
		return (EEXIST);
	}

	/*
	 * First create a flow entry given the description provided
	 * by the caller.
	 */
	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
	    FLOW_USER | FLOW_OTHER, &flent);

	if (err != 0)
		return (err);

	/*
	 * We've got a local variable referencing this flow now, so we need
	 * to hold it. We'll release this flow before returning.
	 * All failures until we return will undo any action that may
	 * internally have held the flow, so the last REFRELE will assure
	 * a clean freeing of resources.
	 */
	FLOW_REFHOLD(flent);

	flent->fe_link_id = linkid;
	FLOW_MARK(flent, FE_INCIPIENT);

	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0) {
		FLOW_FINAL_REFRELE(flent);
		return (err);
	}

	/*
	 * dls will eventually be merged with mac so it's ok
	 * to call dls' internal functions.
	 */
	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
	if (err != 0)
		goto bail;

	link_held = B_TRUE;

	/*
	 * Add the flow to the global flow table; this table will be per
	 * exclusive zone so each zone can have its own flow namespace.
	 * RFE 6625651 will fix this.
	 */
	if ((err = mac_flow_hash_add(flent)) != 0)
		goto bail;

	hash_added = B_TRUE;

	/*
	 * Do not allow flows to be configured on an anchor VNIC.
	 */
	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
		err = ENOTSUP;
		goto bail;
	}

	/*
	 * Save the zoneid of the underlying link in the flow entry;
	 * this is needed to prevent a non-global zone from getting
	 * statistics information of the global zone.
	 */
	flent->fe_zoneid = dlp->dl_zid;

	/*
	 * Add the subflow to the subflow table. Also instantiate the flow
	 * in the mac if there is an active user (we check if the MAC client's
	 * datapath has been setup).
	 */
	err = mac_flow_add_subflow(dlp->dl_mch, flent,
	    MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
	if (err != 0)
		goto bail;

	FLOW_UNMARK(flent, FE_INCIPIENT);
	dls_devnet_rele_link(dlh, dlp);
	mac_perim_exit(mph);
	return (0);

bail:
	if (hash_added)
		mac_flow_hash_remove(flent);

	if (link_held)
		dls_devnet_rele_link(dlh, dlp);

	/*
	 * Wait for any transient global flow hash refs to clear
	 * and then release the creation reference on the flow
	 */
	mac_flow_wait(flent, FLOW_USER_REF);
	FLOW_FINAL_REFRELE(flent);
	mac_perim_exit(mph);
	return (err);
}
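/*
 * A minimal usage sketch for the above, assuming a hypothetical linkid
 * already known to the caller; the descriptor selects TCP traffic on
 * local port 80 (ports are kept in network byte order):
 *
 *	flow_desc_t fd;
 *	int err;
 *
 *	bzero(&fd, sizeof (fd));
 *	fd.fd_mask = FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL;
 *	fd.fd_protocol = IPPROTO_TCP;
 *	fd.fd_local_port = htons(80);
 *	err = mac_link_flow_add(linkid, "http-flow", &fd, NULL);
 */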
/*
 * mac_link_flow_clean()
 * Internal flow interface used for freeing SRSs and related
 * data structures. Not meant to be used by mac clients.
 */
void
mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = mcip->mci_mip;
	boolean_t last_subflow;

	ASSERT(mch != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/*
	 * This sub flow entry may fail to be fully initialized by
	 * mac_link_flow_init(). If so, simply return.
	 */
	if (sub_flow->fe_mcip == NULL)
		return;

	last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
	/*
	 * Tear down the data path
	 */
	mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
	sub_flow->fe_mcip = NULL;

	/*
	 * Delete the SRSs associated with this subflow. If this is being
	 * driven by flowadm(1M) then the subflow will be deleted by
	 * dls_rem_flow. However if this is a result of the interface being
	 * unplumbed then the subflow itself won't be deleted.
	 */
	mac_flow_cleanup(sub_flow);

	/*
	 * If all the subflows are gone, re-enable some of the things
	 * we disabled when adding a subflow, such as polling.
	 */
	if (last_subflow) {
		/*
		 * The subflow table itself is not protected by any locks or
		 * refcnts. Hence quiesce the client upfront before clearing
		 * mci_subflow_tab.
		 */
		mac_client_quiesce(mcip);
		mac_client_update_classifier(mcip, B_FALSE);
		mac_flow_tab_destroy(mcip->mci_subflow_tab);
		mcip->mci_subflow_tab = NULL;
		mac_client_restart(mcip);
	}
}

/*
 * mac_link_flow_remove()
 * Used by flowadm(1M) or kernel mac clients for removing flows.
 */
int
mac_link_flow_remove(char *flow_name)
{
	flow_entry_t *flent;
	mac_perim_handle_t mph;
	int err;
	datalink_id_t linkid;

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	linkid = flent->fe_link_id;
	FLOW_USER_REFRELE(flent);

	/*
	 * The perim must be acquired before acquiring any other references
	 * to maintain the lock and perimeter hierarchy. Please note the
	 * FLOW_USER_REFRELE above.
	 */
	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	/*
	 * Note the second lookup of the flow, because a concurrent thread
	 * may have removed it already while we were waiting to enter the
	 * link's perimeter.
	 */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}
	FLOW_USER_REFRELE(flent);

	/*
	 * Remove the flow from the subflow table and deactivate the flow
	 * by quiescing and removing its SRSs
	 */
	mac_flow_rem_subflow(flent);

	/*
	 * Finally, remove the flow from the global table.
	 */
	mac_flow_hash_remove(flent);

	/*
	 * Wait for any transient global flow hash refs to clear
	 * and then release the creation reference on the flow
	 */
	mac_flow_wait(flent, FLOW_USER_REF);
	FLOW_FINAL_REFRELE(flent);

	mac_perim_exit(mph);

	return (0);
}

/*
 * mac_link_flow_modify()
 * Modifies the properties of a flow identified by its name.
 */
int
mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
{
	flow_entry_t *flent;
	mac_client_impl_t *mcip;
	int err = 0;
	mac_perim_handle_t mph;
	datalink_id_t linkid;
	flow_tab_t *flow_tab;

	err = mac_validate_props(mrp);
	if (err != 0)
		return (err);

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	linkid = flent->fe_link_id;
	FLOW_USER_REFRELE(flent);

	/*
	 * The perim must be acquired before acquiring any other references
	 * to maintain the lock and perimeter hierarchy. Please note the
	 * FLOW_USER_REFRELE above.
	 */
	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	/*
	 * Note the second lookup of the flow, because a concurrent thread
	 * may have removed it already while we were waiting to enter the
	 * link's perimeter.
	 */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}
	FLOW_USER_REFRELE(flent);

	/*
	 * If this flow is attached to a MAC client, then pass the request
	 * along to the client.
	 * Otherwise, just update the cached values.
	 */
	mcip = flent->fe_mcip;
	mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
	if (mcip != NULL) {
		if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
			err = ENOENT;
		} else {
			mac_flow_modify(flow_tab, flent, mrp);
		}
	} else {
		(void) mac_flow_modify_props(flent, mrp);
	}

done:
	mac_perim_exit(mph);
	return (err);
}

/*
 * State structure and misc functions used by mac_link_flow_walk().
 */
typedef struct {
	int	(*ws_func)(mac_flowinfo_t *, void *);
	void	*ws_arg;
} flow_walk_state_t;

static void
mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent)
{
	(void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name,
	    MAXFLOWNAMELEN);
	finfop->fi_link_id = flent->fe_link_id;
	finfop->fi_flow_desc = flent->fe_flow_desc;
	finfop->fi_resource_props = flent->fe_resource_props;
}

static int
mac_link_flow_walk_cb(flow_entry_t *flent, void *arg)
{
	flow_walk_state_t *statep = arg;
	mac_flowinfo_t finfo;

	mac_link_flowinfo_copy(&finfo, flent);
	return (statep->ws_func(&finfo, statep->ws_arg));
}

/*
 * mac_link_flow_walk()
 * Invokes callback 'func' for all flows belonging to the specified link.
 */
int
mac_link_flow_walk(datalink_id_t linkid,
    int (*func)(mac_flowinfo_t *, void *), void *arg)
{
	mac_client_impl_t *mcip;
	mac_perim_handle_t mph;
	flow_walk_state_t state;
	dls_dl_handle_t dlh;
	dls_link_t *dlp;
	int err;

	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}

	mcip = (mac_client_impl_t *)dlp->dl_mch;
	state.ws_func = func;
	state.ws_arg = arg;

	err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_flow_walk_cb, &state);

	dls_devnet_rele_link(dlh, dlp);
	mac_perim_exit(mph);
	return (err);
}
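/*
 * A sketch of a mac_link_flow_walk() callback: the walker hands each
 * callback a mac_flowinfo_t snapshot rather than the live flow_entry_t,
 * and a nonzero return value terminates the walk.
 *
 *	static int
 *	count_flows_cb(mac_flowinfo_t *finfop, void *arg)
 *	{
 *		(*(uint_t *)arg)++;
 *		return (0);
 *	}
 */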
/*
 * mac_link_flow_info()
 * Retrieves information about a specific flow.
 */
int
mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
{
	flow_entry_t *flent;
	int err;

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	mac_link_flowinfo_copy(finfo, flent);
	FLOW_USER_REFRELE(flent);
	return (0);
}

#define	HASH_MAC_VID(a, v, s) \
	((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s))

#define	PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end))

#define	CHECK_AND_ADJUST_START_PTR(s, start) {		\
	if ((s)->fs_mp->b_wptr == (start)) {		\
		mblk_t *next = (s)->fs_mp->b_cont;	\
		if (next == NULL)			\
			return (EINVAL);		\
							\
		(s)->fs_mp = next;			\
		(start) = next->b_rptr;			\
	}						\
}
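/*
 * HASH_MAC_VID() folds only the three low-order octets of the mac
 * address with the VID; the OUI octets are skipped since they are
 * often identical across hosts on a link. For example, with ft_size
 * 1024, address 0:1:2:a:b:c on VID 5 lands in bucket
 * (((0xa + 0xb + 0xc) ^ 5) % 1024).
 */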
/* ARGSUSED */
static boolean_t
flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l2info_t *l2 = &s->fs_l2info;
	flow_desc_t *fd = &flent->fe_flow_desc;

	return (l2->l2_vid == fd->fd_vid &&
	    bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
}

/*
 * Layer 2 hash function.
 * Must be paired with flow_l2_accept() within a set of flow_ops
 * because it assumes the dest address is already extracted.
 */
static uint32_t
flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t *l2 = &s->fs_l2info;

	return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
}

/*
 * This is the generic layer 2 accept function.
 * It makes use of mac_header_info() to extract the header length,
 * sap, vlan ID and destination address.
 */
static int
flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
{
	boolean_t is_ether;
	flow_l2info_t *l2 = &s->fs_l2info;
	mac_header_info_t mhi;
	int err;

	is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
	if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
	    s->fs_mp, &mhi)) != 0) {
		if (err == EINVAL)
			err = ENOBUFS;

		return (err);
	}

	l2->l2_start = s->fs_mp->b_rptr;
	l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;

	if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
		struct ether_vlan_header *evhp =
		    (struct ether_vlan_header *)l2->l2_start;

		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
			return (ENOBUFS);

		l2->l2_sap = ntohs(evhp->ether_type);
		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
		l2->l2_hdrsize = sizeof (*evhp);
	} else {
		l2->l2_sap = mhi.mhi_bindsap;
		l2->l2_vid = 0;
		l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
	}
	return (0);
}

/*
 * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
 * accept(). The notable difference is that dest address is now extracted
 * by hash() rather than by accept(). This saves a few memory references
 * for flow tables that do not care about mac addresses.
 */
static uint32_t
flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t *l2 = &s->fs_l2info;
	struct ether_vlan_header *evhp;

	evhp = (struct ether_vlan_header *)l2->l2_start;
	l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
	return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
}

/* ARGSUSED */
static int
flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t *l2 = &s->fs_l2info;
	struct ether_vlan_header *evhp;
	uint16_t sap;

	evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
	l2->l2_start = (uchar_t *)evhp;

	if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
		return (ENOBUFS);

	if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
			return (ENOBUFS);

		l2->l2_sap = ntohs(evhp->ether_type);
		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
		l2->l2_hdrsize = sizeof (struct ether_vlan_header);
	} else {
		l2->l2_sap = sap;
		l2->l2_vid = 0;
		l2->l2_hdrsize = sizeof (struct ether_header);
	}
	return (0);
}

/*
 * Validates a layer 2 flow entry.
 */
static int
flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	int i;
	flow_desc_t *fd = &flent->fe_flow_desc;

	/*
	 * Dest address is mandatory.
	 */
	if ((fd->fd_mask & FLOW_LINK_DST) == 0)
		return (EINVAL);

	for (i = 0; i < fd->fd_mac_len; i++) {
		if (fd->fd_dst_mac[i] != 0)
			break;
	}
	if (i == fd->fd_mac_len || fd->fd_mac_len < ETHERADDRL)
		return (EINVAL);

	if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
		/*
		 * VLAN flows are only supported over ethernet macs.
		 */
		if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
			return (EINVAL);

		if (fd->fd_vid == 0)
			return (EINVAL);
	}
	flent->fe_match = flow_l2_match;
	return (0);
}
/*
 * Calculates hash index of flow entry.
 */
static uint32_t
flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t *fd = &flent->fe_flow_desc;

	ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
	return (HASH_MAC_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
}

/*
 * This is used for duplicate flow checking.
 */
/* ARGSUSED */
static boolean_t
flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
	return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
	    fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
}

/*
 * Generic flow entry insertion function.
 * Used by flow tables that do not have ordering requirements.
 */
/* ARGSUSED */
static int
flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
    flow_entry_t *flent)
{
	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	if (*headp != NULL) {
		ASSERT(flent->fe_next == NULL);
		flent->fe_next = *headp;
	}
	*headp = flent;
	return (0);
}

/*
 * IP version independent DSField matching function.
 */
/* ARGSUSED */
static boolean_t
flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t *l3info = &s->fs_l3info;
	flow_desc_t *fd = &flent->fe_flow_desc;

	switch (l3info->l3_version) {
	case IPV4_VERSION: {
		ipha_t *ipha = (ipha_t *)l3info->l3_start;

		return ((ipha->ipha_type_of_service &
		    fd->fd_dsfield_mask) == fd->fd_dsfield);
	}
	case IPV6_VERSION: {
		ip6_t *ip6h = (ip6_t *)l3info->l3_start;

		return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
		    fd->fd_dsfield_mask) == fd->fd_dsfield);
	}
	default:
		return (B_FALSE);
	}
}
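/*
 * For instance, a descriptor with fd_dsfield 0xb8 and fd_dsfield_mask
 * 0xfc selects DSCP EF (46) traffic in either IP version: 46 << 2 is
 * 0xb8 in the ToS/traffic-class octet, and the 0xfc mask ignores the
 * two low-order ECN bits.
 */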
/*
 * IP v4 and v6 address matching.
 * The netmask only needs to be applied on the packet but not on the
 * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets.
 */

/* ARGSUSED */
static boolean_t
flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t *l3info = &s->fs_l3info;
	flow_desc_t *fd = &flent->fe_flow_desc;
	ipha_t *ipha = (ipha_t *)l3info->l3_start;
	in_addr_t addr;

	addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
		return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
		    V4_PART_OF_V6(fd->fd_local_addr));
	}
	return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
	    V4_PART_OF_V6(fd->fd_remote_addr));
}

/* ARGSUSED */
static boolean_t
flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t *l3info = &s->fs_l3info;
	flow_desc_t *fd = &flent->fe_flow_desc;
	ip6_t *ip6h = (ip6_t *)l3info->l3_start;
	in6_addr_t *addrp;

	addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
		return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
		    fd->fd_local_addr));
	}
	return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
}

/* ARGSUSED */
static boolean_t
flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t *l3info = &s->fs_l3info;
	flow_desc_t *fd = &flent->fe_flow_desc;

	return (l3info->l3_protocol == fd->fd_protocol);
}

static uint32_t
flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t *l3info = &s->fs_l3info;
	flow_mask_t mask = ft->ft_mask;

	if ((mask & FLOW_IP_LOCAL) != 0) {
		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
	} else if ((mask & FLOW_IP_REMOTE) != 0) {
		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
	} else if ((mask & FLOW_IP_DSFIELD) != 0) {
		/*
		 * DSField flents are arranged as a single list.
		 */
		return (0);
	}
	/*
	 * IP addr flents are hashed into two lists, v4 or v6.
	 */
	ASSERT(ft->ft_size >= 2);
	return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
}

static uint32_t
flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t *l3info = &s->fs_l3info;

	return (l3info->l3_protocol % ft->ft_size);
}
/* ARGSUSED */
static int
flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t *l2info = &s->fs_l2info;
	flow_l3info_t *l3info = &s->fs_l3info;
	uint16_t sap = l2info->l2_sap;
	uchar_t *l3_start;

	l3_start = l2info->l2_start + l2info->l2_hdrsize;

	/*
	 * Adjust start pointer if we're at the end of an mblk.
	 */
	CHECK_AND_ADJUST_START_PTR(s, l3_start);

	l3info->l3_start = l3_start;
	if (!OK_32PTR(l3_start))
		return (EINVAL);

	switch (sap) {
	case ETHERTYPE_IP: {
		ipha_t *ipha = (ipha_t *)l3_start;

		if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
			return (ENOBUFS);

		l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
		l3info->l3_protocol = ipha->ipha_protocol;
		l3info->l3_version = IPV4_VERSION;
		l3info->l3_fragmented =
		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
		break;
	}
	case ETHERTYPE_IPV6: {
		ip6_t *ip6h = (ip6_t *)l3_start;
		uint16_t ip6_hdrlen;
		uint8_t nexthdr;

		if (!mac_ip_hdr_length_v6(s->fs_mp, ip6h, &ip6_hdrlen,
		    &nexthdr)) {
			return (ENOBUFS);
		}
		l3info->l3_hdrsize = ip6_hdrlen;
		l3info->l3_protocol = nexthdr;
		l3info->l3_version = IPV6_VERSION;
		l3info->l3_fragmented = B_FALSE;
		break;
	}
	default:
		return (EINVAL);
	}
	return (0);
}

/* ARGSUSED */
static int
flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t *fd = &flent->fe_flow_desc;

	switch (fd->fd_protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		flent->fe_match = flow_ip_proto_match;
		return (0);
	default:
		return (EINVAL);
	}
}

/* ARGSUSED */
static int
flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t *fd = &flent->fe_flow_desc;
	flow_mask_t mask;
	uint8_t version;
	in6_addr_t *addr, *netmask;

	/*
	 * DSField does not require an IP version.
	 */
	if (fd->fd_mask == FLOW_IP_DSFIELD) {
		if (fd->fd_dsfield_mask == 0)
			return (EINVAL);

		flent->fe_match = flow_ip_dsfield_match;
		return (0);
	}

	/*
	 * IP addresses must come with a version to avoid ambiguity.
	 */
	if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
		return (EINVAL);

	version = fd->fd_ipversion;
	if (version != IPV4_VERSION && version != IPV6_VERSION)
		return (EINVAL);

	mask = fd->fd_mask & ~FLOW_IP_VERSION;
	switch (mask) {
	case FLOW_IP_LOCAL:
		addr = &fd->fd_local_addr;
		netmask = &fd->fd_local_netmask;
		break;
	case FLOW_IP_REMOTE:
		addr = &fd->fd_remote_addr;
		netmask = &fd->fd_remote_netmask;
		break;
	default:
		return (EINVAL);
	}

	/*
	 * Apply netmask onto specified address.
	 */
	V6_MASK_COPY(*addr, *netmask, *addr);
	if (version == IPV4_VERSION) {
		ipaddr_t v4addr = V4_PART_OF_V6((*addr));
		ipaddr_t v4mask = V4_PART_OF_V6((*netmask));

		if (v4addr == 0 || v4mask == 0)
			return (EINVAL);
		flent->fe_match = flow_ip_v4_match;
	} else {
		if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
		    IN6_IS_ADDR_UNSPECIFIED(netmask))
			return (EINVAL);
		flent->fe_match = flow_ip_v6_match;
	}
	return (0);
}
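/*
 * Because V6_MASK_COPY() above premasks the descriptor in place (a
 * local address of, say, 192.168.5.7/24 is stored as 192.168.5.0),
 * the per-packet match functions only need to mask the packet's
 * address before comparing.
 */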
static uint32_t
flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t *fd = &flent->fe_flow_desc;

	return (fd->fd_protocol % ft->ft_size);
}

static uint32_t
flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t *fd = &flent->fe_flow_desc;

	/*
	 * DSField flents are arranged as a single list.
	 */
	if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
		return (0);

	/*
	 * IP addr flents are hashed into two lists, v4 or v6.
	 */
	ASSERT(ft->ft_size >= 2);
	return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
}

/* ARGSUSED */
static boolean_t
flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	return (fd1->fd_protocol == fd2->fd_protocol);
}

/* ARGSUSED */
static boolean_t
flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
	in6_addr_t *a1, *m1, *a2, *m2;

	ASSERT(fd1->fd_mask == fd2->fd_mask);
	if (fd1->fd_mask == FLOW_IP_DSFIELD) {
		return (fd1->fd_dsfield == fd2->fd_dsfield &&
		    fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
	}

	/*
	 * flow_ip_accept_fe() already validated the version.
	 */
	ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
	if (fd1->fd_ipversion != fd2->fd_ipversion)
		return (B_FALSE);

	switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
	case FLOW_IP_LOCAL:
		a1 = &fd1->fd_local_addr;
		m1 = &fd1->fd_local_netmask;
		a2 = &fd2->fd_local_addr;
		m2 = &fd2->fd_local_netmask;
		break;
	case FLOW_IP_REMOTE:
		a1 = &fd1->fd_remote_addr;
		m1 = &fd1->fd_remote_netmask;
		a2 = &fd2->fd_remote_addr;
		m2 = &fd2->fd_remote_netmask;
		break;
	default:
		/*
		 * This is unreachable given the checks in
		 * flow_ip_accept_fe().
		 */
		return (B_FALSE);
	}

	if (fd1->fd_ipversion == IPV4_VERSION) {
		return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
		    V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));
	} else {
		return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
		    IN6_ARE_ADDR_EQUAL(m1, m2));
	}
}

static int
flow_ip_mask2plen(in6_addr_t *v6mask)
{
	int bits;
	int plen = IPV6_ABITS;
	int i;

	for (i = 3; i >= 0; i--) {
		if (v6mask->s6_addr32[i] == 0) {
			plen -= 32;
			continue;
		}
		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
		if (bits == 0)
			break;
		plen -= bits;
	}
	return (plen);
}
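/*
 * Worked example: an IPv4 /24 netmask lives in s6_addr32[3] as
 * 0xffffff00 with the other three words zero, so the loop subtracts
 * the 8 trailing zero bits of word 3 plus 32 for each zero word:
 * 128 - 8 - 96 = 24. An IPv6 /64 mask yields 128 - 32 - 32 = 64.
 */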
/* ARGSUSED */
static int
flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
    flow_entry_t *flent)
{
	flow_entry_t	**p = headp;
	flow_desc_t	*fd0, *fd;
	in6_addr_t	*m0, *m;
	int		plen0, plen;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * No special ordering needed for dsfield.
	 */
	fd0 = &flent->fe_flow_desc;
	if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
		if (*p != NULL) {
			ASSERT(flent->fe_next == NULL);
			flent->fe_next = *p;
		}
		*p = flent;
		return (0);
	}

	/*
	 * IP address flows are arranged in descending prefix length order.
	 */
	m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
	    &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
	plen0 = flow_ip_mask2plen(m0);
	ASSERT(plen0 != 0);

	for (; *p != NULL; p = &(*p)->fe_next) {
		fd = &(*p)->fe_flow_desc;

		/*
		 * Normally a dsfield flent shouldn't end up on the same
		 * list as an IP address flent because flow tables are
		 * (for now) disjoint. If we decide to support both IP
		 * and dsfield in the same table in the future, this
		 * check will allow for that.
		 */
		if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
			continue;

		/*
		 * We also allow for the mixing of local and remote address
		 * flents within one list.
		 */
		m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
		    &fd->fd_local_netmask : &fd->fd_remote_netmask;
		plen = flow_ip_mask2plen(m);

		if (plen <= plen0)
			break;
	}
	if (*p != NULL) {
		ASSERT(flent->fe_next == NULL);
		flent->fe_next = *p;
	}
	*p = flent;
	return (0);
}
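
/*
 * Sketch of the resulting list order (illustration only): inserting
 * flows for 10.1.2.0/24, 10.0.0.0/8 and 10.1.2.3/32, in any order,
 * always leaves the list as
 *
 *	/32 (10.1.2.3) -> /24 (10.1.2.0) -> /8 (10.0.0.0)
 *
 * so a linear walk of the list tests the most specific flow first,
 * giving longest-prefix-match semantics at classification time.
 */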
/*
 * Transport layer protocol and port matching functions.
 */

/* ARGSUSED */
static boolean_t
flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (fd->fd_protocol == l3info->l3_protocol &&
	    fd->fd_local_port == l4info->l4_hash_port);
}

/* ARGSUSED */
static boolean_t
flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (fd->fd_protocol == l3info->l3_protocol &&
	    fd->fd_remote_port == l4info->l4_hash_port);
}

/*
 * Transport hash function.
 * Since we support only local-port or remote-port flows, never both at
 * once, we only need to extract the one relevant port for matching: for
 * a local-port table that is the destination port of an inbound packet
 * or the source port of an outbound one, and the reverse for a
 * remote-port table.
 */
static uint32_t
flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	uint8_t		proto = l3info->l3_protocol;
	boolean_t	dst_or_src;

	if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
		dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
	} else {
		dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
	}

	l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
	    l4info->l4_src_port;

	return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
}
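
/*
 * Worked example (illustration only, assuming a local-port table with
 * ft_size 1024): for an inbound TCP packet to port 80,
 *
 *	l4_hash_port = l4_dst_port = htons(80);
 *	index = (htons(80) ^ (IPPROTO_TCP << 4)) % 1024;
 *
 * flow_transport_hash_fe() below computes the same index from the flow
 * descriptor (ports are kept in network byte order in both places), so
 * a flow created for local TCP port 80 lands in the same bucket that
 * inbound port-80 packets hash to.
 */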
/*
 * Unlike the other accept() functions above, we do not need to get the
 * header size because this is our highest layer so far. If we want to
 * support other higher-layer protocols, we would need to save the
 * l4_hdrsize in the code below.
 */

/* ARGSUSED */
static int
flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	uint8_t		proto = l3info->l3_protocol;
	uchar_t		*l4_start;

	l4_start = l3info->l3_start + l3info->l3_hdrsize;

	/*
	 * Adjust the start pointer if we're at the end of an mblk.
	 */
	CHECK_AND_ADJUST_START_PTR(s, l4_start);

	l4info->l4_start = l4_start;
	if (!OK_32PTR(l4_start))
		return (EINVAL);

	if (l3info->l3_fragmented == B_TRUE)
		return (EINVAL);

	switch (proto) {
	case IPPROTO_TCP: {
		struct tcphdr	*tcph = (struct tcphdr *)l4_start;

		if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
			return (ENOBUFS);

		l4info->l4_src_port = tcph->th_sport;
		l4info->l4_dst_port = tcph->th_dport;
		break;
	}
	case IPPROTO_UDP: {
		struct udphdr	*udph = (struct udphdr *)l4_start;

		if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
			return (ENOBUFS);

		l4info->l4_src_port = udph->uh_sport;
		l4info->l4_dst_port = udph->uh_dport;
		break;
	}
	case IPPROTO_SCTP: {
		sctp_hdr_t	*sctph = (sctp_hdr_t *)l4_start;

		if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
			return (ENOBUFS);

		l4info->l4_src_port = sctph->sh_sport;
		l4info->l4_dst_port = sctph->sh_dport;
		break;
	}
	default:
		return (EINVAL);
	}

	return (0);
}

/*
 * Validates a transport flow entry.
 * The protocol field must be present.
 */

/* ARGSUSED */
static int
flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;
	flow_mask_t	mask = fd->fd_mask;

	if ((mask & FLOW_IP_PROTOCOL) == 0)
		return (EINVAL);

	switch (fd->fd_protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
		break;
	default:
		return (EINVAL);
	}

	switch (mask & ~FLOW_IP_PROTOCOL) {
	case FLOW_ULP_PORT_LOCAL:
		if (fd->fd_local_port == 0)
			return (EINVAL);

		flent->fe_match = flow_transport_lport_match;
		break;
	case FLOW_ULP_PORT_REMOTE:
		if (fd->fd_remote_port == 0)
			return (EINVAL);

		flent->fe_match = flow_transport_rport_match;
		break;
	case 0:
		/*
		 * Transport-only flows conflict with our table type.
		 */
		return (EOPNOTSUPP);
	default:
		return (EINVAL);
	}

	return (0);
}

static uint32_t
flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;
	uint16_t	port = 0;

	port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
	    fd->fd_local_port : fd->fd_remote_port;

	return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
}

/* ARGSUSED */
static boolean_t
flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	if (fd1->fd_protocol != fd2->fd_protocol)
		return (B_FALSE);

	if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
		return (fd1->fd_local_port == fd2->fd_local_port);

	return (fd1->fd_remote_port == fd2->fd_remote_port);
}

static flow_ops_t flow_l2_ops = {
	flow_l2_accept_fe,
	flow_l2_hash_fe,
	flow_l2_match_fe,
	flow_generic_insert_fe,
	flow_l2_hash,
	{flow_l2_accept}
};

static flow_ops_t flow_ip_ops = {
	flow_ip_accept_fe,
	flow_ip_hash_fe,
	flow_ip_match_fe,
	flow_ip_insert_fe,
	flow_ip_hash,
	{flow_l2_accept, flow_ip_accept}
};

static flow_ops_t flow_ip_proto_ops = {
	flow_ip_proto_accept_fe,
	flow_ip_proto_hash_fe,
	flow_ip_proto_match_fe,
	flow_generic_insert_fe,
	flow_ip_proto_hash,
	{flow_l2_accept, flow_ip_accept}
};

static flow_ops_t flow_transport_ops = {
	flow_transport_accept_fe,
	flow_transport_hash_fe,
	flow_transport_match_fe,
	flow_generic_insert_fe,
	flow_transport_hash,
	{flow_l2_accept, flow_ip_accept, flow_transport_accept}
};

static flow_tab_info_t flow_tab_info_list[] = {
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
	{&flow_ip_ops, FLOW_IP_DSFIELD, 1},
	{&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024}
};

#define	FLOW_MAX_TAB_INFO \
	((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t))

static flow_tab_info_t *
mac_flow_tab_info_get(flow_mask_t mask)
{
	int	i;

	for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
		if (mask == flow_tab_info_list[i].fti_mask)
			return (&flow_tab_info_list[i]);
	}
	return (NULL);
}
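
/*
 * Usage sketch (illustration only): a caller creating a local IPv4
 * address flow would look up its table parameters with
 *
 *	flow_tab_info_t *fti =
 *	    mac_flow_tab_info_get(FLOW_IP_VERSION | FLOW_IP_LOCAL);
 *
 * which returns the flow_ip_ops entry with a 2-bucket table (one list
 * per IP version). A mask with no entry in flow_tab_info_list, e.g.
 * FLOW_IP_PROTOCOL | FLOW_ULP_PORT_REMOTE, returns NULL, i.e. it is
 * not a legal table type.
 */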