/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/strsun.h>
#include <sys/sdt.h>
#include <sys/mac.h>
#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/dls.h>
#include <sys/dls_impl.h>
#include <sys/mac_soft_ring.h>
#include <sys/ethernet.h>
#include <sys/vlan.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>

/* global flow table, will be a per exclusive-zone table later */
static mod_hash_t	*flow_hash;
static krwlock_t	flow_tab_lock;		/* protects flow_hash */

/* kmem caches for flow entries and flow tables */
static kmem_cache_t	*flow_cache;
static kmem_cache_t	*flow_tab_cache;
static flow_ops_t	flow_l2_ops;

/*
 * Describes one named kstat exported per flow: the kstat name and the
 * byte offset of the corresponding counter inside flow_stats_t.
 */
typedef struct {
	const char	*fs_name;
	uint_t		fs_offset;
} flow_stats_info_t;

#define	FS_OFF(f)	(offsetof(flow_stats_t, f))
static flow_stats_info_t flow_stats_list[] = {
	{"rbytes",	FS_OFF(fs_rbytes)},
	{"ipackets",	FS_OFF(fs_ipackets)},
	{"ierrors",	FS_OFF(fs_ierrors)},
	{"obytes",	FS_OFF(fs_obytes)},
	{"opackets",	FS_OFF(fs_opackets)},
	{"oerrors",	FS_OFF(fs_oerrors)}
};
#define	FS_SIZE		(sizeof (flow_stats_list) / sizeof (flow_stats_info_t))

/*
 * Checks whether a flow mask is legal: returns the matching
 * flow_tab_info_t for the mask, or NULL if the mask is unsupported.
 */
static flow_tab_info_t	*mac_flow_tab_info_get(flow_mask_t);

/*
 * Initialize the array of named kstats from flow_stats_list.
 * All flow counters are exported as 64-bit unsigned values.
 */
static void
flow_stat_init(kstat_named_t *knp)
{
	int	i;

	for (i = 0; i < FS_SIZE; i++, knp++) {
		kstat_named_init(knp, flow_stats_list[i].fs_name,
		    KSTAT_DATA_UINT64);
	}
}

/*
 * kstat update callback: snapshot the current counter values from the
 * flow entry into the kstat data area.  Writes are rejected.
 */
static int
flow_stat_update(kstat_t *ksp, int rw)
{
	flow_entry_t	*fep = ksp->ks_private;
	flow_stats_t	*fsp = &fep->fe_flowstats;
	kstat_named_t	*knp = ksp->ks_data;
	uint64_t	*statp;
	int		i;

	if (rw != KSTAT_READ)
		return (EACCES);

	for (i = 0; i < FS_SIZE; i++, knp++) {
		/* locate the counter via its byte offset in flow_stats_t */
		statp = (uint64_t *)
		    ((uchar_t *)fsp + flow_stats_list[i].fs_offset);

		knp->value.ui64 = *statp;
	}
	return (0);
}

/*
 * Create and install the per-flow kstats.  Failure to create the kstat
 * is not fatal: the flow simply exports no stats (fe_ksp stays NULL).
 */
static void
flow_stat_create(flow_entry_t *fep)
{
	kstat_t		*ksp;
	kstat_named_t	*knp;
	uint_t		nstats = FS_SIZE;

	/*
	 * For now, flow entries are only manipulated and visible from the
	 * global zone.
	 */
	ksp = kstat_create_zone("unix", 0, (char *)fep->fe_flow_name, "flow",
	    KSTAT_TYPE_NAMED, nstats, 0, GLOBAL_ZONEID);
	if (ksp == NULL)
		return;

	ksp->ks_update = flow_stat_update;
	ksp->ks_private = fep;
	fep->fe_ksp = ksp;

	knp = (kstat_named_t *)ksp->ks_data;
	flow_stat_init(knp);
	kstat_install(ksp);
}

/*
 * Tear down the per-flow kstats, if any were created.
 */
void
flow_stat_destroy(flow_entry_t *fep)
{
	if (fep->fe_ksp != NULL) {
		kstat_delete(fep->fe_ksp);
		fep->fe_ksp = NULL;
	}
}

/*
 * Initialize the flow table
 */
void
mac_flow_init()
{
	/* NULL constructor/destructor: entries are bzero'ed on allocation */
	flow_cache = kmem_cache_create("flow_entry_cache",
	    sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	flow_tab_cache = kmem_cache_create("flow_tab_cache",
	    sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	/* flows are keyed by their (string) name */
	flow_hash = mod_hash_create_extended("flow_hash",
	    100, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
	rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
}

/*
 * Cleanup and release the flow table
 */
void
mac_flow_fini()
{
	kmem_cache_destroy(flow_cache);
	kmem_cache_destroy(flow_tab_cache);
	mod_hash_destroy_hash(flow_hash);
	rw_destroy(&flow_tab_lock);
}

/*
 * mac_create_flow(): create a flow_entry_t.
 */

/*
 * Allocate (if *flentp == NULL) and initialize a flow entry from the
 * given descriptor and resource properties.  On success the entry is
 * returned through *flentp.
 *
 * NOTE: when mrp != NULL this function also mutates the CALLER's mrp:
 * it forces MRP_PRIORITY into mrp_mask and overwrites mrp_priority with
 * the implicit default, so the effective resource list reflects it.
 */
int
mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
    void *client_cookie, uint_t type, flow_entry_t **flentp)
{
	flow_entry_t	*flent = *flentp;
	int		err = 0;

	if (mrp != NULL) {
		err = mac_validate_props(mrp);
		if (err != 0)
			return (err);
	}

	if (flent == NULL) {
		flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
		bzero(flent, sizeof (*flent));
		mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);

		/* Initialize the receiver function to a safe routine */
		flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
		/* -1 means "not in any flow table" (see mac_flow_remove()) */
		flent->fe_index = -1;
	}
	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);

	/* This is an initial flow, will be configured later */
	if (fd == NULL) {
		*flentp = flent;
		return (0);
	}

	flent->fe_client_cookie = client_cookie;
	flent->fe_type = type;

	/* Save flow desc */
	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));

	if (mrp != NULL) {
		/*
		 * We have already set fe_resource_props for a Link.
		 */
		if (type & FLOW_USER) {
			bcopy(mrp, &flent->fe_resource_props,
			    sizeof (mac_resource_props_t));
		}
		/*
		 * The effective resource list should reflect the priority
		 * that we set implicitly.
		 */
		if (!(mrp->mrp_mask & MRP_PRIORITY))
			mrp->mrp_mask |= MRP_PRIORITY;
		if (type & FLOW_USER)
			mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
		else
			mrp->mrp_priority = MPL_LINK_DEFAULT;
		bcopy(mrp, &flent->fe_effective_props,
		    sizeof (mac_resource_props_t));
	}
	flow_stat_create(flent);

	*flentp = flent;
	return (0);
}

/*
 * Validate flow entry and add it to a flow table.
 */
int
mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_entry_t	**headp, **p;
	flow_ops_t	*ops = &ft->ft_ops;
	flow_mask_t	mask;
	uint32_t	index;
	int		err;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * Check for invalid bits in mask.
	 */
	mask = flent->fe_flow_desc.fd_mask;
	if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
		return (EOPNOTSUPP);

	/*
	 * Validate flent.
	 */
	if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
		DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
		    flow_entry_t *, flent, int, err);
		return (err);
	}

	/*
	 * Flent is valid. now calculate hash and insert it
	 * into hash table.
	 */
	index = ops->fo_hash_fe(ft, flent);

	/*
	 * We do not need a lock up until now because we were
	 * not accessing the flow table.
	 */
	rw_enter(&ft->ft_lock, RW_WRITER);
	headp = &ft->ft_table[index];

	/*
	 * Check for duplicate flow.  Only entries with the same mask can
	 * possibly match the same traffic, so others are skipped.
	 */
	for (p = headp; *p != NULL; p = &(*p)->fe_next) {
		if ((*p)->fe_flow_desc.fd_mask !=
		    flent->fe_flow_desc.fd_mask)
			continue;

		if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
			rw_exit(&ft->ft_lock);
			DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
			    flow_entry_t *, flent, int, err);
			return (EALREADY);
		}
	}

	/*
	 * Insert flow to hash list.
	 */
	err = ops->fo_insert_fe(ft, headp, flent);
	if (err != 0) {
		rw_exit(&ft->ft_lock);
		DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
		    flow_entry_t *, flent, int, err);
		return (err);
	}

	/*
	 * Save the hash index so it can be used by mac_flow_remove().
	 */
	flent->fe_index = (int)index;

	/*
	 * Save the flow tab back reference.
	 */
	flent->fe_flow_tab = ft;
	FLOW_MARK(flent, FE_FLOW_TAB);
	ft->ft_flow_count++;
	rw_exit(&ft->ft_lock);
	return (0);
}

/*
 * Remove a flow from a mac client's subflow table
 */
void
mac_flow_rem_subflow(flow_entry_t *flent)
{
	flow_tab_t		*ft = flent->fe_flow_tab;
	mac_client_impl_t	*mcip = ft->ft_mcip;
	mac_handle_t		mh = (mac_handle_t)ft->ft_mip;

	ASSERT(MAC_PERIM_HELD(mh));

	mac_flow_remove(ft, flent, B_FALSE);
	if (flent->fe_mcip == NULL) {
		/*
		 * The interface is not yet plumbed and mac_client_flow_add
		 * was not done.
		 */
		if (FLOW_TAB_EMPTY(ft)) {
			mac_flow_tab_destroy(ft);
			mcip->mci_subflow_tab = NULL;
		}
	} else {
		/* quiesce driver upcalls, then tear down the SRSs */
		mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
		mac_link_flow_clean((mac_client_handle_t)mcip, flent);
	}
	/* undo the mac_fastpath_disable() done in mac_flow_add_subflow() */
	mac_fastpath_enable(mh);
}

/*
 * Add a flow to a mac client's subflow table and instantiate the flow
 * in the mac by creating the associated SRSs etc.
 */
int
mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
    boolean_t instantiate_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_handle_t		mh = (mac_handle_t)mcip->mci_mip;
	flow_tab_info_t		*ftinfo;
	flow_mask_t		mask;
	flow_tab_t		*ft;
	int			err;
	boolean_t		ft_created = B_FALSE;

	ASSERT(MAC_PERIM_HELD(mh));

	if ((err = mac_fastpath_disable(mh)) != 0)
		return (err);

	/*
	 * If the subflow table exists already just add the new subflow
	 * to the existing table, else we create a new subflow table below.
	 */
	ft = mcip->mci_subflow_tab;
	if (ft == NULL) {
		mask = flent->fe_flow_desc.fd_mask;
		/*
		 * Try to create a new table and then add the subflow to the
		 * newly created subflow table
		 */
		if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) {
			mac_fastpath_enable(mh);
			return (EOPNOTSUPP);
		}

		mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
		    mcip->mci_mip, &ft);
		ft_created = B_TRUE;
	}

	err = mac_flow_add(ft, flent);
	if (err != 0) {
		if (ft_created)
			mac_flow_tab_destroy(ft);
		mac_fastpath_enable(mh);
		return (err);
	}

	if (instantiate_flow) {
		/* Now activate the flow by creating its SRSs */
		ASSERT(MCIP_DATAPATH_SETUP(mcip));
		err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
		if (err != 0) {
			/* unwind: take the flow back out of the table */
			mac_flow_remove(ft, flent, B_FALSE);
			if (ft_created)
				mac_flow_tab_destroy(ft);
			mac_fastpath_enable(mh);
			return (err);
		}
	} else {
		FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	}
	if (ft_created) {
		ASSERT(mcip->mci_subflow_tab == NULL);
		ft->ft_mcip = mcip;
		mcip->mci_subflow_tab = ft;
		if (instantiate_flow)
			mac_client_update_classifier(mcip, B_TRUE);
	}
	return (0);
}

/*
 * Remove flow entry from flow table.
 */
void
mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
{
	flow_entry_t	**fp;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
	if (!(flent->fe_flags & FE_FLOW_TAB))
		return;

	rw_enter(&ft->ft_lock, RW_WRITER);
	/*
	 * If this is a permanent removal from the flow table, mark it
	 * CONDEMNED to prevent future references. If this is a temporary
	 * removal from the table, say to update the flow descriptor then
	 * we don't mark it CONDEMNED
	 */
	if (!temp)
		FLOW_MARK(flent, FE_CONDEMNED);
	/*
	 * Locate the specified flent.
	 */
	fp = &ft->ft_table[flent->fe_index];
	while (*fp != flent)
		fp = &(*fp)->fe_next;

	/*
	 * The flent must exist. Otherwise it's a bug.
	 * NOTE(review): fp itself can never be NULL here (it always holds
	 * the address of a fe_next field or a bucket head), so this ASSERT
	 * is vacuous; if flent were absent the walk above would dereference
	 * NULL first.  Confirm whether ASSERT(*fp != NULL) was intended.
	 */
	ASSERT(fp != NULL);
	*fp = flent->fe_next;
	flent->fe_next = NULL;

	/*
	 * Reset fe_index to -1 so any attempt to call mac_flow_remove()
	 * on a flent that is supposed to be in the table (FE_FLOW_TAB)
	 * will panic.
	 */
	flent->fe_index = -1;
	FLOW_UNMARK(flent, FE_FLOW_TAB);
	ft->ft_flow_count--;
	rw_exit(&ft->ft_lock);
}

/*
 * This is the flow lookup routine used by the mac sw classifier engine.
 */
int
mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
{
	flow_state_t	s;
	flow_entry_t	*flent;
	flow_ops_t	*ops = &ft->ft_ops;
	boolean_t	retried = B_FALSE;
	int		i, err;

	s.fs_flags = flags;
retry:
	s.fs_mp = mp;

	/*
	 * Walk the list of predeclared accept functions.
	 * Each of these would accumulate enough state to allow the next
	 * accept routine to make progress.
	 */
	for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
		if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
			mblk_t	*last;

			/*
			 * ENOBUFS indicates that the mp could be too short
			 * and may need a pullup.
			 */
			if (err != ENOBUFS || retried)
				return (err);

			/*
			 * The pullup is done on the last processed mblk, not
			 * the starting one. pullup is not done if the mblk
			 * has references or if b_cont is NULL.
			 */
			last = s.fs_mp;
			if (DB_REF(last) > 1 || last->b_cont == NULL ||
			    pullupmsg(last, -1) == 0)
				return (EINVAL);

			/* retry the whole accept chain exactly once */
			retried = B_TRUE;
			DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
			    flow_state_t *, &s);
			goto retry;
		}
	}

	/*
	 * The packet is considered sane. We may now attempt to
	 * find the corresponding flent.
	 */
	rw_enter(&ft->ft_lock, RW_READER);
	flent = ft->ft_table[ops->fo_hash(ft, &s)];
	for (; flent != NULL; flent = flent->fe_next) {
		if (flent->fe_match(ft, flent, &s)) {
			/* skip entries whose refhold fails (condemned) */
			FLOW_TRY_REFHOLD(flent, err);
			if (err != 0)
				continue;
			*flentp = flent;
			rw_exit(&ft->ft_lock);
			return (0);
		}
	}
	rw_exit(&ft->ft_lock);
	return (ENOENT);
}

/*
 * Walk flow table.
 * The caller is assumed to have proper perimeter protection.
 */
int
mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
    void *arg)
{
	int		err, i, cnt = 0;
	flow_entry_t	*flent;

	if (ft == NULL)
		return (0);

	for (i = 0; i < ft->ft_size; i++) {
		for (flent = ft->ft_table[i]; flent != NULL;
		    flent = flent->fe_next) {
			cnt++;
			err = (*fn)(flent, arg);
			if (err != 0)
				return (err);
		}
	}
	/* sanity check: the bucket chains must account for every flow */
	VERIFY(cnt == ft->ft_flow_count);
	return (0);
}

/*
 * Same as the above except the flow table's rwlock is taken as writer
 * for protection here.
 */
int
mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
    void *arg)
{
	int	err;

	if (ft == NULL)
		return (0);

	rw_enter(&ft->ft_lock, RW_WRITER);
	err = mac_flow_walk_nolock(ft, fn, arg);
	rw_exit(&ft->ft_lock);
	return (err);
}

static boolean_t mac_flow_clean(flow_entry_t *);

/*
 * Destroy a flow entry. Called when the last reference on a flow is released.
 */
void
mac_flow_destroy(flow_entry_t *flent)
{
	ASSERT(flent->fe_refcnt == 0);

	if ((flent->fe_type & FLOW_USER) != 0) {
		/*
		 * NOTE(review): mac_flow_clean() only ASSERTs and returns
		 * B_TRUE, and this whole call compiles out on non-DEBUG
		 * builds, so no cleanup work is lost there.
		 */
		ASSERT(mac_flow_clean(flent));
	} else {
		mac_flow_cleanup(flent);
	}

	mutex_destroy(&flent->fe_lock);
	cv_destroy(&flent->fe_cv);
	flow_stat_destroy(flent);
	kmem_cache_free(flow_cache, flent);
}

/*
 * XXX eric
 * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
 * mac_link_flow_modify() should really be moved/reworked into the
 * two functions below. This would consolidate all the mac property
 * checking in one place. I'm leaving this alone for now since it's
 * out of scope of the new flows work.
 */
/* ARGSUSED */
uint32_t
mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
{
	uint32_t		changed_mask = 0;
	mac_resource_props_t	*fmrp = &flent->fe_effective_props;
	int			i;

	/* bandwidth limit: a RESET value clears the property entirely */
	if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
	    (fmrp->mrp_maxbw != mrp->mrp_maxbw)) {
		changed_mask |= MRP_MAXBW;
		fmrp->mrp_maxbw = mrp->mrp_maxbw;
		if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
			fmrp->mrp_mask &= ~MRP_MAXBW;
		} else {
			fmrp->mrp_mask |= MRP_MAXBW;
		}
	}

	/* priority: MPL_RESET restores the subflow default */
	if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
		if (fmrp->mrp_priority != mrp->mrp_priority)
			changed_mask |= MRP_PRIORITY;
		if (mrp->mrp_priority == MPL_RESET) {
			fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
			fmrp->mrp_mask &= ~MRP_PRIORITY;
		} else {
			fmrp->mrp_priority = mrp->mrp_priority;
			fmrp->mrp_mask |= MRP_PRIORITY;
		}
	}

	/* modify fanout */
	if ((mrp->mrp_mask & MRP_CPUS) != 0) {
		if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
		    (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
			for (i = 0; i < mrp->mrp_ncpus; i++) {
				if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
					break;
			}
			if (i == mrp->mrp_ncpus) {
				/*
				 * The new set of cpus passed is exactly
				 * the same as the existing set.
				 */
				return (changed_mask);
			}
		}
		changed_mask |= MRP_CPUS;
		MAC_COPY_CPUS(mrp, fmrp);
	}
	return (changed_mask);
}

/*
 * Apply modified resource properties to an active flow: update the
 * cached effective properties, then push the changes into the SRSs.
 */
void
mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
{
	uint32_t		changed_mask;
	mac_client_impl_t	*mcip = flent->fe_mcip;
	mac_resource_props_t	*mcip_mrp = MCIP_RESOURCE_PROPS(mcip);

	/*
	 * NOTE(review): flent (and flent->fe_mcip) is already dereferenced
	 * in the initializers above, so this ASSERT fires too late to catch
	 * a NULL flent.
	 */
	ASSERT(flent != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	rw_enter(&ft->ft_lock, RW_WRITER);

	/* Update the cached values inside the subflow entry */
	changed_mask = mac_flow_modify_props(flent, mrp);
	rw_exit(&ft->ft_lock);
	/*
	 * Push the changed parameters to the scheduling code in the
	 * SRS's, to take effect right away.
	 */
	if (changed_mask & MRP_MAXBW) {
		mac_srs_update_bwlimit(flent, mrp);
		/*
		 * If bandwidth is changed, we may have to change
		 * the number of soft ring to be used for fanout.
		 * Call mac_flow_update_fanout() if MAC_BIND_CPU
		 * is not set and there is no user supplied cpu
		 * info. This applies only to link at this time.
		 */
		if (!(flent->fe_type & FLOW_USER) &&
		    !(changed_mask & MRP_CPUS) &&
		    !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
			mac_fanout_setup(mcip, flent, mcip_mrp,
			    mac_rx_deliver, mcip, NULL);
		}
	}
	if (mrp->mrp_mask & MRP_PRIORITY)
		mac_flow_update_priority(mcip, flent);

	if (changed_mask & MRP_CPUS)
		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL);
}

/*
 * This function waits for a certain condition to be met and is generally
 * used before a destructive or quiescing operation.
709 */ 710 void 711 mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event) 712 { 713 mutex_enter(&flent->fe_lock); 714 flent->fe_flags |= FE_WAITER; 715 716 switch (event) { 717 case FLOW_DRIVER_UPCALL: 718 /* 719 * We want to make sure the driver upcalls have finished before 720 * we signal the Rx SRS worker to quit. 721 */ 722 while (flent->fe_refcnt != 1) 723 cv_wait(&flent->fe_cv, &flent->fe_lock); 724 break; 725 726 case FLOW_USER_REF: 727 /* 728 * Wait for the fe_user_refcnt to drop to 0. The flow has 729 * been removed from the global flow hash. 730 */ 731 ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH)); 732 while (flent->fe_user_refcnt != 0) 733 cv_wait(&flent->fe_cv, &flent->fe_lock); 734 break; 735 736 default: 737 ASSERT(0); 738 } 739 740 flent->fe_flags &= ~FE_WAITER; 741 mutex_exit(&flent->fe_lock); 742 } 743 744 static boolean_t 745 mac_flow_clean(flow_entry_t *flent) 746 { 747 ASSERT(flent->fe_next == NULL); 748 ASSERT(flent->fe_tx_srs == NULL); 749 ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL); 750 ASSERT(flent->fe_mbg == NULL); 751 752 return (B_TRUE); 753 } 754 755 void 756 mac_flow_cleanup(flow_entry_t *flent) 757 { 758 if ((flent->fe_type & FLOW_USER) == 0) { 759 ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) || 760 (flent->fe_mbg != NULL && flent->fe_mcip == NULL)); 761 ASSERT(flent->fe_refcnt == 0); 762 } else { 763 ASSERT(flent->fe_refcnt == 1); 764 } 765 766 if (flent->fe_mbg != NULL) { 767 ASSERT(flent->fe_tx_srs == NULL); 768 /* This is a multicast or broadcast flow entry */ 769 mac_bcast_grp_free(flent->fe_mbg); 770 flent->fe_mbg = NULL; 771 } 772 773 if (flent->fe_tx_srs != NULL) { 774 ASSERT(flent->fe_mbg == NULL); 775 mac_srs_free(flent->fe_tx_srs); 776 flent->fe_tx_srs = NULL; 777 } 778 779 /* 780 * In the normal case fe_rx_srs_cnt is 1. However in the error case 781 * when mac_unicast_add fails we may not have set up any SRS 782 * in which case fe_rx_srs_cnt will be zero. 
783 */ 784 if (flent->fe_rx_srs_cnt != 0) { 785 ASSERT(flent->fe_rx_srs_cnt == 1); 786 mac_srs_free(flent->fe_rx_srs[0]); 787 flent->fe_rx_srs[0] = NULL; 788 flent->fe_rx_srs_cnt = 0; 789 } 790 ASSERT(flent->fe_rx_srs[0] == NULL); 791 } 792 793 void 794 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd) 795 { 796 /* 797 * Grab the fe_lock to see a self-consistent fe_flow_desc. 798 * Updates to the fe_flow_desc happen under the fe_lock 799 * after removing the flent from the flow table 800 */ 801 mutex_enter(&flent->fe_lock); 802 bcopy(&flent->fe_flow_desc, fd, sizeof (*fd)); 803 mutex_exit(&flent->fe_lock); 804 } 805 806 /* 807 * Update a field of a flow entry. The mac perimeter ensures that 808 * this is the only thread doing a modify operation on this mac end point. 809 * So the flow table can't change or disappear. The ft_lock protects access 810 * to the flow entry, and holding the lock ensures that there isn't any thread 811 * accessing the flow entry or attempting a flow table lookup. However 812 * data threads that are using the flow entry based on the old descriptor 813 * will continue to use the flow entry. If strong coherence is required 814 * then the flow will have to be quiesced before the descriptor can be 815 * changed. 816 */ 817 void 818 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd) 819 { 820 flow_tab_t *ft = flent->fe_flow_tab; 821 flow_desc_t old_desc; 822 int err; 823 824 if (ft == NULL) { 825 /* 826 * The flow hasn't yet been inserted into the table, 827 * so only the caller knows about this flow, however for 828 * uniformity we grab the fe_lock here. 829 */ 830 mutex_enter(&flent->fe_lock); 831 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); 832 mutex_exit(&flent->fe_lock); 833 } 834 835 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 836 837 /* 838 * Need to remove the flow entry from the table and reinsert it, 839 * into a potentially diference hash line. The hash depends on 840 * the new descriptor fields. 
However access to fe_desc itself 841 * is always under the fe_lock. This helps log and stat functions 842 * see a self-consistent fe_flow_desc. 843 */ 844 mac_flow_remove(ft, flent, B_TRUE); 845 old_desc = flent->fe_flow_desc; 846 847 mutex_enter(&flent->fe_lock); 848 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); 849 mutex_exit(&flent->fe_lock); 850 851 if (mac_flow_add(ft, flent) != 0) { 852 /* 853 * The add failed say due to an invalid flow descriptor. 854 * Undo the update 855 */ 856 flent->fe_flow_desc = old_desc; 857 err = mac_flow_add(ft, flent); 858 ASSERT(err == 0); 859 } 860 } 861 862 void 863 mac_flow_set_name(flow_entry_t *flent, const char *name) 864 { 865 flow_tab_t *ft = flent->fe_flow_tab; 866 867 if (ft == NULL) { 868 /* 869 * The flow hasn't yet been inserted into the table, 870 * so only the caller knows about this flow 871 */ 872 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); 873 } else { 874 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 875 } 876 877 mutex_enter(&flent->fe_lock); 878 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); 879 mutex_exit(&flent->fe_lock); 880 } 881 882 /* 883 * Return the client-private cookie that was associated with 884 * the flow when it was created. 885 */ 886 void * 887 mac_flow_get_client_cookie(flow_entry_t *flent) 888 { 889 return (flent->fe_client_cookie); 890 } 891 892 /* 893 * Forward declarations. 894 */ 895 static uint32_t flow_l2_hash(flow_tab_t *, flow_state_t *); 896 static uint32_t flow_l2_hash_fe(flow_tab_t *, flow_entry_t *); 897 static int flow_l2_accept(flow_tab_t *, flow_state_t *); 898 static uint32_t flow_ether_hash(flow_tab_t *, flow_state_t *); 899 static uint32_t flow_ether_hash_fe(flow_tab_t *, flow_entry_t *); 900 static int flow_ether_accept(flow_tab_t *, flow_state_t *); 901 902 /* 903 * Create flow table. 
904 */ 905 void 906 mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size, 907 mac_impl_t *mip, flow_tab_t **ftp) 908 { 909 flow_tab_t *ft; 910 flow_ops_t *new_ops; 911 912 ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP); 913 bzero(ft, sizeof (*ft)); 914 915 ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP); 916 917 /* 918 * We make a copy of the ops vector instead of just pointing to it 919 * because we might want to customize the ops vector on a per table 920 * basis (e.g. for optimization). 921 */ 922 new_ops = &ft->ft_ops; 923 bcopy(ops, new_ops, sizeof (*ops)); 924 ft->ft_mask = mask; 925 ft->ft_size = size; 926 ft->ft_mip = mip; 927 928 /* 929 * Optimizations for DL_ETHER media. 930 */ 931 if (mip->mi_info.mi_nativemedia == DL_ETHER) { 932 if (new_ops->fo_hash == flow_l2_hash) 933 new_ops->fo_hash = flow_ether_hash; 934 if (new_ops->fo_hash_fe == flow_l2_hash_fe) 935 new_ops->fo_hash_fe = flow_ether_hash_fe; 936 if (new_ops->fo_accept[0] == flow_l2_accept) 937 new_ops->fo_accept[0] = flow_ether_accept; 938 } 939 *ftp = ft; 940 } 941 942 void 943 mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp) 944 { 945 mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID, 946 1024, mip, ftp); 947 } 948 949 /* 950 * Destroy flow table. 
951 */ 952 void 953 mac_flow_tab_destroy(flow_tab_t *ft) 954 { 955 if (ft == NULL) 956 return; 957 958 ASSERT(ft->ft_flow_count == 0); 959 kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *)); 960 bzero(ft, sizeof (*ft)); 961 kmem_cache_free(flow_tab_cache, ft); 962 } 963 964 /* 965 * Add a new flow entry to the global flow hash table 966 */ 967 int 968 mac_flow_hash_add(flow_entry_t *flent) 969 { 970 int err; 971 972 rw_enter(&flow_tab_lock, RW_WRITER); 973 err = mod_hash_insert(flow_hash, 974 (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent); 975 if (err != 0) { 976 rw_exit(&flow_tab_lock); 977 return (EEXIST); 978 } 979 /* Mark as inserted into the global flow hash table */ 980 FLOW_MARK(flent, FE_G_FLOW_HASH); 981 rw_exit(&flow_tab_lock); 982 return (err); 983 } 984 985 /* 986 * Remove a flow entry from the global flow hash table 987 */ 988 void 989 mac_flow_hash_remove(flow_entry_t *flent) 990 { 991 mod_hash_val_t val; 992 993 rw_enter(&flow_tab_lock, RW_WRITER); 994 VERIFY(mod_hash_remove(flow_hash, 995 (mod_hash_key_t)flent->fe_flow_name, &val) == 0); 996 997 /* Clear the mark that says inserted into the global flow hash table */ 998 FLOW_UNMARK(flent, FE_G_FLOW_HASH); 999 rw_exit(&flow_tab_lock); 1000 } 1001 1002 /* 1003 * Retrieve a flow entry from the global flow hash table. 1004 */ 1005 int 1006 mac_flow_lookup_byname(char *name, flow_entry_t **flentp) 1007 { 1008 int err; 1009 flow_entry_t *flent; 1010 1011 rw_enter(&flow_tab_lock, RW_READER); 1012 err = mod_hash_find(flow_hash, (mod_hash_key_t)name, 1013 (mod_hash_val_t *)&flent); 1014 if (err != 0) { 1015 rw_exit(&flow_tab_lock); 1016 return (ENOENT); 1017 } 1018 ASSERT(flent != NULL); 1019 FLOW_USER_REFHOLD(flent); 1020 rw_exit(&flow_tab_lock); 1021 1022 *flentp = flent; 1023 return (0); 1024 } 1025 1026 /* 1027 * Initialize or release mac client flows by walking the subflow table. 1028 * These are typically invoked during plumb/unplumb of links. 
 */

/*
 * Walker callback: set up the datapath for one subflow; on failure the
 * walk continues (the flow is merely left without a datapath).
 */
static int
mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
{
	mac_client_impl_t	*mcip = arg;

	if (mac_link_flow_init(arg, flent) != 0) {
		cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
		    flent->fe_flow_name, mcip->mci_name);
	} else {
		FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
	}
	return (0);
}

/*
 * Instantiate all subflows configured on a mac client (e.g. at plumb).
 */
void
mac_link_init_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_init_flows_cb, mcip);
	/*
	 * If mac client had subflow(s) configured before plumb, change
	 * function to mac_rx_srs_subflow_process and in case of hardware
	 * classification, disable polling.
	 */
	mac_client_update_classifier(mcip, B_TRUE);
}

/*
 * Returns B_TRUE if the client has at least one configured subflow.
 */
boolean_t
mac_link_has_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * Walker callback: quiesce one subflow and tear down its datapath.
 */
static int
mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
{
	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
	mac_link_flow_clean(arg, flent);
	return (0);
}

/*
 * Deactivate all subflows of a mac client (e.g. at unplumb).
 */
void
mac_link_release_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	/*
	 * Change the mci_flent callback back to mac_rx_srs_process()
	 * because flows are about to be deactivated.
	 */
	mac_client_update_classifier(mcip, B_FALSE);
	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_release_flows_cb, mcip);
}

/*
 * Rename a flow; the kstat is recreated so it appears under the new name.
 */
void
mac_rename_flow(flow_entry_t *fep, const char *new_name)
{
	mac_flow_set_name(fep, new_name);
	if (fep->fe_ksp != NULL) {
		flow_stat_destroy(fep);
		flow_stat_create(fep);
	}
}

/*
 * mac_link_flow_init()
 * Internal flow interface used for allocating SRSs and related
 * data structures. Not meant to be used by mac clients.
 */
int
mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_impl_t		*mip = mcip->mci_mip;
	int			err;

	ASSERT(mch != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
		return (err);

	/* non-NULL fe_mcip marks the subflow as fully initialized */
	sub_flow->fe_mcip = mcip;

	return (0);
}

/*
 * mac_link_flow_add()
 * Used by flowadm(1m) or kernel mac clients for creating flows.
 */
int
mac_link_flow_add(datalink_id_t linkid, char *flow_name,
    flow_desc_t *flow_desc, mac_resource_props_t *mrp)
{
	flow_entry_t		*flent = NULL;
	int			err;
	dls_dl_handle_t		dlh;
	dls_link_t		*dlp;
	boolean_t		link_held = B_FALSE;
	boolean_t		hash_added = B_FALSE;
	mac_perim_handle_t	mph;

	/* refuse a duplicate name up front */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err == 0) {
		FLOW_USER_REFRELE(flent);
		return (EEXIST);
	}

	/*
	 * First create a flow entry given the description provided
	 * by the caller.
	 */
	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
	    FLOW_USER | FLOW_OTHER, &flent);

	if (err != 0)
		return (err);

	/*
	 * We've got a local variable referencing this flow now, so we need
	 * to hold it. We'll release this flow before returning.
	 * All failures until we return will undo any action that may internally
	 * held the flow, so the last REFRELE will assure a clean freeing
	 * of resources.
	 */
	FLOW_REFHOLD(flent);

	flent->fe_link_id = linkid;
	FLOW_MARK(flent, FE_INCIPIENT);

	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0) {
		FLOW_FINAL_REFRELE(flent);
		return (err);
	}

	/*
	 * dls will eventually be merged with mac so it's ok
	 * to call dls' internal functions.
	 */
	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
	if (err != 0)
		goto bail;

	link_held = B_TRUE;

	/*
	 * Add the flow to the global flow table, this table will be per
	 * exclusive zone so each zone can have its own flow namespace.
	 * RFE 6625651 will fix this.
	 *
	 */
	if ((err = mac_flow_hash_add(flent)) != 0)
		goto bail;

	hash_added = B_TRUE;

	/*
	 * do not allow flows to be configured on an anchor VNIC
	 */
	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
		err = ENOTSUP;
		goto bail;
	}

	/*
	 * Add the subflow to the subflow table. Also instantiate the flow
	 * in the mac if there is an active user (we check if the MAC client's
	 * datapath has been setup).
	 */
	err = mac_flow_add_subflow(dlp->dl_mch, flent,
	    MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
	if (err != 0)
		goto bail;

	FLOW_UNMARK(flent, FE_INCIPIENT);
	dls_devnet_rele_link(dlh, dlp);
	mac_perim_exit(mph);
	return (0);

bail:
	/* unwind in reverse order of the steps above */
	if (hash_added)
		mac_flow_hash_remove(flent);

	if (link_held)
		dls_devnet_rele_link(dlh, dlp);

	/*
	 * Wait for any transient global flow hash refs to clear
	 * and then release the creation reference on the flow
	 */
	mac_flow_wait(flent, FLOW_USER_REF);
	FLOW_FINAL_REFRELE(flent);
	mac_perim_exit(mph);
	return (err);
}

/*
 * mac_link_flow_clean()
 * Internal flow interface used for freeing SRSs and related
 * data structures. Not meant to be used by mac clients.
 */
void
mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_impl_t		*mip = mcip->mci_mip;
	boolean_t		last_subflow;

	ASSERT(mch != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/*
	 * This sub flow entry may fail to be fully initialized by
	 * mac_link_flow_init(). If so, simply return.
	 */
	if (sub_flow->fe_mcip == NULL)
		return;

	last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
	/*
	 * Tear down the data path
	 */
	mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
	sub_flow->fe_mcip = NULL;

	/*
	 * Delete the SRSs associated with this subflow. If this is being
	 * driven by flowadm(1M) then the subflow will be deleted by
	 * dls_rem_flow. However if this is a result of the interface being
	 * unplumbed then the subflow itself won't be deleted.
	 */
	mac_flow_cleanup(sub_flow);

	/*
	 * If all the subflows are gone, re-enable some of the stuff
	 * we disabled when adding a subflow, polling etc.
1279 */ 1280 if (last_subflow) { 1281 /* 1282 * The subflow table itself is not protected by any locks or 1283 * refcnts. Hence quiesce the client upfront before clearing 1284 * mci_subflow_tab. 1285 */ 1286 mac_client_quiesce(mcip); 1287 mac_client_update_classifier(mcip, B_FALSE); 1288 mac_flow_tab_destroy(mcip->mci_subflow_tab); 1289 mcip->mci_subflow_tab = NULL; 1290 mac_client_restart(mcip); 1291 } 1292 } 1293 1294 /* 1295 * mac_link_flow_remove() 1296 * Used by flowadm(1m) or kernel mac clients for removing flows. 1297 */ 1298 int 1299 mac_link_flow_remove(char *flow_name) 1300 { 1301 flow_entry_t *flent; 1302 mac_perim_handle_t mph; 1303 int err; 1304 datalink_id_t linkid; 1305 1306 err = mac_flow_lookup_byname(flow_name, &flent); 1307 if (err != 0) 1308 return (err); 1309 1310 linkid = flent->fe_link_id; 1311 FLOW_USER_REFRELE(flent); 1312 1313 /* 1314 * The perim must be acquired before acquiring any other references 1315 * to maintain the lock and perimeter hierarchy. Please note the 1316 * FLOW_REFRELE above. 1317 */ 1318 err = mac_perim_enter_by_linkid(linkid, &mph); 1319 if (err != 0) 1320 return (err); 1321 1322 /* 1323 * Note the second lookup of the flow, because a concurrent thread 1324 * may have removed it already while we were waiting to enter the 1325 * link's perimeter. 1326 */ 1327 err = mac_flow_lookup_byname(flow_name, &flent); 1328 if (err != 0) { 1329 mac_perim_exit(mph); 1330 return (err); 1331 } 1332 FLOW_USER_REFRELE(flent); 1333 1334 /* 1335 * Remove the flow from the subflow table and deactivate the flow 1336 * by quiescing and removings its SRSs 1337 */ 1338 mac_flow_rem_subflow(flent); 1339 1340 /* 1341 * Finally, remove the flow from the global table. 
1342 */ 1343 mac_flow_hash_remove(flent); 1344 1345 /* 1346 * Wait for any transient global flow hash refs to clear 1347 * and then release the creation reference on the flow 1348 */ 1349 mac_flow_wait(flent, FLOW_USER_REF); 1350 FLOW_FINAL_REFRELE(flent); 1351 1352 mac_perim_exit(mph); 1353 1354 return (0); 1355 } 1356 1357 /* 1358 * mac_link_flow_modify() 1359 * Modifies the properties of a flow identified by its name. 1360 */ 1361 int 1362 mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp) 1363 { 1364 flow_entry_t *flent; 1365 mac_client_impl_t *mcip; 1366 int err = 0; 1367 mac_perim_handle_t mph; 1368 datalink_id_t linkid; 1369 flow_tab_t *flow_tab; 1370 1371 err = mac_validate_props(mrp); 1372 if (err != 0) 1373 return (err); 1374 1375 err = mac_flow_lookup_byname(flow_name, &flent); 1376 if (err != 0) 1377 return (err); 1378 1379 linkid = flent->fe_link_id; 1380 FLOW_USER_REFRELE(flent); 1381 1382 /* 1383 * The perim must be acquired before acquiring any other references 1384 * to maintain the lock and perimeter hierarchy. Please note the 1385 * FLOW_REFRELE above. 1386 */ 1387 err = mac_perim_enter_by_linkid(linkid, &mph); 1388 if (err != 0) 1389 return (err); 1390 1391 /* 1392 * Note the second lookup of the flow, because a concurrent thread 1393 * may have removed it already while we were waiting to enter the 1394 * link's perimeter. 1395 */ 1396 err = mac_flow_lookup_byname(flow_name, &flent); 1397 if (err != 0) { 1398 mac_perim_exit(mph); 1399 return (err); 1400 } 1401 FLOW_USER_REFRELE(flent); 1402 1403 /* 1404 * If this flow is attached to a MAC client, then pass the request 1405 * along to the client. 1406 * Otherwise, just update the cached values. 
1407 */ 1408 mcip = flent->fe_mcip; 1409 mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE); 1410 if (mcip != NULL) { 1411 if ((flow_tab = mcip->mci_subflow_tab) == NULL) { 1412 err = ENOENT; 1413 } else { 1414 mac_flow_modify(flow_tab, flent, mrp); 1415 } 1416 } else { 1417 (void) mac_flow_modify_props(flent, mrp); 1418 } 1419 1420 done: 1421 mac_perim_exit(mph); 1422 return (err); 1423 } 1424 1425 1426 /* 1427 * State structure and misc functions used by mac_link_flow_walk(). 1428 */ 1429 typedef struct { 1430 int (*ws_func)(mac_flowinfo_t *, void *); 1431 void *ws_arg; 1432 } flow_walk_state_t; 1433 1434 static void 1435 mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent) 1436 { 1437 (void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name, 1438 MAXFLOWNAMELEN); 1439 finfop->fi_link_id = flent->fe_link_id; 1440 finfop->fi_flow_desc = flent->fe_flow_desc; 1441 finfop->fi_resource_props = flent->fe_resource_props; 1442 } 1443 1444 static int 1445 mac_link_flow_walk_cb(flow_entry_t *flent, void *arg) 1446 { 1447 flow_walk_state_t *statep = arg; 1448 mac_flowinfo_t finfo; 1449 1450 mac_link_flowinfo_copy(&finfo, flent); 1451 return (statep->ws_func(&finfo, statep->ws_arg)); 1452 } 1453 1454 /* 1455 * mac_link_flow_walk() 1456 * Invokes callback 'func' for all flows belonging to the specified link. 
1457 */ 1458 int 1459 mac_link_flow_walk(datalink_id_t linkid, 1460 int (*func)(mac_flowinfo_t *, void *), void *arg) 1461 { 1462 mac_client_impl_t *mcip; 1463 mac_perim_handle_t mph; 1464 flow_walk_state_t state; 1465 dls_dl_handle_t dlh; 1466 dls_link_t *dlp; 1467 int err; 1468 1469 err = mac_perim_enter_by_linkid(linkid, &mph); 1470 if (err != 0) 1471 return (err); 1472 1473 err = dls_devnet_hold_link(linkid, &dlh, &dlp); 1474 if (err != 0) { 1475 mac_perim_exit(mph); 1476 return (err); 1477 } 1478 1479 mcip = (mac_client_impl_t *)dlp->dl_mch; 1480 state.ws_func = func; 1481 state.ws_arg = arg; 1482 1483 err = mac_flow_walk_nolock(mcip->mci_subflow_tab, 1484 mac_link_flow_walk_cb, &state); 1485 1486 dls_devnet_rele_link(dlh, dlp); 1487 mac_perim_exit(mph); 1488 return (err); 1489 } 1490 1491 /* 1492 * mac_link_flow_info() 1493 * Retrieves information about a specific flow. 1494 */ 1495 int 1496 mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo) 1497 { 1498 flow_entry_t *flent; 1499 int err; 1500 1501 err = mac_flow_lookup_byname(flow_name, &flent); 1502 if (err != 0) 1503 return (err); 1504 1505 mac_link_flowinfo_copy(finfo, flent); 1506 FLOW_USER_REFRELE(flent); 1507 return (0); 1508 } 1509 1510 /* 1511 * Hash function macro that takes an Ethernet address and VLAN id as input. 1512 */ 1513 #define HASH_ETHER_VID(a, v, s) \ 1514 ((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s)) 1515 1516 /* 1517 * Generic layer-2 address hashing function that takes an address and address 1518 * length as input. This is the DJB hash function. 
1519 */ 1520 static uint32_t 1521 flow_l2_addrhash(uint8_t *addr, size_t addrlen, size_t htsize) 1522 { 1523 uint32_t hash = 5381; 1524 size_t i; 1525 1526 for (i = 0; i < addrlen; i++) 1527 hash = ((hash << 5) + hash) + addr[i]; 1528 return (hash % htsize); 1529 } 1530 1531 #define PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end)) 1532 1533 #define CHECK_AND_ADJUST_START_PTR(s, start) { \ 1534 if ((s)->fs_mp->b_wptr == (start)) { \ 1535 mblk_t *next = (s)->fs_mp->b_cont; \ 1536 if (next == NULL) \ 1537 return (EINVAL); \ 1538 \ 1539 (s)->fs_mp = next; \ 1540 (start) = next->b_rptr; \ 1541 } \ 1542 } 1543 1544 /* ARGSUSED */ 1545 static boolean_t 1546 flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1547 { 1548 flow_l2info_t *l2 = &s->fs_l2info; 1549 flow_desc_t *fd = &flent->fe_flow_desc; 1550 1551 return (l2->l2_vid == fd->fd_vid && 1552 bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0); 1553 } 1554 1555 /* 1556 * Layer 2 hash function. 1557 * Must be paired with flow_l2_accept() within a set of flow_ops 1558 * because it assumes the dest address is already extracted. 1559 */ 1560 static uint32_t 1561 flow_l2_hash(flow_tab_t *ft, flow_state_t *s) 1562 { 1563 return (flow_l2_addrhash(s->fs_l2info.l2_daddr, 1564 ft->ft_mip->mi_type->mt_addr_length, ft->ft_size)); 1565 } 1566 1567 /* 1568 * This is the generic layer 2 accept function. 1569 * It makes use of mac_header_info() to extract the header length, 1570 * sap, vlan ID and destination address. 
1571 */ 1572 static int 1573 flow_l2_accept(flow_tab_t *ft, flow_state_t *s) 1574 { 1575 boolean_t is_ether; 1576 flow_l2info_t *l2 = &s->fs_l2info; 1577 mac_header_info_t mhi; 1578 int err; 1579 1580 is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER); 1581 if ((err = mac_header_info((mac_handle_t)ft->ft_mip, 1582 s->fs_mp, &mhi)) != 0) { 1583 if (err == EINVAL) 1584 err = ENOBUFS; 1585 1586 return (err); 1587 } 1588 1589 l2->l2_start = s->fs_mp->b_rptr; 1590 l2->l2_daddr = (uint8_t *)mhi.mhi_daddr; 1591 1592 if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN && 1593 ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) { 1594 struct ether_vlan_header *evhp = 1595 (struct ether_vlan_header *)l2->l2_start; 1596 1597 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp))) 1598 return (ENOBUFS); 1599 1600 l2->l2_sap = ntohs(evhp->ether_type); 1601 l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci)); 1602 l2->l2_hdrsize = sizeof (*evhp); 1603 } else { 1604 l2->l2_sap = mhi.mhi_bindsap; 1605 l2->l2_vid = 0; 1606 l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize; 1607 } 1608 return (0); 1609 } 1610 1611 /* 1612 * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/ 1613 * accept(). The notable difference is that dest address is now extracted 1614 * by hash() rather than by accept(). This saves a few memory references 1615 * for flow tables that do not care about mac addresses. 
1616 */ 1617 static uint32_t 1618 flow_ether_hash(flow_tab_t *ft, flow_state_t *s) 1619 { 1620 flow_l2info_t *l2 = &s->fs_l2info; 1621 struct ether_vlan_header *evhp; 1622 1623 evhp = (struct ether_vlan_header *)l2->l2_start; 1624 l2->l2_daddr = evhp->ether_dhost.ether_addr_octet; 1625 return (HASH_ETHER_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size)); 1626 } 1627 1628 static uint32_t 1629 flow_ether_hash_fe(flow_tab_t *ft, flow_entry_t *flent) 1630 { 1631 flow_desc_t *fd = &flent->fe_flow_desc; 1632 1633 ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0); 1634 return (HASH_ETHER_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size)); 1635 } 1636 1637 /* ARGSUSED */ 1638 static int 1639 flow_ether_accept(flow_tab_t *ft, flow_state_t *s) 1640 { 1641 flow_l2info_t *l2 = &s->fs_l2info; 1642 struct ether_vlan_header *evhp; 1643 uint16_t sap; 1644 1645 evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr; 1646 l2->l2_start = (uchar_t *)evhp; 1647 1648 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header))) 1649 return (ENOBUFS); 1650 1651 if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN && 1652 ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) { 1653 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp))) 1654 return (ENOBUFS); 1655 1656 l2->l2_sap = ntohs(evhp->ether_type); 1657 l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci)); 1658 l2->l2_hdrsize = sizeof (struct ether_vlan_header); 1659 } else { 1660 l2->l2_sap = sap; 1661 l2->l2_vid = 0; 1662 l2->l2_hdrsize = sizeof (struct ether_header); 1663 } 1664 return (0); 1665 } 1666 1667 /* 1668 * Validates a layer 2 flow entry. 1669 */ 1670 static int 1671 flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent) 1672 { 1673 flow_desc_t *fd = &flent->fe_flow_desc; 1674 1675 /* 1676 * Dest address is mandatory, and 0 length addresses are not yet 1677 * supported. 
1678 */ 1679 if ((fd->fd_mask & FLOW_LINK_DST) == 0 || fd->fd_mac_len == 0) 1680 return (EINVAL); 1681 1682 if ((fd->fd_mask & FLOW_LINK_VID) != 0) { 1683 /* 1684 * VLAN flows are only supported over ethernet macs. 1685 */ 1686 if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER) 1687 return (EINVAL); 1688 1689 if (fd->fd_vid == 0) 1690 return (EINVAL); 1691 1692 } 1693 flent->fe_match = flow_l2_match; 1694 return (0); 1695 } 1696 1697 /* 1698 * Calculates hash index of flow entry. 1699 */ 1700 static uint32_t 1701 flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent) 1702 { 1703 flow_desc_t *fd = &flent->fe_flow_desc; 1704 1705 ASSERT((fd->fd_mask & FLOW_LINK_VID) == 0 && fd->fd_vid == 0); 1706 return (flow_l2_addrhash(fd->fd_dst_mac, 1707 ft->ft_mip->mi_type->mt_addr_length, ft->ft_size)); 1708 } 1709 1710 /* 1711 * This is used for duplicate flow checking. 1712 */ 1713 /* ARGSUSED */ 1714 static boolean_t 1715 flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) 1716 { 1717 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; 1718 1719 ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0); 1720 return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac, 1721 fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid); 1722 } 1723 1724 /* 1725 * Generic flow entry insertion function. 1726 * Used by flow tables that do not have ordering requirements. 1727 */ 1728 /* ARGSUSED */ 1729 static int 1730 flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp, 1731 flow_entry_t *flent) 1732 { 1733 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 1734 1735 if (*headp != NULL) { 1736 ASSERT(flent->fe_next == NULL); 1737 flent->fe_next = *headp; 1738 } 1739 *headp = flent; 1740 return (0); 1741 } 1742 1743 /* 1744 * IP version independent DSField matching function. 
1745 */ 1746 /* ARGSUSED */ 1747 static boolean_t 1748 flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1749 { 1750 flow_l3info_t *l3info = &s->fs_l3info; 1751 flow_desc_t *fd = &flent->fe_flow_desc; 1752 1753 switch (l3info->l3_version) { 1754 case IPV4_VERSION: { 1755 ipha_t *ipha = (ipha_t *)l3info->l3_start; 1756 1757 return ((ipha->ipha_type_of_service & 1758 fd->fd_dsfield_mask) == fd->fd_dsfield); 1759 } 1760 case IPV6_VERSION: { 1761 ip6_t *ip6h = (ip6_t *)l3info->l3_start; 1762 1763 return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) & 1764 fd->fd_dsfield_mask) == fd->fd_dsfield); 1765 } 1766 default: 1767 return (B_FALSE); 1768 } 1769 } 1770 1771 /* 1772 * IP v4 and v6 address matching. 1773 * The netmask only needs to be applied on the packet but not on the 1774 * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets. 1775 */ 1776 1777 /* ARGSUSED */ 1778 static boolean_t 1779 flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1780 { 1781 flow_l3info_t *l3info = &s->fs_l3info; 1782 flow_desc_t *fd = &flent->fe_flow_desc; 1783 ipha_t *ipha = (ipha_t *)l3info->l3_start; 1784 in_addr_t addr; 1785 1786 addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src); 1787 if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) { 1788 return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) == 1789 V4_PART_OF_V6(fd->fd_local_addr)); 1790 } 1791 return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) == 1792 V4_PART_OF_V6(fd->fd_remote_addr)); 1793 } 1794 1795 /* ARGSUSED */ 1796 static boolean_t 1797 flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1798 { 1799 flow_l3info_t *l3info = &s->fs_l3info; 1800 flow_desc_t *fd = &flent->fe_flow_desc; 1801 ip6_t *ip6h = (ip6_t *)l3info->l3_start; 1802 in6_addr_t *addrp; 1803 1804 addrp = (l3info->l3_dst_or_src ? 
&ip6h->ip6_dst : &ip6h->ip6_src); 1805 if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) { 1806 return (V6_MASK_EQ(*addrp, fd->fd_local_netmask, 1807 fd->fd_local_addr)); 1808 } 1809 return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr)); 1810 } 1811 1812 /* ARGSUSED */ 1813 static boolean_t 1814 flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1815 { 1816 flow_l3info_t *l3info = &s->fs_l3info; 1817 flow_desc_t *fd = &flent->fe_flow_desc; 1818 1819 return (l3info->l3_protocol == fd->fd_protocol); 1820 } 1821 1822 static uint32_t 1823 flow_ip_hash(flow_tab_t *ft, flow_state_t *s) 1824 { 1825 flow_l3info_t *l3info = &s->fs_l3info; 1826 flow_mask_t mask = ft->ft_mask; 1827 1828 if ((mask & FLOW_IP_LOCAL) != 0) { 1829 l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0); 1830 } else if ((mask & FLOW_IP_REMOTE) != 0) { 1831 l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0); 1832 } else if ((mask & FLOW_IP_DSFIELD) != 0) { 1833 /* 1834 * DSField flents are arranged as a single list. 1835 */ 1836 return (0); 1837 } 1838 /* 1839 * IP addr flents are hashed into two lists, v4 or v6. 1840 */ 1841 ASSERT(ft->ft_size >= 2); 1842 return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1); 1843 } 1844 1845 static uint32_t 1846 flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s) 1847 { 1848 flow_l3info_t *l3info = &s->fs_l3info; 1849 1850 return (l3info->l3_protocol % ft->ft_size); 1851 } 1852 1853 /* ARGSUSED */ 1854 static int 1855 flow_ip_accept(flow_tab_t *ft, flow_state_t *s) 1856 { 1857 flow_l2info_t *l2info = &s->fs_l2info; 1858 flow_l3info_t *l3info = &s->fs_l3info; 1859 uint16_t sap = l2info->l2_sap; 1860 uchar_t *l3_start; 1861 1862 l3_start = l2info->l2_start + l2info->l2_hdrsize; 1863 1864 /* 1865 * Adjust start pointer if we're at the end of an mblk. 
1866 */ 1867 CHECK_AND_ADJUST_START_PTR(s, l3_start); 1868 1869 l3info->l3_start = l3_start; 1870 if (!OK_32PTR(l3_start)) 1871 return (EINVAL); 1872 1873 switch (sap) { 1874 case ETHERTYPE_IP: { 1875 ipha_t *ipha = (ipha_t *)l3_start; 1876 1877 if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH)) 1878 return (ENOBUFS); 1879 1880 l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha); 1881 l3info->l3_protocol = ipha->ipha_protocol; 1882 l3info->l3_version = IPV4_VERSION; 1883 l3info->l3_fragmented = 1884 IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags); 1885 break; 1886 } 1887 case ETHERTYPE_IPV6: { 1888 ip6_t *ip6h = (ip6_t *)l3_start; 1889 uint16_t ip6_hdrlen; 1890 uint8_t nexthdr; 1891 1892 if (!mac_ip_hdr_length_v6(s->fs_mp, ip6h, &ip6_hdrlen, 1893 &nexthdr)) { 1894 return (ENOBUFS); 1895 } 1896 l3info->l3_hdrsize = ip6_hdrlen; 1897 l3info->l3_protocol = nexthdr; 1898 l3info->l3_version = IPV6_VERSION; 1899 l3info->l3_fragmented = B_FALSE; 1900 break; 1901 } 1902 default: 1903 return (EINVAL); 1904 } 1905 return (0); 1906 } 1907 1908 /* ARGSUSED */ 1909 static int 1910 flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent) 1911 { 1912 flow_desc_t *fd = &flent->fe_flow_desc; 1913 1914 switch (fd->fd_protocol) { 1915 case IPPROTO_TCP: 1916 case IPPROTO_UDP: 1917 case IPPROTO_SCTP: 1918 case IPPROTO_ICMP: 1919 case IPPROTO_ICMPV6: 1920 flent->fe_match = flow_ip_proto_match; 1921 return (0); 1922 default: 1923 return (EINVAL); 1924 } 1925 } 1926 1927 /* ARGSUSED */ 1928 static int 1929 flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent) 1930 { 1931 flow_desc_t *fd = &flent->fe_flow_desc; 1932 flow_mask_t mask; 1933 uint8_t version; 1934 in6_addr_t *addr, *netmask; 1935 1936 /* 1937 * DSField does not require a IP version. 
1938 */ 1939 if (fd->fd_mask == FLOW_IP_DSFIELD) { 1940 if (fd->fd_dsfield_mask == 0) 1941 return (EINVAL); 1942 1943 flent->fe_match = flow_ip_dsfield_match; 1944 return (0); 1945 } 1946 1947 /* 1948 * IP addresses must come with a version to avoid ambiguity. 1949 */ 1950 if ((fd->fd_mask & FLOW_IP_VERSION) == 0) 1951 return (EINVAL); 1952 1953 version = fd->fd_ipversion; 1954 if (version != IPV4_VERSION && version != IPV6_VERSION) 1955 return (EINVAL); 1956 1957 mask = fd->fd_mask & ~FLOW_IP_VERSION; 1958 switch (mask) { 1959 case FLOW_IP_LOCAL: 1960 addr = &fd->fd_local_addr; 1961 netmask = &fd->fd_local_netmask; 1962 break; 1963 case FLOW_IP_REMOTE: 1964 addr = &fd->fd_remote_addr; 1965 netmask = &fd->fd_remote_netmask; 1966 break; 1967 default: 1968 return (EINVAL); 1969 } 1970 1971 /* 1972 * Apply netmask onto specified address. 1973 */ 1974 V6_MASK_COPY(*addr, *netmask, *addr); 1975 if (version == IPV4_VERSION) { 1976 ipaddr_t v4addr = V4_PART_OF_V6((*addr)); 1977 ipaddr_t v4mask = V4_PART_OF_V6((*netmask)); 1978 1979 if (v4addr == 0 || v4mask == 0) 1980 return (EINVAL); 1981 flent->fe_match = flow_ip_v4_match; 1982 } else { 1983 if (IN6_IS_ADDR_UNSPECIFIED(addr) || 1984 IN6_IS_ADDR_UNSPECIFIED(netmask)) 1985 return (EINVAL); 1986 flent->fe_match = flow_ip_v6_match; 1987 } 1988 return (0); 1989 } 1990 1991 static uint32_t 1992 flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent) 1993 { 1994 flow_desc_t *fd = &flent->fe_flow_desc; 1995 1996 return (fd->fd_protocol % ft->ft_size); 1997 } 1998 1999 static uint32_t 2000 flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent) 2001 { 2002 flow_desc_t *fd = &flent->fe_flow_desc; 2003 2004 /* 2005 * DSField flents are arranged as a single list. 2006 */ 2007 if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0) 2008 return (0); 2009 2010 /* 2011 * IP addr flents are hashed into two lists, v4 or v6. 2012 */ 2013 ASSERT(ft->ft_size >= 2); 2014 return ((fd->fd_ipversion == IPV4_VERSION) ? 
0 : 1); 2015 } 2016 2017 /* ARGSUSED */ 2018 static boolean_t 2019 flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) 2020 { 2021 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; 2022 2023 return (fd1->fd_protocol == fd2->fd_protocol); 2024 } 2025 2026 /* ARGSUSED */ 2027 static boolean_t 2028 flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) 2029 { 2030 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; 2031 in6_addr_t *a1, *m1, *a2, *m2; 2032 2033 ASSERT(fd1->fd_mask == fd2->fd_mask); 2034 if (fd1->fd_mask == FLOW_IP_DSFIELD) { 2035 return (fd1->fd_dsfield == fd2->fd_dsfield && 2036 fd1->fd_dsfield_mask == fd2->fd_dsfield_mask); 2037 } 2038 2039 /* 2040 * flow_ip_accept_fe() already validated the version. 2041 */ 2042 ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0); 2043 if (fd1->fd_ipversion != fd2->fd_ipversion) 2044 return (B_FALSE); 2045 2046 switch (fd1->fd_mask & ~FLOW_IP_VERSION) { 2047 case FLOW_IP_LOCAL: 2048 a1 = &fd1->fd_local_addr; 2049 m1 = &fd1->fd_local_netmask; 2050 a2 = &fd2->fd_local_addr; 2051 m2 = &fd2->fd_local_netmask; 2052 break; 2053 case FLOW_IP_REMOTE: 2054 a1 = &fd1->fd_remote_addr; 2055 m1 = &fd1->fd_remote_netmask; 2056 a2 = &fd2->fd_remote_addr; 2057 m2 = &fd2->fd_remote_netmask; 2058 break; 2059 default: 2060 /* 2061 * This is unreachable given the checks in 2062 * flow_ip_accept_fe(). 
2063 */ 2064 return (B_FALSE); 2065 } 2066 2067 if (fd1->fd_ipversion == IPV4_VERSION) { 2068 return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) && 2069 V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2))); 2070 2071 } else { 2072 return (IN6_ARE_ADDR_EQUAL(a1, a2) && 2073 IN6_ARE_ADDR_EQUAL(m1, m2)); 2074 } 2075 } 2076 2077 static int 2078 flow_ip_mask2plen(in6_addr_t *v6mask) 2079 { 2080 int bits; 2081 int plen = IPV6_ABITS; 2082 int i; 2083 2084 for (i = 3; i >= 0; i--) { 2085 if (v6mask->s6_addr32[i] == 0) { 2086 plen -= 32; 2087 continue; 2088 } 2089 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1; 2090 if (bits == 0) 2091 break; 2092 plen -= bits; 2093 } 2094 return (plen); 2095 } 2096 2097 /* ARGSUSED */ 2098 static int 2099 flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp, 2100 flow_entry_t *flent) 2101 { 2102 flow_entry_t **p = headp; 2103 flow_desc_t *fd0, *fd; 2104 in6_addr_t *m0, *m; 2105 int plen0, plen; 2106 2107 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 2108 2109 /* 2110 * No special ordering needed for dsfield. 2111 */ 2112 fd0 = &flent->fe_flow_desc; 2113 if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) { 2114 if (*p != NULL) { 2115 ASSERT(flent->fe_next == NULL); 2116 flent->fe_next = *p; 2117 } 2118 *p = flent; 2119 return (0); 2120 } 2121 2122 /* 2123 * IP address flows are arranged in descending prefix length order. 2124 */ 2125 m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ? 2126 &fd0->fd_local_netmask : &fd0->fd_remote_netmask; 2127 plen0 = flow_ip_mask2plen(m0); 2128 ASSERT(plen0 != 0); 2129 2130 for (; *p != NULL; p = &(*p)->fe_next) { 2131 fd = &(*p)->fe_flow_desc; 2132 2133 /* 2134 * Normally a dsfield flent shouldn't end up on the same 2135 * list as an IP address because flow tables are (for now) 2136 * disjoint. If we decide to support both IP and dsfield 2137 * in the same table in the future, this check will allow 2138 * for that. 
2139 */ 2140 if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0) 2141 continue; 2142 2143 /* 2144 * We also allow for the mixing of local and remote address 2145 * flents within one list. 2146 */ 2147 m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ? 2148 &fd->fd_local_netmask : &fd->fd_remote_netmask; 2149 plen = flow_ip_mask2plen(m); 2150 2151 if (plen <= plen0) 2152 break; 2153 } 2154 if (*p != NULL) { 2155 ASSERT(flent->fe_next == NULL); 2156 flent->fe_next = *p; 2157 } 2158 *p = flent; 2159 return (0); 2160 } 2161 2162 /* 2163 * Transport layer protocol and port matching functions. 2164 */ 2165 2166 /* ARGSUSED */ 2167 static boolean_t 2168 flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 2169 { 2170 flow_l3info_t *l3info = &s->fs_l3info; 2171 flow_l4info_t *l4info = &s->fs_l4info; 2172 flow_desc_t *fd = &flent->fe_flow_desc; 2173 2174 return (fd->fd_protocol == l3info->l3_protocol && 2175 fd->fd_local_port == l4info->l4_hash_port); 2176 } 2177 2178 /* ARGSUSED */ 2179 static boolean_t 2180 flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 2181 { 2182 flow_l3info_t *l3info = &s->fs_l3info; 2183 flow_l4info_t *l4info = &s->fs_l4info; 2184 flow_desc_t *fd = &flent->fe_flow_desc; 2185 2186 return (fd->fd_protocol == l3info->l3_protocol && 2187 fd->fd_remote_port == l4info->l4_hash_port); 2188 } 2189 2190 /* 2191 * Transport hash function. 2192 * Since we only support either local or remote port flows, 2193 * we only need to extract one of the ports to be used for 2194 * matching. 
2195 */ 2196 static uint32_t 2197 flow_transport_hash(flow_tab_t *ft, flow_state_t *s) 2198 { 2199 flow_l3info_t *l3info = &s->fs_l3info; 2200 flow_l4info_t *l4info = &s->fs_l4info; 2201 uint8_t proto = l3info->l3_protocol; 2202 boolean_t dst_or_src; 2203 2204 if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) { 2205 dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0); 2206 } else { 2207 dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0); 2208 } 2209 2210 l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port : 2211 l4info->l4_src_port; 2212 2213 return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size); 2214 } 2215 2216 /* 2217 * Unlike other accept() functions above, we do not need to get the header 2218 * size because this is our highest layer so far. If we want to do support 2219 * other higher layer protocols, we would need to save the l4_hdrsize 2220 * in the code below. 2221 */ 2222 2223 /* ARGSUSED */ 2224 static int 2225 flow_transport_accept(flow_tab_t *ft, flow_state_t *s) 2226 { 2227 flow_l3info_t *l3info = &s->fs_l3info; 2228 flow_l4info_t *l4info = &s->fs_l4info; 2229 uint8_t proto = l3info->l3_protocol; 2230 uchar_t *l4_start; 2231 2232 l4_start = l3info->l3_start + l3info->l3_hdrsize; 2233 2234 /* 2235 * Adjust start pointer if we're at the end of an mblk. 
2236 */ 2237 CHECK_AND_ADJUST_START_PTR(s, l4_start); 2238 2239 l4info->l4_start = l4_start; 2240 if (!OK_32PTR(l4_start)) 2241 return (EINVAL); 2242 2243 if (l3info->l3_fragmented == B_TRUE) 2244 return (EINVAL); 2245 2246 switch (proto) { 2247 case IPPROTO_TCP: { 2248 struct tcphdr *tcph = (struct tcphdr *)l4_start; 2249 2250 if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph))) 2251 return (ENOBUFS); 2252 2253 l4info->l4_src_port = tcph->th_sport; 2254 l4info->l4_dst_port = tcph->th_dport; 2255 break; 2256 } 2257 case IPPROTO_UDP: { 2258 struct udphdr *udph = (struct udphdr *)l4_start; 2259 2260 if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph))) 2261 return (ENOBUFS); 2262 2263 l4info->l4_src_port = udph->uh_sport; 2264 l4info->l4_dst_port = udph->uh_dport; 2265 break; 2266 } 2267 case IPPROTO_SCTP: { 2268 sctp_hdr_t *sctph = (sctp_hdr_t *)l4_start; 2269 2270 if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph))) 2271 return (ENOBUFS); 2272 2273 l4info->l4_src_port = sctph->sh_sport; 2274 l4info->l4_dst_port = sctph->sh_dport; 2275 break; 2276 } 2277 default: 2278 return (EINVAL); 2279 } 2280 2281 return (0); 2282 } 2283 2284 /* 2285 * Validates transport flow entry. 2286 * The protocol field must be present. 
2287 */ 2288 2289 /* ARGSUSED */ 2290 static int 2291 flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent) 2292 { 2293 flow_desc_t *fd = &flent->fe_flow_desc; 2294 flow_mask_t mask = fd->fd_mask; 2295 2296 if ((mask & FLOW_IP_PROTOCOL) == 0) 2297 return (EINVAL); 2298 2299 switch (fd->fd_protocol) { 2300 case IPPROTO_TCP: 2301 case IPPROTO_UDP: 2302 case IPPROTO_SCTP: 2303 break; 2304 default: 2305 return (EINVAL); 2306 } 2307 2308 switch (mask & ~FLOW_IP_PROTOCOL) { 2309 case FLOW_ULP_PORT_LOCAL: 2310 if (fd->fd_local_port == 0) 2311 return (EINVAL); 2312 2313 flent->fe_match = flow_transport_lport_match; 2314 break; 2315 case FLOW_ULP_PORT_REMOTE: 2316 if (fd->fd_remote_port == 0) 2317 return (EINVAL); 2318 2319 flent->fe_match = flow_transport_rport_match; 2320 break; 2321 case 0: 2322 /* 2323 * transport-only flows conflicts with our table type. 2324 */ 2325 return (EOPNOTSUPP); 2326 default: 2327 return (EINVAL); 2328 } 2329 2330 return (0); 2331 } 2332 2333 static uint32_t 2334 flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent) 2335 { 2336 flow_desc_t *fd = &flent->fe_flow_desc; 2337 uint16_t port = 0; 2338 2339 port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ? 
2340 fd->fd_local_port : fd->fd_remote_port; 2341 2342 return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size); 2343 } 2344 2345 /* ARGSUSED */ 2346 static boolean_t 2347 flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) 2348 { 2349 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; 2350 2351 if (fd1->fd_protocol != fd2->fd_protocol) 2352 return (B_FALSE); 2353 2354 if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) 2355 return (fd1->fd_local_port == fd2->fd_local_port); 2356 2357 if ((fd1->fd_mask & FLOW_ULP_PORT_REMOTE) != 0) 2358 return (fd1->fd_remote_port == fd2->fd_remote_port); 2359 2360 return (B_TRUE); 2361 } 2362 2363 static flow_ops_t flow_l2_ops = { 2364 flow_l2_accept_fe, 2365 flow_l2_hash_fe, 2366 flow_l2_match_fe, 2367 flow_generic_insert_fe, 2368 flow_l2_hash, 2369 {flow_l2_accept} 2370 }; 2371 2372 static flow_ops_t flow_ip_ops = { 2373 flow_ip_accept_fe, 2374 flow_ip_hash_fe, 2375 flow_ip_match_fe, 2376 flow_ip_insert_fe, 2377 flow_ip_hash, 2378 {flow_l2_accept, flow_ip_accept} 2379 }; 2380 2381 static flow_ops_t flow_ip_proto_ops = { 2382 flow_ip_proto_accept_fe, 2383 flow_ip_proto_hash_fe, 2384 flow_ip_proto_match_fe, 2385 flow_generic_insert_fe, 2386 flow_ip_proto_hash, 2387 {flow_l2_accept, flow_ip_accept} 2388 }; 2389 2390 static flow_ops_t flow_transport_ops = { 2391 flow_transport_accept_fe, 2392 flow_transport_hash_fe, 2393 flow_transport_match_fe, 2394 flow_generic_insert_fe, 2395 flow_transport_hash, 2396 {flow_l2_accept, flow_ip_accept, flow_transport_accept} 2397 }; 2398 2399 static flow_tab_info_t flow_tab_info_list[] = { 2400 {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2}, 2401 {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2}, 2402 {&flow_ip_ops, FLOW_IP_DSFIELD, 1}, 2403 {&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256}, 2404 {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024}, 2405 {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_REMOTE, 1024} 2406 }; 2407 2408 #define 
FLOW_MAX_TAB_INFO \ 2409 ((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t)) 2410 2411 static flow_tab_info_t * 2412 mac_flow_tab_info_get(flow_mask_t mask) 2413 { 2414 int i; 2415 2416 for (i = 0; i < FLOW_MAX_TAB_INFO; i++) { 2417 if (mask == flow_tab_info_list[i].fti_mask) 2418 return (&flow_tab_info_list[i]); 2419 } 2420 return (NULL); 2421 } 2422