/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/strsun.h>
#include <sys/sdt.h>
#include <sys/mac.h>
#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/dls.h>
#include <sys/dls_impl.h>
#include <sys/mac_soft_ring.h>
#include <sys/ethernet.h>
#include <sys/vlan.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>

/* global flow table, will be a per exclusive-zone table later */
static mod_hash_t	*flow_hash;
static krwlock_t	flow_tab_lock;

static kmem_cache_t	*flow_cache;
static kmem_cache_t	*flow_tab_cache;
static flow_ops_t	flow_l2_ops;

/* Maps a flow kstat name to its counter's offset within flow_stats_t. */
typedef struct {
	const char	*fs_name;
	uint_t		fs_offset;
} flow_stats_info_t;

#define	FS_OFF(f)	(offsetof(flow_stats_t, f))
static flow_stats_info_t flow_stats_list[] = {
	{"rbytes",	FS_OFF(fs_rbytes)},
	{"ipackets",	FS_OFF(fs_ipackets)},
	{"ierrors",	FS_OFF(fs_ierrors)},
	{"obytes",	FS_OFF(fs_obytes)},
	{"opackets",	FS_OFF(fs_opackets)},
	{"oerrors",	FS_OFF(fs_oerrors)}
};
#define	FS_SIZE		(sizeof (flow_stats_list) / sizeof (flow_stats_info_t))

/*
 * Checks whether a flow mask is legal.
 */
static flow_tab_info_t	*mac_flow_tab_info_get(flow_mask_t);

/*
 * Initialize the FS_SIZE named kstats pointed to by knp, one uint64
 * counter per entry of flow_stats_list, in list order.
 */
static void
flow_stat_init(kstat_named_t *knp)
{
	int	i;

	for (i = 0; i < FS_SIZE; i++, knp++) {
		kstat_named_init(knp, flow_stats_list[i].fs_name,
		    KSTAT_DATA_UINT64);
	}
}

/*
 * kstat ks_update callback for a flow: snapshot the fe_flowstats
 * counters into the named kstats.  A non-global zone may only read
 * stats of flows that belong to it (fe_zoneid); other flows are
 * reported as all zeros rather than failing the read.
 */
static int
flow_stat_update(kstat_t *ksp, int rw)
{
	flow_entry_t		*fep = ksp->ks_private;
	flow_stats_t		*fsp = &fep->fe_flowstats;
	kstat_named_t		*knp = ksp->ks_data;
	uint64_t		*statp;
	zoneid_t		zid;
	int			i;

	/* Flow kstats are read-only. */
	if (rw != KSTAT_READ)
		return (EACCES);

	zid = getzoneid();
	if (zid != GLOBAL_ZONEID && zid != fep->fe_zoneid) {
		for (i = 0; i < FS_SIZE; i++, knp++)
			knp->value.ui64 = 0;

		return (0);
	}

	for (i = 0; i < FS_SIZE; i++, knp++) {
		/* Locate the counter via its recorded byte offset. */
		statp = (uint64_t *)
		    ((uchar_t *)fsp + flow_stats_list[i].fs_offset);

		knp->value.ui64 = *statp;
	}
	return (0);
}

/*
 * Create and install the per-flow kstats, named after the flow itself.
 * kstat_create() failure is not fatal: the flow simply exports no stats
 * (fe_ksp stays NULL).
 */
static void
flow_stat_create(flow_entry_t *fep)
{
	kstat_t		*ksp;
	kstat_named_t	*knp;
	uint_t		nstats = FS_SIZE;

	ksp = kstat_create("unix", 0, (char *)fep->fe_flow_name, "flow",
	    KSTAT_TYPE_NAMED, nstats, 0);
	if (ksp == NULL)
		return;

	ksp->ks_update = flow_stat_update;
	ksp->ks_private = fep;
	fep->fe_ksp = ksp;

	knp = (kstat_named_t *)ksp->ks_data;
	flow_stat_init(knp);
	kstat_install(ksp);
}

/*
 * Tear down the flow's kstats, if any were created.  Safe to call
 * multiple times (fe_ksp is NULLed).
 */
void
flow_stat_destroy(flow_entry_t *fep)
{
	if (fep->fe_ksp != NULL) {
		kstat_delete(fep->fe_ksp);
		fep->fe_ksp = NULL;
	}
}

/*
 * Initialize the flow table
 */
void
mac_flow_init()
{
	flow_cache = kmem_cache_create("flow_entry_cache",
	    sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	flow_tab_cache = kmem_cache_create("flow_tab_cache",
	    sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	flow_hash = mod_hash_create_extended("flow_hash",
	    100, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
	rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
}

/*
 * Cleanup and release the flow table
 */
void
mac_flow_fini()
{
	kmem_cache_destroy(flow_cache);
	kmem_cache_destroy(flow_tab_cache);
	mod_hash_destroy_hash(flow_hash);
	rw_destroy(&flow_tab_lock);
}

/*
 * mac_create_flow(): create a flow_entry_t.
 *
 * If *flentp is NULL a fresh entry is allocated (and returned through
 * flentp); otherwise the caller-supplied entry is (re)initialized.
 * With fd == NULL only the name is set up; the flow is configured later.
 */
int
mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
    void *client_cookie, uint_t type, flow_entry_t **flentp)
{
	flow_entry_t	*flent = *flentp;
	int		err = 0;

	if (mrp != NULL) {
		err = mac_validate_props(mrp);
		if (err != 0)
			return (err);
	}

	if (flent == NULL) {
		flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
		bzero(flent, sizeof (*flent));
		mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);

		/* Initialize the receiver function to a safe routine */
		flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
		flent->fe_index = -1;
	}
	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);

	/* This is an initial flow, will be configured later */
	if (fd == NULL) {
		*flentp = flent;
		return (0);
	}

	flent->fe_client_cookie = client_cookie;
	flent->fe_type = type;

	/*
	 * As flow creation is only allowed in global zone, this will
	 * always set fe_zoneid to GLOBAL_ZONEID, and dls_add_flow() will
	 * later set the right value.
	 */
	flent->fe_zoneid = getzoneid();

	/* Save flow desc */
	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));

	if (mrp != NULL) {
		/*
		 * We have already set fe_resource_props for a Link.
		 */
		if (type & FLOW_USER) {
			bcopy(mrp, &flent->fe_resource_props,
			    sizeof (mac_resource_props_t));
		}
		/*
		 * The effective resource list should reflect the priority
		 * that we set implicitly.
		 */
		if (!(mrp->mrp_mask & MRP_PRIORITY))
			mrp->mrp_mask |= MRP_PRIORITY;
		if (type & FLOW_USER)
			mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
		else
			mrp->mrp_priority = MPL_LINK_DEFAULT;
		bcopy(mrp, &flent->fe_effective_props,
		    sizeof (mac_resource_props_t));
	}
	flow_stat_create(flent);

	*flentp = flent;
	return (0);
}

/*
 * Validate flow entry and add it to a flow table.
 * Returns EOPNOTSUPP if the descriptor mask doesn't fit the table,
 * EALREADY on a duplicate flow, or the accept/insert callback's error.
 */
int
mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_entry_t	**headp, **p;
	flow_ops_t	*ops = &ft->ft_ops;
	flow_mask_t	mask;
	uint32_t	index;
	int		err;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * Check for invalid bits in mask.
	 */
	mask = flent->fe_flow_desc.fd_mask;
	if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
		return (EOPNOTSUPP);

	/*
	 * Validate flent.
	 */
	if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
		DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
		    flow_entry_t *, flent, int, err);
		return (err);
	}

	/*
	 * Flent is valid. now calculate hash and insert it
	 * into hash table.
	 */
	index = ops->fo_hash_fe(ft, flent);

	/*
	 * We do not need a lock up until now because we were
	 * not accessing the flow table.
	 */
	rw_enter(&ft->ft_lock, RW_WRITER);
	headp = &ft->ft_table[index];

	/*
	 * Check for duplicate flow.
	 */
	for (p = headp; *p != NULL; p = &(*p)->fe_next) {
		if ((*p)->fe_flow_desc.fd_mask !=
		    flent->fe_flow_desc.fd_mask)
			continue;

		if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
			rw_exit(&ft->ft_lock);
			DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
			    flow_entry_t *, flent, int, err);
			return (EALREADY);
		}
	}

	/*
	 * Insert flow to hash list.
	 */
	err = ops->fo_insert_fe(ft, headp, flent);
	if (err != 0) {
		rw_exit(&ft->ft_lock);
		DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
		    flow_entry_t *, flent, int, err);
		return (err);
	}

	/*
	 * Save the hash index so it can be used by mac_flow_remove().
	 */
	flent->fe_index = (int)index;

	/*
	 * Save the flow tab back reference.
	 */
	flent->fe_flow_tab = ft;
	FLOW_MARK(flent, FE_FLOW_TAB);
	ft->ft_flow_count++;
	rw_exit(&ft->ft_lock);
	return (0);
}

/*
 * Remove a flow from a mac client's subflow table
 */
void
mac_flow_rem_subflow(flow_entry_t *flent)
{
	flow_tab_t		*ft = flent->fe_flow_tab;
	mac_client_impl_t	*mcip = ft->ft_mcip;
	mac_handle_t		mh = (mac_handle_t)ft->ft_mip;

	ASSERT(MAC_PERIM_HELD(mh));

	mac_flow_remove(ft, flent, B_FALSE);
	if (flent->fe_mcip == NULL) {
		/*
		 * The interface is not yet plumbed and mac_client_flow_add
		 * was not done.
		 */
		if (FLOW_TAB_EMPTY(ft)) {
			mac_flow_tab_destroy(ft);
			mcip->mci_subflow_tab = NULL;
		}
	} else {
		/* Let in-flight driver upcalls drain before tearing down */
		mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
		mac_link_flow_clean((mac_client_handle_t)mcip, flent);
	}
	/* Undo the fastpath disable done when the subflow was added */
	mac_fastpath_enable(mh);
}

/*
 * Add a flow to a mac client's subflow table and instantiate the flow
 * in the mac by creating the associated SRSs etc.
 */
int
mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
    boolean_t instantiate_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_handle_t		mh = (mac_handle_t)mcip->mci_mip;
	flow_tab_info_t		*ftinfo;
	flow_mask_t		mask;
	flow_tab_t		*ft;
	int			err;
	boolean_t		ft_created = B_FALSE;

	ASSERT(MAC_PERIM_HELD(mh));

	/* Subflows and the fastpath are mutually exclusive */
	if ((err = mac_fastpath_disable(mh)) != 0)
		return (err);

	/*
	 * If the subflow table exists already just add the new subflow
	 * to the existing table, else we create a new subflow table below.
	 */
	ft = mcip->mci_subflow_tab;
	if (ft == NULL) {
		mask = flent->fe_flow_desc.fd_mask;
		/*
		 * Try to create a new table and then add the subflow to the
		 * newly created subflow table
		 */
		if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) {
			mac_fastpath_enable(mh);
			return (EOPNOTSUPP);
		}

		mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
		    mcip->mci_mip, &ft);
		ft_created = B_TRUE;
	}

	err = mac_flow_add(ft, flent);
	if (err != 0) {
		/* Unwind: drop the table if we just created it */
		if (ft_created)
			mac_flow_tab_destroy(ft);
		mac_fastpath_enable(mh);
		return (err);
	}

	if (instantiate_flow) {
		/* Now activate the flow by creating its SRSs */
		ASSERT(MCIP_DATAPATH_SETUP(mcip));
		err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
		if (err != 0) {
			mac_flow_remove(ft, flent, B_FALSE);
			if (ft_created)
				mac_flow_tab_destroy(ft);
			mac_fastpath_enable(mh);
			return (err);
		}
	} else {
		FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	}
	/* Publish the new table only once the flow is safely in it */
	if (ft_created) {
		ASSERT(mcip->mci_subflow_tab == NULL);
		ft->ft_mcip = mcip;
		mcip->mci_subflow_tab = ft;
		if (instantiate_flow)
			mac_client_update_classifier(mcip, B_TRUE);
	}
	return (0);
}

/*
 * Remove flow entry from flow table.
 */
void
mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
{
	flow_entry_t	**fp;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
	if (!(flent->fe_flags & FE_FLOW_TAB))
		return;

	rw_enter(&ft->ft_lock, RW_WRITER);
	/*
	 * If this is a permanent removal from the flow table, mark it
	 * CONDEMNED to prevent future references. If this is a temporary
	 * removal from the table, say to update the flow descriptor then
	 * we don't mark it CONDEMNED
	 */
	if (!temp)
		FLOW_MARK(flent, FE_CONDEMNED);
	/*
	 * Locate the specified flent.
	 */
	fp = &ft->ft_table[flent->fe_index];
	while (*fp != flent)
		fp = &(*fp)->fe_next;

	/*
	 * The flent must exist. Otherwise it's a bug.
	 */
	ASSERT(fp != NULL);
	*fp = flent->fe_next;
	flent->fe_next = NULL;

	/*
	 * Reset fe_index to -1 so any attempt to call mac_flow_remove()
	 * on a flent that is supposed to be in the table (FE_FLOW_TAB)
	 * will panic.
	 */
	flent->fe_index = -1;
	FLOW_UNMARK(flent, FE_FLOW_TAB);
	ft->ft_flow_count--;
	rw_exit(&ft->ft_lock);
}

/*
 * This is the flow lookup routine used by the mac sw classifier engine.
 * On success *flentp holds a refheld flow entry; the caller must release it.
 */
int
mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
{
	flow_state_t	s;
	flow_entry_t	*flent;
	flow_ops_t	*ops = &ft->ft_ops;
	boolean_t	retried = B_FALSE;
	int		i, err;

	s.fs_flags = flags;
retry:
	s.fs_mp = mp;

	/*
	 * Walk the list of predeclared accept functions.
	 * Each of these would accumulate enough state to allow the next
	 * accept routine to make progress.
	 */
	for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
		if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
			mblk_t	*last;

			/*
			 * ENOBUFS indicates that the mp could be too short
			 * and may need a pullup.
			 */
			if (err != ENOBUFS || retried)
				return (err);

			/*
			 * The pullup is done on the last processed mblk, not
			 * the starting one. pullup is not done if the mblk
			 * has references or if b_cont is NULL.
			 */
			last = s.fs_mp;
			if (DB_REF(last) > 1 || last->b_cont == NULL ||
			    pullupmsg(last, -1) == 0)
				return (EINVAL);

			/* Restart the accept chain once, with the flat mblk */
			retried = B_TRUE;
			DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
			    flow_state_t *, &s);
			goto retry;
		}
	}

	/*
	 * The packet is considered sane. We may now attempt to
	 * find the corresponding flent.
	 */
	rw_enter(&ft->ft_lock, RW_READER);
	flent = ft->ft_table[ops->fo_hash(ft, &s)];
	for (; flent != NULL; flent = flent->fe_next) {
		if (flent->fe_match(ft, flent, &s)) {
			/* Skip condemned entries; keep scanning the chain */
			FLOW_TRY_REFHOLD(flent, err);
			if (err != 0)
				continue;
			*flentp = flent;
			rw_exit(&ft->ft_lock);
			return (0);
		}
	}
	rw_exit(&ft->ft_lock);
	return (ENOENT);
}

/*
 * Walk flow table.
 * The caller is assumed to have proper perimeter protection.
 */
int
mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
    void *arg)
{
	int		err, i, cnt = 0;
	flow_entry_t	*flent;

	if (ft == NULL)
		return (0);

	for (i = 0; i < ft->ft_size; i++) {
		for (flent = ft->ft_table[i]; flent != NULL;
		    flent = flent->fe_next) {
			cnt++;
			/* A non-zero return from fn aborts the walk */
			err = (*fn)(flent, arg);
			if (err != 0)
				return (err);
		}
	}
	VERIFY(cnt == ft->ft_flow_count);
	return (0);
}

/*
 * Same as the above except the flow table lock is taken for protection here.
 */
int
mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
    void *arg)
{
	int	err;

	if (ft == NULL)
		return (0);

	rw_enter(&ft->ft_lock, RW_WRITER);
	err = mac_flow_walk_nolock(ft, fn, arg);
	rw_exit(&ft->ft_lock);
	return (err);
}

static boolean_t	mac_flow_clean(flow_entry_t *);

/*
 * Destroy a flow entry. Called when the last reference on a flow is released.
 */
void
mac_flow_destroy(flow_entry_t *flent)
{
	ASSERT(flent->fe_refcnt == 0);

	if ((flent->fe_type & FLOW_USER) != 0) {
		/* User flows must already be clean; just sanity-check */
		ASSERT(mac_flow_clean(flent));
	} else {
		mac_flow_cleanup(flent);
	}

	mutex_destroy(&flent->fe_lock);
	cv_destroy(&flent->fe_cv);
	flow_stat_destroy(flent);
	kmem_cache_free(flow_cache, flent);
}

/*
 * XXX eric
 * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
 * mac_link_flow_modify() should really be moved/reworked into the
 * two functions below. This would consolidate all the mac property
 * checking in one place. I'm leaving this alone for now since it's
 * out of scope of the new flows work.
 */
/* ARGSUSED */
uint32_t
mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
{
	uint32_t		changed_mask = 0;
	mac_resource_props_t	*fmrp = &flent->fe_effective_props;
	int			i;

	/*
	 * Fold the requested properties in mrp into the flow's effective
	 * properties, returning a mask of which properties actually changed.
	 */
	if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
	    (fmrp->mrp_maxbw != mrp->mrp_maxbw)) {
		changed_mask |= MRP_MAXBW;
		fmrp->mrp_maxbw = mrp->mrp_maxbw;
		if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
			fmrp->mrp_mask &= ~MRP_MAXBW;
		} else {
			fmrp->mrp_mask |= MRP_MAXBW;
		}
	}

	if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
		if (fmrp->mrp_priority != mrp->mrp_priority)
			changed_mask |= MRP_PRIORITY;
		if (mrp->mrp_priority == MPL_RESET) {
			fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
			fmrp->mrp_mask &= ~MRP_PRIORITY;
		} else {
			fmrp->mrp_priority = mrp->mrp_priority;
			fmrp->mrp_mask |= MRP_PRIORITY;
		}
	}

	/* modify fanout */
	if ((mrp->mrp_mask & MRP_CPUS) != 0) {
		if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
		    (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
			for (i = 0; i < mrp->mrp_ncpus; i++) {
				if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
					break;
			}
			if (i == mrp->mrp_ncpus) {
				/*
				 * The new set of cpus passed is exactly
				 * the same as the existing set.
				 */
				return (changed_mask);
			}
		}
		changed_mask |= MRP_CPUS;
		MAC_COPY_CPUS(mrp, fmrp);
	}
	return (changed_mask);
}

/*
 * Apply new resource properties to an active flow: update the cached
 * effective properties, then push any changes down into the flow's SRSs.
 */
void
mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
{
	uint32_t		changed_mask;
	mac_client_impl_t	*mcip = flent->fe_mcip;
	mac_resource_props_t	*mcip_mrp = MCIP_RESOURCE_PROPS(mcip);

	ASSERT(flent != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	rw_enter(&ft->ft_lock, RW_WRITER);

	/* Update the cached values inside the subflow entry */
	changed_mask = mac_flow_modify_props(flent, mrp);
	rw_exit(&ft->ft_lock);
	/*
	 * Push the changed parameters to the scheduling code in the
	 * SRS's, to take effect right away.
	 */
	if (changed_mask & MRP_MAXBW) {
		mac_srs_update_bwlimit(flent, mrp);
		/*
		 * If bandwidth is changed, we may have to change
		 * the number of soft ring to be used for fanout.
		 * Call mac_flow_update_fanout() if MAC_BIND_CPU
		 * is not set and there is no user supplied cpu
		 * info. This applies only to link at this time.
		 */
		if (!(flent->fe_type & FLOW_USER) &&
		    !(changed_mask & MRP_CPUS) &&
		    !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
			mac_fanout_setup(mcip, flent, mcip_mrp,
			    mac_rx_deliver, mcip, NULL);
		}
	}
	if (mrp->mrp_mask & MRP_PRIORITY)
		mac_flow_update_priority(mcip, flent);

	if (changed_mask & MRP_CPUS)
		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL);
}

/*
 * This function waits for a certain condition to be met and is generally
 * used before a destructive or quiescing operation.
 */
void
mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
{
	mutex_enter(&flent->fe_lock);
	flent->fe_flags |= FE_WAITER;

	switch (event) {
	case FLOW_DRIVER_UPCALL:
		/*
		 * We want to make sure the driver upcalls have finished before
		 * we signal the Rx SRS worker to quit.
733 */ 734 while (flent->fe_refcnt != 1) 735 cv_wait(&flent->fe_cv, &flent->fe_lock); 736 break; 737 738 case FLOW_USER_REF: 739 /* 740 * Wait for the fe_user_refcnt to drop to 0. The flow has 741 * been removed from the global flow hash. 742 */ 743 ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH)); 744 while (flent->fe_user_refcnt != 0) 745 cv_wait(&flent->fe_cv, &flent->fe_lock); 746 break; 747 748 default: 749 ASSERT(0); 750 } 751 752 flent->fe_flags &= ~FE_WAITER; 753 mutex_exit(&flent->fe_lock); 754 } 755 756 static boolean_t 757 mac_flow_clean(flow_entry_t *flent) 758 { 759 ASSERT(flent->fe_next == NULL); 760 ASSERT(flent->fe_tx_srs == NULL); 761 ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL); 762 ASSERT(flent->fe_mbg == NULL); 763 764 return (B_TRUE); 765 } 766 767 void 768 mac_flow_cleanup(flow_entry_t *flent) 769 { 770 if ((flent->fe_type & FLOW_USER) == 0) { 771 ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) || 772 (flent->fe_mbg != NULL && flent->fe_mcip == NULL)); 773 ASSERT(flent->fe_refcnt == 0); 774 } else { 775 ASSERT(flent->fe_refcnt == 1); 776 } 777 778 if (flent->fe_mbg != NULL) { 779 ASSERT(flent->fe_tx_srs == NULL); 780 /* This is a multicast or broadcast flow entry */ 781 mac_bcast_grp_free(flent->fe_mbg); 782 flent->fe_mbg = NULL; 783 } 784 785 if (flent->fe_tx_srs != NULL) { 786 ASSERT(flent->fe_mbg == NULL); 787 mac_srs_free(flent->fe_tx_srs); 788 flent->fe_tx_srs = NULL; 789 } 790 791 /* 792 * In the normal case fe_rx_srs_cnt is 1. However in the error case 793 * when mac_unicast_add fails we may not have set up any SRS 794 * in which case fe_rx_srs_cnt will be zero. 
795 */ 796 if (flent->fe_rx_srs_cnt != 0) { 797 ASSERT(flent->fe_rx_srs_cnt == 1); 798 mac_srs_free(flent->fe_rx_srs[0]); 799 flent->fe_rx_srs[0] = NULL; 800 flent->fe_rx_srs_cnt = 0; 801 } 802 ASSERT(flent->fe_rx_srs[0] == NULL); 803 } 804 805 void 806 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd) 807 { 808 /* 809 * Grab the fe_lock to see a self-consistent fe_flow_desc. 810 * Updates to the fe_flow_desc happen under the fe_lock 811 * after removing the flent from the flow table 812 */ 813 mutex_enter(&flent->fe_lock); 814 bcopy(&flent->fe_flow_desc, fd, sizeof (*fd)); 815 mutex_exit(&flent->fe_lock); 816 } 817 818 /* 819 * Update a field of a flow entry. The mac perimeter ensures that 820 * this is the only thread doing a modify operation on this mac end point. 821 * So the flow table can't change or disappear. The ft_lock protects access 822 * to the flow entry, and holding the lock ensures that there isn't any thread 823 * accessing the flow entry or attempting a flow table lookup. However 824 * data threads that are using the flow entry based on the old descriptor 825 * will continue to use the flow entry. If strong coherence is required 826 * then the flow will have to be quiesced before the descriptor can be 827 * changed. 828 */ 829 void 830 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd) 831 { 832 flow_tab_t *ft = flent->fe_flow_tab; 833 flow_desc_t old_desc; 834 int err; 835 836 if (ft == NULL) { 837 /* 838 * The flow hasn't yet been inserted into the table, 839 * so only the caller knows about this flow, however for 840 * uniformity we grab the fe_lock here. 841 */ 842 mutex_enter(&flent->fe_lock); 843 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); 844 mutex_exit(&flent->fe_lock); 845 } 846 847 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 848 849 /* 850 * Need to remove the flow entry from the table and reinsert it, 851 * into a potentially diference hash line. The hash depends on 852 * the new descriptor fields. 
 * However access to fe_desc itself
	 * is always under the fe_lock. This helps log and stat functions
	 * see a self-consistent fe_flow_desc.
	 */
	mac_flow_remove(ft, flent, B_TRUE);
	old_desc = flent->fe_flow_desc;

	mutex_enter(&flent->fe_lock);
	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
	mutex_exit(&flent->fe_lock);

	if (mac_flow_add(ft, flent) != 0) {
		/*
		 * The add failed say due to an invalid flow descriptor.
		 * Undo the update
		 */
		flent->fe_flow_desc = old_desc;
		err = mac_flow_add(ft, flent);
		ASSERT(err == 0);
	}
}

/*
 * Rename a flow entry.  The name is also copied under fe_lock so
 * readers of fe_flow_name see a consistent value.
 */
void
mac_flow_set_name(flow_entry_t *flent, const char *name)
{
	flow_tab_t	*ft = flent->fe_flow_tab;

	if (ft == NULL) {
		/*
		 * The flow hasn't yet been inserted into the table,
		 * so only the caller knows about this flow
		 */
		(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
	} else {
		ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
	}

	mutex_enter(&flent->fe_lock);
	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
	mutex_exit(&flent->fe_lock);
}

/*
 * Return the client-private cookie that was associated with
 * the flow when it was created.
 */
void *
mac_flow_get_client_cookie(flow_entry_t *flent)
{
	return (flent->fe_client_cookie);
}

/*
 * Forward declarations.
 */
static uint32_t	flow_l2_hash(flow_tab_t *, flow_state_t *);
static int	flow_l2_accept(flow_tab_t *, flow_state_t *);
static uint32_t	flow_ether_hash(flow_tab_t *, flow_state_t *);
static int	flow_ether_accept(flow_tab_t *, flow_state_t *);

/*
 * Create flow table.
 */
void
mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
    mac_impl_t *mip, flow_tab_t **ftp)
{
	flow_tab_t	*ft;
	flow_ops_t	*new_ops;

	ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
	bzero(ft, sizeof (*ft));

	ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);

	/*
	 * We make a copy of the ops vector instead of just pointing to it
	 * because we might want to customize the ops vector on a per table
	 * basis (e.g. for optimization).
	 */
	new_ops = &ft->ft_ops;
	bcopy(ops, new_ops, sizeof (*ops));
	ft->ft_mask = mask;
	ft->ft_size = size;
	ft->ft_mip = mip;

	/*
	 * Optimization for DL_ETHER media.
	 */
	if (mip->mi_info.mi_nativemedia == DL_ETHER) {
		if (new_ops->fo_hash == flow_l2_hash)
			new_ops->fo_hash = flow_ether_hash;

		if (new_ops->fo_accept[0] == flow_l2_accept)
			new_ops->fo_accept[0] = flow_ether_accept;

	}
	*ftp = ft;
}

/*
 * Convenience wrapper: create the standard L2 (dst addr + VID) flow
 * table used for link-level classification.
 */
void
mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
{
	mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
	    1024, mip, ftp);
}

/*
 * Destroy flow table.
 */
void
mac_flow_tab_destroy(flow_tab_t *ft)
{
	if (ft == NULL)
		return;

	/* All flows must have been removed before the table goes away */
	ASSERT(ft->ft_flow_count == 0);
	kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
	bzero(ft, sizeof (*ft));
	kmem_cache_free(flow_tab_cache, ft);
}

/*
 * Add a new flow entry to the global flow hash table
 */
int
mac_flow_hash_add(flow_entry_t *flent)
{
	int	err;

	rw_enter(&flow_tab_lock, RW_WRITER);
	err = mod_hash_insert(flow_hash,
	    (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
	if (err != 0) {
		/* mod_hash_insert fails only on a duplicate key */
		rw_exit(&flow_tab_lock);
		return (EEXIST);
	}
	/* Mark as inserted into the global flow hash table */
	FLOW_MARK(flent, FE_G_FLOW_HASH);
	rw_exit(&flow_tab_lock);
	return (err);
}

/*
 * Remove a flow entry from the global flow hash table
 */
void
mac_flow_hash_remove(flow_entry_t *flent)
{
	mod_hash_val_t	val;

	rw_enter(&flow_tab_lock, RW_WRITER);
	VERIFY(mod_hash_remove(flow_hash,
	    (mod_hash_key_t)flent->fe_flow_name, &val) == 0);

	/* Clear the mark that says inserted into the global flow hash table */
	FLOW_UNMARK(flent, FE_G_FLOW_HASH);
	rw_exit(&flow_tab_lock);
}

/*
 * Retrieve a flow entry from the global flow hash table.
 * On success the entry is returned with a user reference held;
 * the caller must FLOW_USER_REFRELE() it.
 */
int
mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
{
	int		err;
	flow_entry_t	*flent;

	rw_enter(&flow_tab_lock, RW_READER);
	err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
	    (mod_hash_val_t *)&flent);
	if (err != 0) {
		rw_exit(&flow_tab_lock);
		return (ENOENT);
	}
	ASSERT(flent != NULL);
	FLOW_USER_REFHOLD(flent);
	rw_exit(&flow_tab_lock);

	*flentp = flent;
	return (0);
}

/*
 * Initialize or release mac client flows by walking the subflow table.
 * These are typically invoked during plumb/unplumb of links.
 */

static int
mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
{
	mac_client_impl_t	*mcip = arg;

	/* Initialization failure is logged but does not abort the walk */
	if (mac_link_flow_init(arg, flent) != 0) {
		cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
		    flent->fe_flow_name, mcip->mci_name);
	} else {
		FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
	}
	return (0);
}

void
mac_link_init_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_init_flows_cb, mcip);
	/*
	 * If mac client had subflow(s) configured before plumb, change
	 * function to mac_rx_srs_subflow_process and in case of hardware
	 * classification, disable polling.
	 */
	mac_client_update_classifier(mcip, B_TRUE);

}

/*
 * Return B_TRUE if the mac client has at least one configured subflow.
 */
boolean_t
mac_link_has_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
		return (B_TRUE);

	return (B_FALSE);
}

static int
mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
{
	/* Stop new datapath use, drain driver upcalls, then tear down */
	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
	mac_link_flow_clean(arg, flent);
	return (0);
}

void
mac_link_release_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	/*
	 * Change the mci_flent callback back to mac_rx_srs_process()
	 * because flows are about to be deactivated.
	 */
	mac_client_update_classifier(mcip, B_FALSE);
	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_release_flows_cb, mcip);
}

/*
 * Rename a flow and recreate its kstats so they appear under the
 * new name.
 */
void
mac_rename_flow(flow_entry_t *fep, const char *new_name)
{
	mac_flow_set_name(fep, new_name);
	if (fep->fe_ksp != NULL) {
		flow_stat_destroy(fep);
		flow_stat_create(fep);
	}
}

/*
 * mac_link_flow_init()
 * Internal flow interface used for allocating SRSs and related
 * data structures. Not meant to be used by mac clients.
 */
int
mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_impl_t		*mip = mcip->mci_mip;
	int			err;

	ASSERT(mch != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
		return (err);

	/* Non-NULL fe_mcip marks the flow as fully initialized */
	sub_flow->fe_mcip = mcip;

	return (0);
}

/*
 * mac_link_flow_add()
 * Used by flowadm(1m) or kernel mac clients for creating flows.
 */
int
mac_link_flow_add(datalink_id_t linkid, char *flow_name,
    flow_desc_t *flow_desc, mac_resource_props_t *mrp)
{
	flow_entry_t		*flent = NULL;
	int			err;
	dls_dl_handle_t		dlh;
	dls_link_t		*dlp;
	boolean_t		link_held = B_FALSE;
	boolean_t		hash_added = B_FALSE;
	mac_perim_handle_t	mph;

	/* Fail fast on a duplicate flow name */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err == 0) {
		FLOW_USER_REFRELE(flent);
		return (EEXIST);
	}

	/*
	 * First create a flow entry given the description provided
	 * by the caller.
	 */
	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
	    FLOW_USER | FLOW_OTHER, &flent);

	if (err != 0)
		return (err);

	/*
	 * We've got a local variable referencing this flow now, so we need
	 * to hold it. We'll release this flow before returning.
	 * All failures until we return will undo any action that may
	 * internally held the flow, so the last REFRELE will assure a clean
	 * freeing of resources.
	 */
	FLOW_REFHOLD(flent);

	flent->fe_link_id = linkid;
	FLOW_MARK(flent, FE_INCIPIENT);

	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0) {
		FLOW_FINAL_REFRELE(flent);
		return (err);
	}

	/*
	 * dls will eventually be merged with mac so it's ok
	 * to call dls' internal functions.
	 */
	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
	if (err != 0)
		goto bail;

	link_held = B_TRUE;

	/*
	 * Add the flow to the global flow table, this table will be per
	 * exclusive zone so each zone can have its own flow namespace.
	 * RFE 6625651 will fix this.
	 *
	 */
	if ((err = mac_flow_hash_add(flent)) != 0)
		goto bail;

	hash_added = B_TRUE;

	/*
	 * do not allow flows to be configured on an anchor VNIC
	 */
	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
		err = ENOTSUP;
		goto bail;
	}

	/*
	 * Save the zoneid of the underlying link in the flow entry,
	 * this is needed to prevent non-global zone from getting
	 * statistics information of global zone.
	 */
	flent->fe_zoneid = dlp->dl_zid;

	/*
	 * Add the subflow to the subflow table. Also instantiate the flow
	 * in the mac if there is an active user (we check if the MAC client's
	 * datapath has been setup).
1228 */ 1229 err = mac_flow_add_subflow(dlp->dl_mch, flent, 1230 MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch)); 1231 if (err != 0) 1232 goto bail; 1233 1234 FLOW_UNMARK(flent, FE_INCIPIENT); 1235 dls_devnet_rele_link(dlh, dlp); 1236 mac_perim_exit(mph); 1237 return (0); 1238 1239 bail: 1240 if (hash_added) 1241 mac_flow_hash_remove(flent); 1242 1243 if (link_held) 1244 dls_devnet_rele_link(dlh, dlp); 1245 1246 /* 1247 * Wait for any transient global flow hash refs to clear 1248 * and then release the creation reference on the flow 1249 */ 1250 mac_flow_wait(flent, FLOW_USER_REF); 1251 FLOW_FINAL_REFRELE(flent); 1252 mac_perim_exit(mph); 1253 return (err); 1254 } 1255 1256 /* 1257 * mac_link_flow_clean() 1258 * Internal flow interface used for freeing SRSs and related 1259 * data structures. Not meant to be used by mac clients. 1260 */ 1261 void 1262 mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow) 1263 { 1264 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1265 mac_impl_t *mip = mcip->mci_mip; 1266 boolean_t last_subflow; 1267 1268 ASSERT(mch != NULL); 1269 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); 1270 1271 /* 1272 * This sub flow entry may fail to be fully initialized by 1273 * mac_link_flow_init(). If so, simply return. 1274 */ 1275 if (sub_flow->fe_mcip == NULL) 1276 return; 1277 1278 last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab); 1279 /* 1280 * Tear down the data path 1281 */ 1282 mac_datapath_teardown(mcip, sub_flow, SRST_FLOW); 1283 sub_flow->fe_mcip = NULL; 1284 1285 /* 1286 * Delete the SRSs associated with this subflow. If this is being 1287 * driven by flowadm(1M) then the subflow will be deleted by 1288 * dls_rem_flow. However if this is a result of the interface being 1289 * unplumbed then the subflow itself won't be deleted. 1290 */ 1291 mac_flow_cleanup(sub_flow); 1292 1293 /* 1294 * If all the subflows are gone, renable some of the stuff 1295 * we disabled when adding a subflow, polling etc. 
1296 */ 1297 if (last_subflow) { 1298 /* 1299 * The subflow table itself is not protected by any locks or 1300 * refcnts. Hence quiesce the client upfront before clearing 1301 * mci_subflow_tab. 1302 */ 1303 mac_client_quiesce(mcip); 1304 mac_client_update_classifier(mcip, B_FALSE); 1305 mac_flow_tab_destroy(mcip->mci_subflow_tab); 1306 mcip->mci_subflow_tab = NULL; 1307 mac_client_restart(mcip); 1308 } 1309 } 1310 1311 /* 1312 * mac_link_flow_remove() 1313 * Used by flowadm(1m) or kernel mac clients for removing flows. 1314 */ 1315 int 1316 mac_link_flow_remove(char *flow_name) 1317 { 1318 flow_entry_t *flent; 1319 mac_perim_handle_t mph; 1320 int err; 1321 datalink_id_t linkid; 1322 1323 err = mac_flow_lookup_byname(flow_name, &flent); 1324 if (err != 0) 1325 return (err); 1326 1327 linkid = flent->fe_link_id; 1328 FLOW_USER_REFRELE(flent); 1329 1330 /* 1331 * The perim must be acquired before acquiring any other references 1332 * to maintain the lock and perimeter hierarchy. Please note the 1333 * FLOW_REFRELE above. 1334 */ 1335 err = mac_perim_enter_by_linkid(linkid, &mph); 1336 if (err != 0) 1337 return (err); 1338 1339 /* 1340 * Note the second lookup of the flow, because a concurrent thread 1341 * may have removed it already while we were waiting to enter the 1342 * link's perimeter. 1343 */ 1344 err = mac_flow_lookup_byname(flow_name, &flent); 1345 if (err != 0) { 1346 mac_perim_exit(mph); 1347 return (err); 1348 } 1349 FLOW_USER_REFRELE(flent); 1350 1351 /* 1352 * Remove the flow from the subflow table and deactivate the flow 1353 * by quiescing and removings its SRSs 1354 */ 1355 mac_flow_rem_subflow(flent); 1356 1357 /* 1358 * Finally, remove the flow from the global table. 
1359 */ 1360 mac_flow_hash_remove(flent); 1361 1362 /* 1363 * Wait for any transient global flow hash refs to clear 1364 * and then release the creation reference on the flow 1365 */ 1366 mac_flow_wait(flent, FLOW_USER_REF); 1367 FLOW_FINAL_REFRELE(flent); 1368 1369 mac_perim_exit(mph); 1370 1371 return (0); 1372 } 1373 1374 /* 1375 * mac_link_flow_modify() 1376 * Modifies the properties of a flow identified by its name. 1377 */ 1378 int 1379 mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp) 1380 { 1381 flow_entry_t *flent; 1382 mac_client_impl_t *mcip; 1383 int err = 0; 1384 mac_perim_handle_t mph; 1385 datalink_id_t linkid; 1386 flow_tab_t *flow_tab; 1387 1388 err = mac_validate_props(mrp); 1389 if (err != 0) 1390 return (err); 1391 1392 err = mac_flow_lookup_byname(flow_name, &flent); 1393 if (err != 0) 1394 return (err); 1395 1396 linkid = flent->fe_link_id; 1397 FLOW_USER_REFRELE(flent); 1398 1399 /* 1400 * The perim must be acquired before acquiring any other references 1401 * to maintain the lock and perimeter hierarchy. Please note the 1402 * FLOW_REFRELE above. 1403 */ 1404 err = mac_perim_enter_by_linkid(linkid, &mph); 1405 if (err != 0) 1406 return (err); 1407 1408 /* 1409 * Note the second lookup of the flow, because a concurrent thread 1410 * may have removed it already while we were waiting to enter the 1411 * link's perimeter. 1412 */ 1413 err = mac_flow_lookup_byname(flow_name, &flent); 1414 if (err != 0) { 1415 mac_perim_exit(mph); 1416 return (err); 1417 } 1418 FLOW_USER_REFRELE(flent); 1419 1420 /* 1421 * If this flow is attached to a MAC client, then pass the request 1422 * along to the client. 1423 * Otherwise, just update the cached values. 
1424 */ 1425 mcip = flent->fe_mcip; 1426 mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE); 1427 if (mcip != NULL) { 1428 if ((flow_tab = mcip->mci_subflow_tab) == NULL) { 1429 err = ENOENT; 1430 } else { 1431 mac_flow_modify(flow_tab, flent, mrp); 1432 } 1433 } else { 1434 (void) mac_flow_modify_props(flent, mrp); 1435 } 1436 1437 done: 1438 mac_perim_exit(mph); 1439 return (err); 1440 } 1441 1442 1443 /* 1444 * State structure and misc functions used by mac_link_flow_walk(). 1445 */ 1446 typedef struct { 1447 int (*ws_func)(mac_flowinfo_t *, void *); 1448 void *ws_arg; 1449 } flow_walk_state_t; 1450 1451 static void 1452 mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent) 1453 { 1454 (void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name, 1455 MAXFLOWNAMELEN); 1456 finfop->fi_link_id = flent->fe_link_id; 1457 finfop->fi_flow_desc = flent->fe_flow_desc; 1458 finfop->fi_resource_props = flent->fe_resource_props; 1459 } 1460 1461 static int 1462 mac_link_flow_walk_cb(flow_entry_t *flent, void *arg) 1463 { 1464 flow_walk_state_t *statep = arg; 1465 mac_flowinfo_t finfo; 1466 1467 mac_link_flowinfo_copy(&finfo, flent); 1468 return (statep->ws_func(&finfo, statep->ws_arg)); 1469 } 1470 1471 /* 1472 * mac_link_flow_walk() 1473 * Invokes callback 'func' for all flows belonging to the specified link. 
1474 */ 1475 int 1476 mac_link_flow_walk(datalink_id_t linkid, 1477 int (*func)(mac_flowinfo_t *, void *), void *arg) 1478 { 1479 mac_client_impl_t *mcip; 1480 mac_perim_handle_t mph; 1481 flow_walk_state_t state; 1482 dls_dl_handle_t dlh; 1483 dls_link_t *dlp; 1484 int err; 1485 1486 err = mac_perim_enter_by_linkid(linkid, &mph); 1487 if (err != 0) 1488 return (err); 1489 1490 err = dls_devnet_hold_link(linkid, &dlh, &dlp); 1491 if (err != 0) { 1492 mac_perim_exit(mph); 1493 return (err); 1494 } 1495 1496 mcip = (mac_client_impl_t *)dlp->dl_mch; 1497 state.ws_func = func; 1498 state.ws_arg = arg; 1499 1500 err = mac_flow_walk_nolock(mcip->mci_subflow_tab, 1501 mac_link_flow_walk_cb, &state); 1502 1503 dls_devnet_rele_link(dlh, dlp); 1504 mac_perim_exit(mph); 1505 return (err); 1506 } 1507 1508 /* 1509 * mac_link_flow_info() 1510 * Retrieves information about a specific flow. 1511 */ 1512 int 1513 mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo) 1514 { 1515 flow_entry_t *flent; 1516 int err; 1517 1518 err = mac_flow_lookup_byname(flow_name, &flent); 1519 if (err != 0) 1520 return (err); 1521 1522 mac_link_flowinfo_copy(finfo, flent); 1523 FLOW_USER_REFRELE(flent); 1524 return (0); 1525 } 1526 1527 #define HASH_MAC_VID(a, v, s) \ 1528 ((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s)) 1529 1530 #define PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end)) 1531 1532 #define CHECK_AND_ADJUST_START_PTR(s, start) { \ 1533 if ((s)->fs_mp->b_wptr == (start)) { \ 1534 mblk_t *next = (s)->fs_mp->b_cont; \ 1535 if (next == NULL) \ 1536 return (EINVAL); \ 1537 \ 1538 (s)->fs_mp = next; \ 1539 (start) = next->b_rptr; \ 1540 } \ 1541 } 1542 1543 /* ARGSUSED */ 1544 static boolean_t 1545 flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1546 { 1547 flow_l2info_t *l2 = &s->fs_l2info; 1548 flow_desc_t *fd = &flent->fe_flow_desc; 1549 1550 return (l2->l2_vid == fd->fd_vid && 1551 bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0); 1552 } 1553 1554 
/*
 * Layer 2 hash function.
 * Must be paired with flow_l2_accept() within a set of flow_ops
 * because it assumes the dest address is already extracted.
 */
static uint32_t
flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t	*l2 = &s->fs_l2info;

	return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
}

/*
 * This is the generic layer 2 accept function.
 * It makes use of mac_header_info() to extract the header length,
 * sap, vlan ID and destination address.
 */
static int
flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
{
	boolean_t		is_ether;
	flow_l2info_t		*l2 = &s->fs_l2info;
	mac_header_info_t	mhi;
	int			err;

	is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
	if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
	    s->fs_mp, &mhi)) != 0) {
		/*
		 * Map EINVAL to ENOBUFS so the caller treats a truncated
		 * header the same as any other too-short packet.
		 */
		if (err == EINVAL)
			err = ENOBUFS;

		return (err);
	}

	l2->l2_start = s->fs_mp->b_rptr;
	l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;

	/* Parse the 802.1Q tag unless the caller asked to ignore VLANs. */
	if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
		struct ether_vlan_header *evhp =
		    (struct ether_vlan_header *)l2->l2_start;

		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
			return (ENOBUFS);

		l2->l2_sap = ntohs(evhp->ether_type);
		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
		l2->l2_hdrsize = sizeof (*evhp);
	} else {
		l2->l2_sap = mhi.mhi_bindsap;
		l2->l2_vid = 0;
		l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
	}
	return (0);
}

/*
 * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
 * accept(). The notable difference is that dest address is now extracted
 * by hash() rather than by accept(). This saves a few memory references
 * for flow tables that do not care about mac addresses.
 */
static uint32_t
flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t			*l2 = &s->fs_l2info;
	struct ether_vlan_header	*evhp;

	/*
	 * Safe to dereference: flow_ether_accept() already verified the
	 * packet holds at least a full ether_header (which covers dhost).
	 */
	evhp = (struct ether_vlan_header *)l2->l2_start;
	l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
	return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
}

/*
 * Ethernet-specific accept: extracts sap, VLAN id and header size
 * directly from the frame without going through mac_header_info().
 */
/* ARGSUSED */
static int
flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t			*l2 = &s->fs_l2info;
	struct ether_vlan_header	*evhp;
	uint16_t			sap;

	evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
	l2->l2_start = (uchar_t *)evhp;

	if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
		return (ENOBUFS);

	if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
			return (ENOBUFS);

		l2->l2_sap = ntohs(evhp->ether_type);
		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
		l2->l2_hdrsize = sizeof (struct ether_vlan_header);
	} else {
		l2->l2_sap = sap;
		l2->l2_vid = 0;
		l2->l2_hdrsize = sizeof (struct ether_header);
	}
	return (0);
}

/*
 * Validates a layer 2 flow entry.
 */
static int
flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	int		i;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	/*
	 * Dest address is mandatory.
	 */
	if ((fd->fd_mask & FLOW_LINK_DST) == 0)
		return (EINVAL);

	/* Reject an all-zero or short destination MAC address. */
	for (i = 0; i < fd->fd_mac_len; i++) {
		if (fd->fd_dst_mac[i] != 0)
			break;
	}
	if (i == fd->fd_mac_len || fd->fd_mac_len < ETHERADDRL)
		return (EINVAL);

	if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
		/*
		 * VLAN flows are only supported over ethernet macs.
		 */
		if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
			return (EINVAL);

		if (fd->fd_vid == 0)
			return (EINVAL);

	}
	flent->fe_match = flow_l2_match;
	return (0);
}

/*
 * Calculates hash index of flow entry.
 */
static uint32_t
flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
	return (HASH_MAC_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
}

/*
 * This is used for duplicate flow checking.
 */
/* ARGSUSED */
static boolean_t
flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
	return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
	    fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
}

/*
 * Generic flow entry insertion function.
 * Used by flow tables that do not have ordering requirements.
 */
/* ARGSUSED */
static int
flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
    flow_entry_t *flent)
{
	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/* Prepend to the bucket's singly-linked list. */
	if (*headp != NULL) {
		ASSERT(flent->fe_next == NULL);
		flent->fe_next = *headp;
	}
	*headp = flent;
	return (0);
}

/*
 * IP version independent DSField matching function.
 */
/* ARGSUSED */
static boolean_t
flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	switch (l3info->l3_version) {
	case IPV4_VERSION: {
		ipha_t	*ipha = (ipha_t *)l3info->l3_start;

		return ((ipha->ipha_type_of_service &
		    fd->fd_dsfield_mask) == fd->fd_dsfield);
	}
	case IPV6_VERSION: {
		ip6_t	*ip6h = (ip6_t *)l3info->l3_start;

		return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
		    fd->fd_dsfield_mask) == fd->fd_dsfield);
	}
	default:
		return (B_FALSE);
	}
}

/*
 * IP v4 and v6 address matching.
 * The netmask only needs to be applied on the packet but not on the
 * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets.
 */

/* ARGSUSED */
static boolean_t
flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;
	ipha_t		*ipha = (ipha_t *)l3info->l3_start;
	in_addr_t	addr;

	/* l3_dst_or_src was set by flow_ip_hash() based on direction. */
	addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
		return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
		    V4_PART_OF_V6(fd->fd_local_addr));
	}
	return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
	    V4_PART_OF_V6(fd->fd_remote_addr));
}

/* ARGSUSED */
static boolean_t
flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;
	ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
	in6_addr_t	*addrp;

	addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
		return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
		    fd->fd_local_addr));
	}
	return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
}

/*
 * Matches a packet against an IP-protocol flow entry (e.g. TCP, UDP).
 */
/* ARGSUSED */
static boolean_t
flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (l3info->l3_protocol == fd->fd_protocol);
}

/*
 * Packet hash function for IP address / dsfield flow tables.
 * Also records which address (src or dst) subsequent match functions
 * should compare, based on flow direction.
 */
static uint32_t
flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_mask_t	mask = ft->ft_mask;

	if ((mask & FLOW_IP_LOCAL) != 0) {
		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
	} else if ((mask & FLOW_IP_REMOTE) != 0) {
		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
	} else if ((mask & FLOW_IP_DSFIELD) != 0) {
		/*
		 * DSField flents are arranged as a single list.
		 */
		return (0);
	}
	/*
	 * IP addr flents are hashed into two lists, v4 or v6.
	 */
	ASSERT(ft->ft_size >= 2);
	return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
}

/*
 * Packet hash function for IP-protocol flow tables.
 */
static uint32_t
flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;

	return (l3info->l3_protocol % ft->ft_size);
}

/*
 * Layer 3 accept function: locates the IP header, extracts the
 * protocol, version and fragmentation state into fs_l3info.
 */
/* ARGSUSED */
static int
flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t	*l2info = &s->fs_l2info;
	flow_l3info_t	*l3info = &s->fs_l3info;
	uint16_t	sap = l2info->l2_sap;
	uchar_t		*l3_start;

	l3_start = l2info->l2_start + l2info->l2_hdrsize;

	/*
	 * Adjust start pointer if we're at the end of an mblk.
	 */
	CHECK_AND_ADJUST_START_PTR(s, l3_start);

	l3info->l3_start = l3_start;
	if (!OK_32PTR(l3_start))
		return (EINVAL);

	switch (sap) {
	case ETHERTYPE_IP: {
		ipha_t	*ipha = (ipha_t *)l3_start;

		if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
			return (ENOBUFS);

		l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
		l3info->l3_protocol = ipha->ipha_protocol;
		l3info->l3_version = IPV4_VERSION;
		l3info->l3_fragmented =
		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
		break;
	}
	case ETHERTYPE_IPV6: {
		ip6_t		*ip6h = (ip6_t *)l3_start;
		uint16_t	ip6_hdrlen;
		uint8_t		nexthdr;

		if (!mac_ip_hdr_length_v6(s->fs_mp, ip6h, &ip6_hdrlen,
		    &nexthdr)) {
			return (ENOBUFS);
		}
		l3info->l3_hdrsize = ip6_hdrlen;
		l3info->l3_protocol = nexthdr;
		l3info->l3_version = IPV6_VERSION;
		l3info->l3_fragmented = B_FALSE;
		break;
	}
	default:
		return (EINVAL);
	}
	return (0);
}

/*
 * Validates an IP-protocol flow entry; only a fixed set of protocols
 * is accepted.
 */
/* ARGSUSED */
static int
flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	switch (fd->fd_protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		flent->fe_match = flow_ip_proto_match;
		return (0);
	default:
		return (EINVAL);
	}
}

/*
 * Validates an IP address / dsfield flow entry and selects the
 * appropriate match function.
 */
/* ARGSUSED */
static int
flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;
	flow_mask_t	mask;
	uint8_t		version;
	in6_addr_t	*addr, *netmask;

	/*
	 * DSField does not require an IP version.
	 */
	if (fd->fd_mask == FLOW_IP_DSFIELD) {
		if (fd->fd_dsfield_mask == 0)
			return (EINVAL);

		flent->fe_match = flow_ip_dsfield_match;
		return (0);
	}

	/*
	 * IP addresses must come with a version to avoid ambiguity.
	 */
	if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
		return (EINVAL);

	version = fd->fd_ipversion;
	if (version != IPV4_VERSION && version != IPV6_VERSION)
		return (EINVAL);

	mask = fd->fd_mask & ~FLOW_IP_VERSION;
	switch (mask) {
	case FLOW_IP_LOCAL:
		addr = &fd->fd_local_addr;
		netmask = &fd->fd_local_netmask;
		break;
	case FLOW_IP_REMOTE:
		addr = &fd->fd_remote_addr;
		netmask = &fd->fd_remote_netmask;
		break;
	default:
		return (EINVAL);
	}

	/*
	 * Apply netmask onto specified address.
	 */
	V6_MASK_COPY(*addr, *netmask, *addr);
	if (version == IPV4_VERSION) {
		ipaddr_t	v4addr = V4_PART_OF_V6((*addr));
		ipaddr_t	v4mask = V4_PART_OF_V6((*netmask));

		if (v4addr == 0 || v4mask == 0)
			return (EINVAL);
		flent->fe_match = flow_ip_v4_match;
	} else {
		if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
		    IN6_IS_ADDR_UNSPECIFIED(netmask))
			return (EINVAL);
		flent->fe_match = flow_ip_v6_match;
	}
	return (0);
}

/*
 * Flow entry hash function for IP-protocol tables.
 */
static uint32_t
flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (fd->fd_protocol % ft->ft_size);
}

/*
 * Flow entry hash function for IP address / dsfield tables.
 */
static uint32_t
flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	/*
	 * DSField flents are arranged as a single list.
	 */
	if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
		return (0);

	/*
	 * IP addr flents are hashed into two lists, v4 or v6.
	 */
	ASSERT(ft->ft_size >= 2);
	return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
}

/*
 * Duplicate-flow check for IP-protocol tables.
 */
/* ARGSUSED */
static boolean_t
flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	return (fd1->fd_protocol == fd2->fd_protocol);
}

/*
 * Duplicate-flow check for IP address / dsfield tables.
 */
/* ARGSUSED */
static boolean_t
flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
	in6_addr_t	*a1, *m1, *a2, *m2;

	ASSERT(fd1->fd_mask == fd2->fd_mask);
	if (fd1->fd_mask == FLOW_IP_DSFIELD) {
		return (fd1->fd_dsfield == fd2->fd_dsfield &&
		    fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
	}

	/*
	 * flow_ip_accept_fe() already validated the version.
	 */
	ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
	if (fd1->fd_ipversion != fd2->fd_ipversion)
		return (B_FALSE);

	switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
	case FLOW_IP_LOCAL:
		a1 = &fd1->fd_local_addr;
		m1 = &fd1->fd_local_netmask;
		a2 = &fd2->fd_local_addr;
		m2 = &fd2->fd_local_netmask;
		break;
	case FLOW_IP_REMOTE:
		a1 = &fd1->fd_remote_addr;
		m1 = &fd1->fd_remote_netmask;
		a2 = &fd2->fd_remote_addr;
		m2 = &fd2->fd_remote_netmask;
		break;
	default:
		/*
		 * This is unreachable given the checks in
		 * flow_ip_accept_fe().
		 */
		return (B_FALSE);
	}

	if (fd1->fd_ipversion == IPV4_VERSION) {
		return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
		    V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));

	} else {
		return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
		    IN6_ARE_ADDR_EQUAL(m1, m2));
	}
}

/*
 * Converts an IPv6-form netmask into its prefix length (0..IPV6_ABITS).
 * Works for v4 masks too since those are stored as v4-mapped v6 masks.
 */
static int
flow_ip_mask2plen(in6_addr_t *v6mask)
{
	int	bits;
	int	plen = IPV6_ABITS;
	int	i;

	/* Walk 32-bit words from least to most significant. */
	for (i = 3; i >= 0; i--) {
		if (v6mask->s6_addr32[i] == 0) {
			plen -= 32;
			continue;
		}
		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
		if (bits == 0)
			break;
		plen -= bits;
	}
	return (plen);
}

/*
 * Insertion function for IP flow entries; keeps address flows ordered
 * by descending prefix length so the most specific subnet matches first.
 */
/* ARGSUSED */
static int
flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
    flow_entry_t *flent)
{
	flow_entry_t	**p = headp;
	flow_desc_t	*fd0, *fd;
	in6_addr_t	*m0, *m;
	int		plen0, plen;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * No special ordering needed for dsfield.
	 */
	fd0 = &flent->fe_flow_desc;
	if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
		if (*p != NULL) {
			ASSERT(flent->fe_next == NULL);
			flent->fe_next = *p;
		}
		*p = flent;
		return (0);
	}

	/*
	 * IP address flows are arranged in descending prefix length order.
	 */
	m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
	    &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
	plen0 = flow_ip_mask2plen(m0);
	ASSERT(plen0 != 0);

	for (; *p != NULL; p = &(*p)->fe_next) {
		fd = &(*p)->fe_flow_desc;

		/*
		 * Normally a dsfield flent shouldn't end up on the same
		 * list as an IP address because flow tables are (for now)
		 * disjoint. If we decide to support both IP and dsfield
		 * in the same table in the future, this check will allow
		 * for that.
		 */
		if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
			continue;

		/*
		 * We also allow for the mixing of local and remote address
		 * flents within one list.
		 */
		m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
		    &fd->fd_local_netmask : &fd->fd_remote_netmask;
		plen = flow_ip_mask2plen(m);

		if (plen <= plen0)
			break;
	}
	if (*p != NULL) {
		ASSERT(flent->fe_next == NULL);
		flent->fe_next = *p;
	}
	*p = flent;
	return (0);
}

/*
 * Transport layer protocol and port matching functions.
 */

/* ARGSUSED */
static boolean_t
flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (fd->fd_protocol == l3info->l3_protocol &&
	    fd->fd_local_port == l4info->l4_hash_port);
}

/* ARGSUSED */
static boolean_t
flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (fd->fd_protocol == l3info->l3_protocol &&
	    fd->fd_remote_port == l4info->l4_hash_port);
}

/*
 * Transport hash function.
 * Since we only support either local or remote port flows,
 * we only need to extract one of the ports to be used for
 * matching.
 */
static uint32_t
flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	uint8_t		proto = l3info->l3_protocol;
	boolean_t	dst_or_src;

	if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
		dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
	} else {
		dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
	}

	/* Stash the chosen port for the match functions above. */
	l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
	    l4info->l4_src_port;

	return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
}

/*
 * Unlike other accept() functions above, we do not need to get the header
 * size because this is our highest layer so far. If we want to do support
 * other higher layer protocols, we would need to save the l4_hdrsize
 * in the code below.
 */

/* ARGSUSED */
static int
flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	uint8_t		proto = l3info->l3_protocol;
	uchar_t		*l4_start;

	l4_start = l3info->l3_start + l3info->l3_hdrsize;

	/*
	 * Adjust start pointer if we're at the end of an mblk.
	 */
	CHECK_AND_ADJUST_START_PTR(s, l4_start);

	l4info->l4_start = l4_start;
	if (!OK_32PTR(l4_start))
		return (EINVAL);

	/* Fragments don't carry a transport header (except the first). */
	if (l3info->l3_fragmented == B_TRUE)
		return (EINVAL);

	switch (proto) {
	case IPPROTO_TCP: {
		struct tcphdr	*tcph = (struct tcphdr *)l4_start;

		if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
			return (ENOBUFS);

		/* Ports are kept in network byte order. */
		l4info->l4_src_port = tcph->th_sport;
		l4info->l4_dst_port = tcph->th_dport;
		break;
	}
	case IPPROTO_UDP: {
		struct udphdr	*udph = (struct udphdr *)l4_start;

		if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
			return (ENOBUFS);

		l4info->l4_src_port = udph->uh_sport;
		l4info->l4_dst_port = udph->uh_dport;
		break;
	}
	case IPPROTO_SCTP: {
		sctp_hdr_t	*sctph = (sctp_hdr_t *)l4_start;

		if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
			return (ENOBUFS);

		l4info->l4_src_port = sctph->sh_sport;
		l4info->l4_dst_port = sctph->sh_dport;
		break;
	}
	default:
		return (EINVAL);
	}

	return (0);
}

/*
 * Validates transport flow entry.
 * The protocol field must be present.
 */

/* ARGSUSED */
static int
flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;
	flow_mask_t	mask = fd->fd_mask;

	if ((mask & FLOW_IP_PROTOCOL) == 0)
		return (EINVAL);

	switch (fd->fd_protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
		break;
	default:
		return (EINVAL);
	}

	switch (mask & ~FLOW_IP_PROTOCOL) {
	case FLOW_ULP_PORT_LOCAL:
		if (fd->fd_local_port == 0)
			return (EINVAL);

		flent->fe_match = flow_transport_lport_match;
		break;
	case FLOW_ULP_PORT_REMOTE:
		if (fd->fd_remote_port == 0)
			return (EINVAL);

		flent->fe_match = flow_transport_rport_match;
		break;
	case 0:
		/*
		 * transport-only flows conflict with our table type.
		 */
		return (EOPNOTSUPP);
	default:
		return (EINVAL);
	}

	return (0);
}

/*
 * Flow entry hash function for transport tables; mirrors
 * flow_transport_hash() using the configured port.
 */
static uint32_t
flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;
	uint16_t	port = 0;

	port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
	    fd->fd_local_port : fd->fd_remote_port;

	return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
}

/*
 * Duplicate-flow check for transport tables.
 */
/* ARGSUSED */
static boolean_t
flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	if (fd1->fd_protocol != fd2->fd_protocol)
		return (B_FALSE);

	if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
		return (fd1->fd_local_port == fd2->fd_local_port);

	return (fd1->fd_remote_port == fd2->fd_remote_port);
}

/*
 * Per-table-type operation vectors. The final member is the chain of
 * accept functions applied in order to each packet.
 */
static flow_ops_t flow_l2_ops = {
	flow_l2_accept_fe,
	flow_l2_hash_fe,
	flow_l2_match_fe,
	flow_generic_insert_fe,
	flow_l2_hash,
	{flow_l2_accept}
};

static flow_ops_t flow_ip_ops = {
	flow_ip_accept_fe,
	flow_ip_hash_fe,
	flow_ip_match_fe,
	flow_ip_insert_fe,
	flow_ip_hash,
	{flow_l2_accept, flow_ip_accept}
};

static flow_ops_t flow_ip_proto_ops = {
	flow_ip_proto_accept_fe,
	flow_ip_proto_hash_fe,
	flow_ip_proto_match_fe,
	flow_generic_insert_fe,
	flow_ip_proto_hash,
	{flow_l2_accept, flow_ip_accept}
};

static flow_ops_t flow_transport_ops = {
	flow_transport_accept_fe,
	flow_transport_hash_fe,
	flow_transport_match_fe,
	flow_generic_insert_fe,
	flow_transport_hash,
	{flow_l2_accept, flow_ip_accept, flow_transport_accept}
};

/*
 * Maps each supported flow mask to its ops vector and table size.
 */
static flow_tab_info_t flow_tab_info_list[] = {
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
	{&flow_ip_ops, FLOW_IP_DSFIELD, 1},
	{&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024}
};

#define	FLOW_MAX_TAB_INFO \
	((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t))

/*
 * Returns the flow_tab_info_t for the given mask, or NULL if the mask
 * does not correspond to a supported flow table type.
 */
static flow_tab_info_t *
mac_flow_tab_info_get(flow_mask_t mask)
{
	int	i;

	for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
		if (mask == flow_tab_info_list[i].fti_mask)
			return (&flow_tab_info_list[i]);
	}
	return (NULL);
}