1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * Copyright 2018 Joyent, Inc. 26 */ 27 28 #include <sys/strsun.h> 29 #include <sys/sdt.h> 30 #include <sys/mac.h> 31 #include <sys/mac_impl.h> 32 #include <sys/mac_client_impl.h> 33 #include <sys/mac_stat.h> 34 #include <sys/dls.h> 35 #include <sys/dls_impl.h> 36 #include <sys/mac_soft_ring.h> 37 #include <sys/ethernet.h> 38 #include <sys/cpupart.h> 39 #include <sys/pool.h> 40 #include <sys/pool_pset.h> 41 #include <sys/vlan.h> 42 #include <inet/ip.h> 43 #include <inet/ip6.h> 44 #include <netinet/tcp.h> 45 #include <netinet/udp.h> 46 #include <netinet/sctp.h> 47 48 typedef struct flow_stats_s { 49 uint64_t fs_obytes; 50 uint64_t fs_opackets; 51 uint64_t fs_oerrors; 52 uint64_t fs_ibytes; 53 uint64_t fs_ipackets; 54 uint64_t fs_ierrors; 55 } flow_stats_t; 56 57 58 /* global flow table, will be a per exclusive-zone table later */ 59 static mod_hash_t *flow_hash; 60 static krwlock_t flow_tab_lock; 61 62 static kmem_cache_t *flow_cache; 63 static kmem_cache_t *flow_tab_cache; 64 static flow_ops_t flow_l2_ops; 65 66 typedef struct { 67 const char *fs_name; 68 uint_t fs_offset; 69 } flow_stats_info_t; 70 71 #define FS_OFF(f) (offsetof(flow_stats_t, f)) 72 static flow_stats_info_t flow_stats_list[] = { 73 {"rbytes", FS_OFF(fs_ibytes)}, 74 {"ipackets", FS_OFF(fs_ipackets)}, 75 {"ierrors", FS_OFF(fs_ierrors)}, 76 {"obytes", FS_OFF(fs_obytes)}, 77 {"opackets", FS_OFF(fs_opackets)}, 78 {"oerrors", FS_OFF(fs_oerrors)} 79 }; 80 #define FS_SIZE (sizeof (flow_stats_list) / sizeof (flow_stats_info_t)) 81 82 /* 83 * Checks whether a flow mask is legal. 84 */ 85 static flow_tab_info_t *mac_flow_tab_info_get(flow_mask_t); 86 87 static void 88 flow_stat_init(kstat_named_t *knp) 89 { 90 int i; 91 92 for (i = 0; i < FS_SIZE; i++, knp++) { 93 kstat_named_init(knp, flow_stats_list[i].fs_name, 94 KSTAT_DATA_UINT64); 95 } 96 } 97 98 static int 99 flow_stat_update(kstat_t *ksp, int rw) 100 { 101 flow_entry_t *fep = ksp->ks_private; 102 kstat_named_t *knp = ksp->ks_data; 103 uint64_t *statp; 104 int i; 105 mac_rx_stats_t *mac_rx_stat; 106 mac_tx_stats_t *mac_tx_stat; 107 flow_stats_t flow_stats; 108 mac_soft_ring_set_t *mac_srs; 109 110 if (rw != KSTAT_READ) 111 return (EACCES); 112 113 bzero(&flow_stats, sizeof (flow_stats_t)); 114 115 for (i = 0; i < fep->fe_rx_srs_cnt; i++) { 116 mac_srs = (mac_soft_ring_set_t *)fep->fe_rx_srs[i]; 117 if (mac_srs == NULL) /* Multicast flow */ 118 break; 119 mac_rx_stat = &mac_srs->srs_rx.sr_stat; 120 121 flow_stats.fs_ibytes += mac_rx_stat->mrs_intrbytes + 122 mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes; 123 124 flow_stats.fs_ipackets += mac_rx_stat->mrs_intrcnt + 125 mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt; 126 127 flow_stats.fs_ierrors += mac_rx_stat->mrs_ierrors; 128 } 129 130 mac_srs = (mac_soft_ring_set_t *)fep->fe_tx_srs; 131 if (mac_srs == NULL) /* Multicast flow */ 132 goto done; 133 mac_tx_stat = &mac_srs->srs_tx.st_stat; 134 135 flow_stats.fs_obytes = mac_tx_stat->mts_obytes; 136 flow_stats.fs_opackets = mac_tx_stat->mts_opackets; 137 flow_stats.fs_oerrors = mac_tx_stat->mts_oerrors; 138 139 done: 140 for (i = 0; i < FS_SIZE; i++, knp++) { 141 statp = (uint64_t *) 142 ((uchar_t *)&flow_stats + flow_stats_list[i].fs_offset); 143 knp->value.ui64 = *statp; 144 } 145 return (0); 146 } 147 148 static void 149 flow_stat_create(flow_entry_t *fep) 150 { 151 kstat_t *ksp; 152 kstat_named_t *knp; 153 uint_t nstats = FS_SIZE; 154 155 /* 156 * Fow now, flow entries are only manipulated and visible from the 157 * global zone. 158 */ 159 ksp = kstat_create_zone("unix", 0, (char *)fep->fe_flow_name, "flow", 160 KSTAT_TYPE_NAMED, nstats, 0, GLOBAL_ZONEID); 161 if (ksp == NULL) 162 return; 163 164 ksp->ks_update = flow_stat_update; 165 ksp->ks_private = fep; 166 fep->fe_ksp = ksp; 167 168 knp = (kstat_named_t *)ksp->ks_data; 169 flow_stat_init(knp); 170 kstat_install(ksp); 171 } 172 173 void 174 flow_stat_destroy(flow_entry_t *fep) 175 { 176 if (fep->fe_ksp != NULL) { 177 kstat_delete(fep->fe_ksp); 178 fep->fe_ksp = NULL; 179 } 180 } 181 182 /* 183 * Initialize the flow table 184 */ 185 void 186 mac_flow_init() 187 { 188 flow_cache = kmem_cache_create("flow_entry_cache", 189 sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 190 flow_tab_cache = kmem_cache_create("flow_tab_cache", 191 sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 192 flow_hash = mod_hash_create_extended("flow_hash", 193 100, mod_hash_null_keydtor, mod_hash_null_valdtor, 194 mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); 195 rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL); 196 } 197 198 /* 199 * Cleanup and release the flow table 200 */ 201 void 202 mac_flow_fini() 203 { 204 kmem_cache_destroy(flow_cache); 205 kmem_cache_destroy(flow_tab_cache); 206 mod_hash_destroy_hash(flow_hash); 207 rw_destroy(&flow_tab_lock); 208 } 209 210 /* 211 * mac_create_flow(): create a flow_entry_t. 212 */ 213 int 214 mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name, 215 void *client_cookie, uint_t type, flow_entry_t **flentp) 216 { 217 flow_entry_t *flent = *flentp; 218 int err = 0; 219 220 if (mrp != NULL) { 221 err = mac_validate_props(NULL, mrp); 222 if (err != 0) 223 return (err); 224 } 225 226 if (flent == NULL) { 227 flent = kmem_cache_alloc(flow_cache, KM_SLEEP); 228 bzero(flent, sizeof (*flent)); 229 mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL); 230 cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL); 231 232 /* Initialize the receiver function to a safe routine */ 233 flent->fe_cb_fn = (flow_fn_t)mac_rx_def; 234 flent->fe_index = -1; 235 } 236 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); 237 238 /* This is an initial flow, will be configured later */ 239 if (fd == NULL) { 240 *flentp = flent; 241 return (0); 242 } 243 244 flent->fe_client_cookie = client_cookie; 245 flent->fe_type = type; 246 247 /* Save flow desc */ 248 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); 249 250 if (mrp != NULL) { 251 /* 252 * We have already set fe_resource_props for a Link. 253 */ 254 if (type & FLOW_USER) { 255 bcopy(mrp, &flent->fe_resource_props, 256 sizeof (mac_resource_props_t)); 257 } 258 /* 259 * The effective resource list should reflect the priority 260 * that we set implicitly. 261 */ 262 if (!(mrp->mrp_mask & MRP_PRIORITY)) 263 mrp->mrp_mask |= MRP_PRIORITY; 264 if (type & FLOW_USER) 265 mrp->mrp_priority = MPL_SUBFLOW_DEFAULT; 266 else 267 mrp->mrp_priority = MPL_LINK_DEFAULT; 268 bzero(mrp->mrp_pool, MAXPATHLEN); 269 bzero(&mrp->mrp_cpus, sizeof (mac_cpus_t)); 270 bcopy(mrp, &flent->fe_effective_props, 271 sizeof (mac_resource_props_t)); 272 } 273 flow_stat_create(flent); 274 275 *flentp = flent; 276 return (0); 277 } 278 279 /* 280 * Validate flow entry and add it to a flow table. 281 */ 282 int 283 mac_flow_add(flow_tab_t *ft, flow_entry_t *flent) 284 { 285 flow_entry_t **headp, **p; 286 flow_ops_t *ops = &ft->ft_ops; 287 flow_mask_t mask; 288 uint32_t index; 289 int err; 290 291 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 292 293 /* 294 * Check for invalid bits in mask. 295 */ 296 mask = flent->fe_flow_desc.fd_mask; 297 if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0) 298 return (EOPNOTSUPP); 299 300 /* 301 * Validate flent. 302 */ 303 if ((err = ops->fo_accept_fe(ft, flent)) != 0) { 304 DTRACE_PROBE3(accept_failed, flow_tab_t *, ft, 305 flow_entry_t *, flent, int, err); 306 return (err); 307 } 308 309 /* 310 * Flent is valid. now calculate hash and insert it 311 * into hash table. 312 */ 313 index = ops->fo_hash_fe(ft, flent); 314 315 /* 316 * We do not need a lock up until now because we were 317 * not accessing the flow table. 318 */ 319 rw_enter(&ft->ft_lock, RW_WRITER); 320 headp = &ft->ft_table[index]; 321 322 /* 323 * Check for duplicate flow. 324 */ 325 for (p = headp; *p != NULL; p = &(*p)->fe_next) { 326 if ((*p)->fe_flow_desc.fd_mask != 327 flent->fe_flow_desc.fd_mask) 328 continue; 329 330 if (ft->ft_ops.fo_match_fe(ft, *p, flent)) { 331 rw_exit(&ft->ft_lock); 332 DTRACE_PROBE3(dup_flow, flow_tab_t *, ft, 333 flow_entry_t *, flent, int, err); 334 return (EALREADY); 335 } 336 } 337 338 /* 339 * Insert flow to hash list. 340 */ 341 err = ops->fo_insert_fe(ft, headp, flent); 342 if (err != 0) { 343 rw_exit(&ft->ft_lock); 344 DTRACE_PROBE3(insert_failed, flow_tab_t *, ft, 345 flow_entry_t *, flent, int, err); 346 return (err); 347 } 348 349 /* 350 * Save the hash index so it can be used by mac_flow_remove(). 351 */ 352 flent->fe_index = (int)index; 353 354 /* 355 * Save the flow tab back reference. 356 */ 357 flent->fe_flow_tab = ft; 358 FLOW_MARK(flent, FE_FLOW_TAB); 359 ft->ft_flow_count++; 360 rw_exit(&ft->ft_lock); 361 return (0); 362 } 363 364 /* 365 * Remove a flow from a mac client's subflow table 366 */ 367 void 368 mac_flow_rem_subflow(flow_entry_t *flent) 369 { 370 flow_tab_t *ft = flent->fe_flow_tab; 371 mac_client_impl_t *mcip = ft->ft_mcip; 372 mac_handle_t mh = (mac_handle_t)ft->ft_mip; 373 374 ASSERT(MAC_PERIM_HELD(mh)); 375 376 mac_flow_remove(ft, flent, B_FALSE); 377 if (flent->fe_mcip == NULL) { 378 /* 379 * The interface is not yet plumbed and mac_client_flow_add 380 * was not done. 381 */ 382 if (FLOW_TAB_EMPTY(ft)) { 383 mac_flow_tab_destroy(ft); 384 mcip->mci_subflow_tab = NULL; 385 } 386 } else { 387 mac_flow_wait(flent, FLOW_DRIVER_UPCALL); 388 mac_link_flow_clean((mac_client_handle_t)mcip, flent); 389 } 390 mac_fastpath_enable(mh); 391 } 392 393 /* 394 * Add a flow to a mac client's subflow table and instantiate the flow 395 * in the mac by creating the associated SRSs etc. 396 */ 397 int 398 mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent, 399 boolean_t instantiate_flow) 400 { 401 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 402 mac_handle_t mh = (mac_handle_t)mcip->mci_mip; 403 flow_tab_info_t *ftinfo; 404 flow_mask_t mask; 405 flow_tab_t *ft; 406 int err; 407 boolean_t ft_created = B_FALSE; 408 409 ASSERT(MAC_PERIM_HELD(mh)); 410 411 if ((err = mac_fastpath_disable(mh)) != 0) 412 return (err); 413 414 /* 415 * If the subflow table exists already just add the new subflow 416 * to the existing table, else we create a new subflow table below. 417 */ 418 ft = mcip->mci_subflow_tab; 419 if (ft == NULL) { 420 mask = flent->fe_flow_desc.fd_mask; 421 /* 422 * Try to create a new table and then add the subflow to the 423 * newly created subflow table 424 */ 425 if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) { 426 mac_fastpath_enable(mh); 427 return (EOPNOTSUPP); 428 } 429 430 mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size, 431 mcip->mci_mip, &ft); 432 ft_created = B_TRUE; 433 } 434 435 err = mac_flow_add(ft, flent); 436 if (err != 0) { 437 if (ft_created) 438 mac_flow_tab_destroy(ft); 439 mac_fastpath_enable(mh); 440 return (err); 441 } 442 443 if (instantiate_flow) { 444 /* Now activate the flow by creating its SRSs */ 445 ASSERT(MCIP_DATAPATH_SETUP(mcip)); 446 err = mac_link_flow_init((mac_client_handle_t)mcip, flent); 447 if (err != 0) { 448 mac_flow_remove(ft, flent, B_FALSE); 449 if (ft_created) 450 mac_flow_tab_destroy(ft); 451 mac_fastpath_enable(mh); 452 return (err); 453 } 454 } else { 455 FLOW_MARK(flent, FE_UF_NO_DATAPATH); 456 } 457 if (ft_created) { 458 ASSERT(mcip->mci_subflow_tab == NULL); 459 ft->ft_mcip = mcip; 460 mcip->mci_subflow_tab = ft; 461 if (instantiate_flow) 462 mac_client_update_classifier(mcip, B_TRUE); 463 } 464 return (0); 465 } 466 467 /* 468 * Remove flow entry from flow table. 469 */ 470 void 471 mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp) 472 { 473 flow_entry_t **fp; 474 475 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 476 if (!(flent->fe_flags & FE_FLOW_TAB)) 477 return; 478 479 rw_enter(&ft->ft_lock, RW_WRITER); 480 /* 481 * If this is a permanent removal from the flow table, mark it 482 * CONDEMNED to prevent future references. If this is a temporary 483 * removal from the table, say to update the flow descriptor then 484 * we don't mark it CONDEMNED 485 */ 486 if (!temp) 487 FLOW_MARK(flent, FE_CONDEMNED); 488 /* 489 * Locate the specified flent. 490 */ 491 fp = &ft->ft_table[flent->fe_index]; 492 while (*fp != flent) 493 fp = &(*fp)->fe_next; 494 495 /* 496 * The flent must exist. Otherwise it's a bug. 497 */ 498 ASSERT(fp != NULL); 499 *fp = flent->fe_next; 500 flent->fe_next = NULL; 501 502 /* 503 * Reset fe_index to -1 so any attempt to call mac_flow_remove() 504 * on a flent that is supposed to be in the table (FE_FLOW_TAB) 505 * will panic. 506 */ 507 flent->fe_index = -1; 508 FLOW_UNMARK(flent, FE_FLOW_TAB); 509 ft->ft_flow_count--; 510 rw_exit(&ft->ft_lock); 511 } 512 513 /* 514 * This is the flow lookup routine used by the mac sw classifier engine. 515 */ 516 int 517 mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp) 518 { 519 flow_state_t s; 520 flow_entry_t *flent; 521 flow_ops_t *ops = &ft->ft_ops; 522 boolean_t retried = B_FALSE; 523 int i, err; 524 525 s.fs_flags = flags; 526 retry: 527 s.fs_mp = mp; 528 529 /* 530 * Walk the list of predeclared accept functions. 531 * Each of these would accumulate enough state to allow the next 532 * accept routine to make progress. 533 */ 534 for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) { 535 if ((err = (ops->fo_accept[i])(ft, &s)) != 0) { 536 mblk_t *last; 537 538 /* 539 * ENOBUFS indicates that the mp could be too short 540 * and may need a pullup. 541 */ 542 if (err != ENOBUFS || retried) 543 return (err); 544 545 /* 546 * The pullup is done on the last processed mblk, not 547 * the starting one. pullup is not done if the mblk 548 * has references or if b_cont is NULL. 549 */ 550 last = s.fs_mp; 551 if (DB_REF(last) > 1 || last->b_cont == NULL || 552 pullupmsg(last, -1) == 0) 553 return (EINVAL); 554 555 retried = B_TRUE; 556 DTRACE_PROBE2(need_pullup, flow_tab_t *, ft, 557 flow_state_t *, &s); 558 goto retry; 559 } 560 } 561 562 /* 563 * The packet is considered sane. We may now attempt to 564 * find the corresponding flent. 565 */ 566 rw_enter(&ft->ft_lock, RW_READER); 567 flent = ft->ft_table[ops->fo_hash(ft, &s)]; 568 for (; flent != NULL; flent = flent->fe_next) { 569 if (flent->fe_match(ft, flent, &s)) { 570 FLOW_TRY_REFHOLD(flent, err); 571 if (err != 0) 572 continue; 573 *flentp = flent; 574 rw_exit(&ft->ft_lock); 575 return (0); 576 } 577 } 578 rw_exit(&ft->ft_lock); 579 return (ENOENT); 580 } 581 582 /* 583 * Walk flow table. 584 * The caller is assumed to have proper perimeter protection. 585 */ 586 int 587 mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *), 588 void *arg) 589 { 590 int err, i, cnt = 0; 591 flow_entry_t *flent; 592 593 if (ft == NULL) 594 return (0); 595 596 for (i = 0; i < ft->ft_size; i++) { 597 for (flent = ft->ft_table[i]; flent != NULL; 598 flent = flent->fe_next) { 599 cnt++; 600 err = (*fn)(flent, arg); 601 if (err != 0) 602 return (err); 603 } 604 } 605 VERIFY(cnt == ft->ft_flow_count); 606 return (0); 607 } 608 609 /* 610 * Same as the above except a mutex is used for protection here. 611 */ 612 int 613 mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *), 614 void *arg) 615 { 616 int err; 617 618 if (ft == NULL) 619 return (0); 620 621 rw_enter(&ft->ft_lock, RW_WRITER); 622 err = mac_flow_walk_nolock(ft, fn, arg); 623 rw_exit(&ft->ft_lock); 624 return (err); 625 } 626 627 static boolean_t mac_flow_clean(flow_entry_t *); 628 629 /* 630 * Destroy a flow entry. Called when the last reference on a flow is released. 631 */ 632 void 633 mac_flow_destroy(flow_entry_t *flent) 634 { 635 ASSERT(flent->fe_refcnt == 0); 636 637 if ((flent->fe_type & FLOW_USER) != 0) { 638 ASSERT(mac_flow_clean(flent)); 639 } else { 640 mac_flow_cleanup(flent); 641 } 642 mac_misc_stat_delete(flent); 643 mutex_destroy(&flent->fe_lock); 644 cv_destroy(&flent->fe_cv); 645 flow_stat_destroy(flent); 646 kmem_cache_free(flow_cache, flent); 647 } 648 649 /* 650 * XXX eric 651 * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and 652 * mac_link_flow_modify() should really be moved/reworked into the 653 * two functions below. This would consolidate all the mac property 654 * checking in one place. I'm leaving this alone for now since it's 655 * out of scope of the new flows work. 656 */ 657 /* ARGSUSED */ 658 uint32_t 659 mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp) 660 { 661 uint32_t changed_mask = 0; 662 mac_resource_props_t *fmrp = &flent->fe_effective_props; 663 int i; 664 665 if ((mrp->mrp_mask & MRP_MAXBW) != 0 && 666 (!(fmrp->mrp_mask & MRP_MAXBW) || 667 (fmrp->mrp_maxbw != mrp->mrp_maxbw))) { 668 changed_mask |= MRP_MAXBW; 669 if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) { 670 fmrp->mrp_mask &= ~MRP_MAXBW; 671 fmrp->mrp_maxbw = 0; 672 } else { 673 fmrp->mrp_mask |= MRP_MAXBW; 674 fmrp->mrp_maxbw = mrp->mrp_maxbw; 675 } 676 } 677 678 if ((mrp->mrp_mask & MRP_PRIORITY) != 0) { 679 if (fmrp->mrp_priority != mrp->mrp_priority) 680 changed_mask |= MRP_PRIORITY; 681 if (mrp->mrp_priority == MPL_RESET) { 682 fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT; 683 fmrp->mrp_mask &= ~MRP_PRIORITY; 684 } else { 685 fmrp->mrp_priority = mrp->mrp_priority; 686 fmrp->mrp_mask |= MRP_PRIORITY; 687 } 688 } 689 690 /* modify fanout */ 691 if ((mrp->mrp_mask & MRP_CPUS) != 0) { 692 if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) && 693 (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) { 694 for (i = 0; i < mrp->mrp_ncpus; i++) { 695 if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i]) 696 break; 697 } 698 if (i == mrp->mrp_ncpus) { 699 /* 700 * The new set of cpus passed is exactly 701 * the same as the existing set. 702 */ 703 return (changed_mask); 704 } 705 } 706 changed_mask |= MRP_CPUS; 707 MAC_COPY_CPUS(mrp, fmrp); 708 } 709 710 /* 711 * Modify the rings property. 712 */ 713 if (mrp->mrp_mask & MRP_RX_RINGS || mrp->mrp_mask & MRP_TX_RINGS) 714 mac_set_rings_effective(flent->fe_mcip); 715 716 if ((mrp->mrp_mask & MRP_POOL) != 0) { 717 if (strcmp(fmrp->mrp_pool, mrp->mrp_pool) != 0) 718 changed_mask |= MRP_POOL; 719 if (strlen(mrp->mrp_pool) == 0) 720 fmrp->mrp_mask &= ~MRP_POOL; 721 else 722 fmrp->mrp_mask |= MRP_POOL; 723 (void) strncpy(fmrp->mrp_pool, mrp->mrp_pool, MAXPATHLEN); 724 } 725 return (changed_mask); 726 } 727 728 void 729 mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp) 730 { 731 uint32_t changed_mask; 732 mac_client_impl_t *mcip = flent->fe_mcip; 733 mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip); 734 mac_resource_props_t *emrp = MCIP_EFFECTIVE_PROPS(mcip); 735 cpupart_t *cpupart = NULL; 736 boolean_t use_default = B_FALSE; 737 738 ASSERT(flent != NULL); 739 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 740 741 rw_enter(&ft->ft_lock, RW_WRITER); 742 743 /* Update the cached values inside the subflow entry */ 744 changed_mask = mac_flow_modify_props(flent, mrp); 745 rw_exit(&ft->ft_lock); 746 /* 747 * Push the changed parameters to the scheduling code in the 748 * SRS's, to take effect right away. 749 */ 750 if (changed_mask & MRP_MAXBW) { 751 mac_srs_update_bwlimit(flent, mrp); 752 /* 753 * If bandwidth is changed, we may have to change 754 * the number of soft ring to be used for fanout. 755 * Call mac_flow_update_fanout() if MAC_BIND_CPU 756 * is not set and there is no user supplied cpu 757 * info. This applies only to link at this time. 758 */ 759 if (!(flent->fe_type & FLOW_USER) && 760 !(changed_mask & MRP_CPUS) && 761 !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) { 762 mac_fanout_setup(mcip, flent, mcip_mrp, 763 mac_rx_deliver, mcip, NULL, NULL); 764 } 765 } 766 if (mrp->mrp_mask & MRP_PRIORITY) 767 mac_flow_update_priority(mcip, flent); 768 769 if (changed_mask & MRP_CPUS) 770 mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL, 771 NULL); 772 773 if (mrp->mrp_mask & MRP_POOL) { 774 pool_lock(); 775 cpupart = mac_pset_find(mrp, &use_default); 776 mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL, 777 cpupart); 778 mac_set_pool_effective(use_default, cpupart, mrp, emrp); 779 pool_unlock(); 780 } 781 } 782 783 /* 784 * This function waits for a certain condition to be met and is generally 785 * used before a destructive or quiescing operation. 786 */ 787 void 788 mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event) 789 { 790 mutex_enter(&flent->fe_lock); 791 flent->fe_flags |= FE_WAITER; 792 793 switch (event) { 794 case FLOW_DRIVER_UPCALL: 795 /* 796 * We want to make sure the driver upcalls have finished before 797 * we signal the Rx SRS worker to quit. 798 */ 799 while (flent->fe_refcnt != 1) 800 cv_wait(&flent->fe_cv, &flent->fe_lock); 801 break; 802 803 case FLOW_USER_REF: 804 /* 805 * Wait for the fe_user_refcnt to drop to 0. The flow has 806 * been removed from the global flow hash. 807 */ 808 ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH)); 809 while (flent->fe_user_refcnt != 0) 810 cv_wait(&flent->fe_cv, &flent->fe_lock); 811 break; 812 813 default: 814 ASSERT(0); 815 } 816 817 flent->fe_flags &= ~FE_WAITER; 818 mutex_exit(&flent->fe_lock); 819 } 820 821 static boolean_t 822 mac_flow_clean(flow_entry_t *flent) 823 { 824 ASSERT(flent->fe_next == NULL); 825 ASSERT(flent->fe_tx_srs == NULL); 826 ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL); 827 ASSERT(flent->fe_mbg == NULL); 828 829 return (B_TRUE); 830 } 831 832 void 833 mac_flow_cleanup(flow_entry_t *flent) 834 { 835 if ((flent->fe_type & FLOW_USER) == 0) { 836 ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) || 837 (flent->fe_mbg != NULL && flent->fe_mcip == NULL)); 838 ASSERT(flent->fe_refcnt == 0); 839 } else { 840 ASSERT(flent->fe_refcnt == 1); 841 } 842 843 if (flent->fe_mbg != NULL) { 844 ASSERT(flent->fe_tx_srs == NULL); 845 /* This is a multicast or broadcast flow entry */ 846 mac_bcast_grp_free(flent->fe_mbg); 847 flent->fe_mbg = NULL; 848 } 849 850 if (flent->fe_tx_srs != NULL) { 851 ASSERT(flent->fe_mbg == NULL); 852 mac_srs_free(flent->fe_tx_srs); 853 flent->fe_tx_srs = NULL; 854 } 855 856 /* 857 * In the normal case fe_rx_srs_cnt is 1. However in the error case 858 * when mac_unicast_add fails we may not have set up any SRS 859 * in which case fe_rx_srs_cnt will be zero. 860 */ 861 if (flent->fe_rx_srs_cnt != 0) { 862 ASSERT(flent->fe_rx_srs_cnt == 1); 863 mac_srs_free(flent->fe_rx_srs[0]); 864 flent->fe_rx_srs[0] = NULL; 865 flent->fe_rx_srs_cnt = 0; 866 } 867 ASSERT(flent->fe_rx_srs[0] == NULL); 868 } 869 870 void 871 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd) 872 { 873 /* 874 * Grab the fe_lock to see a self-consistent fe_flow_desc. 875 * Updates to the fe_flow_desc happen under the fe_lock 876 * after removing the flent from the flow table 877 */ 878 mutex_enter(&flent->fe_lock); 879 bcopy(&flent->fe_flow_desc, fd, sizeof (*fd)); 880 mutex_exit(&flent->fe_lock); 881 } 882 883 /* 884 * Update a field of a flow entry. The mac perimeter ensures that 885 * this is the only thread doing a modify operation on this mac end point. 886 * So the flow table can't change or disappear. The ft_lock protects access 887 * to the flow entry, and holding the lock ensures that there isn't any thread 888 * accessing the flow entry or attempting a flow table lookup. However 889 * data threads that are using the flow entry based on the old descriptor 890 * will continue to use the flow entry. If strong coherence is required 891 * then the flow will have to be quiesced before the descriptor can be 892 * changed. 893 */ 894 void 895 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd) 896 { 897 flow_tab_t *ft = flent->fe_flow_tab; 898 flow_desc_t old_desc; 899 int err; 900 901 if (ft == NULL) { 902 /* 903 * The flow hasn't yet been inserted into the table, 904 * so only the caller knows about this flow, however for 905 * uniformity we grab the fe_lock here. 906 */ 907 mutex_enter(&flent->fe_lock); 908 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); 909 mutex_exit(&flent->fe_lock); 910 } 911 912 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 913 914 /* 915 * Need to remove the flow entry from the table and reinsert it, 916 * into a potentially diference hash line. The hash depends on 917 * the new descriptor fields. However access to fe_desc itself 918 * is always under the fe_lock. This helps log and stat functions 919 * see a self-consistent fe_flow_desc. 920 */ 921 mac_flow_remove(ft, flent, B_TRUE); 922 old_desc = flent->fe_flow_desc; 923 924 mutex_enter(&flent->fe_lock); 925 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); 926 mutex_exit(&flent->fe_lock); 927 928 if (mac_flow_add(ft, flent) != 0) { 929 /* 930 * The add failed say due to an invalid flow descriptor. 931 * Undo the update 932 */ 933 flent->fe_flow_desc = old_desc; 934 err = mac_flow_add(ft, flent); 935 ASSERT(err == 0); 936 } 937 } 938 939 void 940 mac_flow_set_name(flow_entry_t *flent, const char *name) 941 { 942 flow_tab_t *ft = flent->fe_flow_tab; 943 944 if (ft == NULL) { 945 /* 946 * The flow hasn't yet been inserted into the table, 947 * so only the caller knows about this flow 948 */ 949 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); 950 } else { 951 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 952 } 953 954 mutex_enter(&flent->fe_lock); 955 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); 956 mutex_exit(&flent->fe_lock); 957 } 958 959 /* 960 * Return the client-private cookie that was associated with 961 * the flow when it was created. 962 */ 963 void * 964 mac_flow_get_client_cookie(flow_entry_t *flent) 965 { 966 return (flent->fe_client_cookie); 967 } 968 969 /* 970 * Forward declarations. 971 */ 972 static uint32_t flow_l2_hash(flow_tab_t *, flow_state_t *); 973 static uint32_t flow_l2_hash_fe(flow_tab_t *, flow_entry_t *); 974 static int flow_l2_accept(flow_tab_t *, flow_state_t *); 975 static uint32_t flow_ether_hash(flow_tab_t *, flow_state_t *); 976 static uint32_t flow_ether_hash_fe(flow_tab_t *, flow_entry_t *); 977 static int flow_ether_accept(flow_tab_t *, flow_state_t *); 978 979 /* 980 * Create flow table. 981 */ 982 void 983 mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size, 984 mac_impl_t *mip, flow_tab_t **ftp) 985 { 986 flow_tab_t *ft; 987 flow_ops_t *new_ops; 988 989 ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP); 990 bzero(ft, sizeof (*ft)); 991 992 ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP); 993 994 /* 995 * We make a copy of the ops vector instead of just pointing to it 996 * because we might want to customize the ops vector on a per table 997 * basis (e.g. for optimization). 998 */ 999 new_ops = &ft->ft_ops; 1000 bcopy(ops, new_ops, sizeof (*ops)); 1001 ft->ft_mask = mask; 1002 ft->ft_size = size; 1003 ft->ft_mip = mip; 1004 1005 /* 1006 * Optimizations for DL_ETHER media. 1007 */ 1008 if (mip->mi_info.mi_nativemedia == DL_ETHER) { 1009 if (new_ops->fo_hash == flow_l2_hash) 1010 new_ops->fo_hash = flow_ether_hash; 1011 if (new_ops->fo_hash_fe == flow_l2_hash_fe) 1012 new_ops->fo_hash_fe = flow_ether_hash_fe; 1013 if (new_ops->fo_accept[0] == flow_l2_accept) 1014 new_ops->fo_accept[0] = flow_ether_accept; 1015 } 1016 *ftp = ft; 1017 } 1018 1019 void 1020 mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp) 1021 { 1022 mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID, 1023 1024, mip, ftp); 1024 } 1025 1026 /* 1027 * Destroy flow table. 1028 */ 1029 void 1030 mac_flow_tab_destroy(flow_tab_t *ft) 1031 { 1032 if (ft == NULL) 1033 return; 1034 1035 ASSERT(ft->ft_flow_count == 0); 1036 kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *)); 1037 bzero(ft, sizeof (*ft)); 1038 kmem_cache_free(flow_tab_cache, ft); 1039 } 1040 1041 /* 1042 * Add a new flow entry to the global flow hash table 1043 */ 1044 int 1045 mac_flow_hash_add(flow_entry_t *flent) 1046 { 1047 int err; 1048 1049 rw_enter(&flow_tab_lock, RW_WRITER); 1050 err = mod_hash_insert(flow_hash, 1051 (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent); 1052 if (err != 0) { 1053 rw_exit(&flow_tab_lock); 1054 return (EEXIST); 1055 } 1056 /* Mark as inserted into the global flow hash table */ 1057 FLOW_MARK(flent, FE_G_FLOW_HASH); 1058 rw_exit(&flow_tab_lock); 1059 return (err); 1060 } 1061 1062 /* 1063 * Remove a flow entry from the global flow hash table 1064 */ 1065 void 1066 mac_flow_hash_remove(flow_entry_t *flent) 1067 { 1068 mod_hash_val_t val; 1069 1070 rw_enter(&flow_tab_lock, RW_WRITER); 1071 VERIFY(mod_hash_remove(flow_hash, 1072 (mod_hash_key_t)flent->fe_flow_name, &val) == 0); 1073 1074 /* Clear the mark that says inserted into the global flow hash table */ 1075 FLOW_UNMARK(flent, FE_G_FLOW_HASH); 1076 rw_exit(&flow_tab_lock); 1077 } 1078 1079 /* 1080 * Retrieve a flow entry from the global flow hash table. 1081 */ 1082 int 1083 mac_flow_lookup_byname(char *name, flow_entry_t **flentp) 1084 { 1085 int err; 1086 flow_entry_t *flent; 1087 1088 rw_enter(&flow_tab_lock, RW_READER); 1089 err = mod_hash_find(flow_hash, (mod_hash_key_t)name, 1090 (mod_hash_val_t *)&flent); 1091 if (err != 0) { 1092 rw_exit(&flow_tab_lock); 1093 return (ENOENT); 1094 } 1095 ASSERT(flent != NULL); 1096 FLOW_USER_REFHOLD(flent); 1097 rw_exit(&flow_tab_lock); 1098 1099 *flentp = flent; 1100 return (0); 1101 } 1102 1103 /* 1104 * Initialize or release mac client flows by walking the subflow table. 1105 * These are typically invoked during plumb/unplumb of links. 1106 */ 1107 1108 static int 1109 mac_link_init_flows_cb(flow_entry_t *flent, void *arg) 1110 { 1111 mac_client_impl_t *mcip = arg; 1112 1113 if (mac_link_flow_init(arg, flent) != 0) { 1114 cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'", 1115 flent->fe_flow_name, mcip->mci_name); 1116 } else { 1117 FLOW_UNMARK(flent, FE_UF_NO_DATAPATH); 1118 } 1119 return (0); 1120 } 1121 1122 void 1123 mac_link_init_flows(mac_client_handle_t mch) 1124 { 1125 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1126 1127 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab, 1128 mac_link_init_flows_cb, mcip); 1129 /* 1130 * If mac client had subflow(s) configured before plumb, change 1131 * function to mac_rx_srs_subflow_process and in case of hardware 1132 * classification, disable polling. 1133 */ 1134 mac_client_update_classifier(mcip, B_TRUE); 1135 1136 } 1137 1138 boolean_t 1139 mac_link_has_flows(mac_client_handle_t mch) 1140 { 1141 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1142 1143 if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab)) 1144 return (B_TRUE); 1145 1146 return (B_FALSE); 1147 } 1148 1149 static int 1150 mac_link_release_flows_cb(flow_entry_t *flent, void *arg) 1151 { 1152 FLOW_MARK(flent, FE_UF_NO_DATAPATH); 1153 mac_flow_wait(flent, FLOW_DRIVER_UPCALL); 1154 mac_link_flow_clean(arg, flent); 1155 return (0); 1156 } 1157 1158 void 1159 mac_link_release_flows(mac_client_handle_t mch) 1160 { 1161 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1162 1163 /* 1164 * Change the mci_flent callback back to mac_rx_srs_process() 1165 * because flows are about to be deactivated. 1166 */ 1167 mac_client_update_classifier(mcip, B_FALSE); 1168 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab, 1169 mac_link_release_flows_cb, mcip); 1170 } 1171 1172 void 1173 mac_rename_flow(flow_entry_t *fep, const char *new_name) 1174 { 1175 mac_flow_set_name(fep, new_name); 1176 if (fep->fe_ksp != NULL) { 1177 flow_stat_destroy(fep); 1178 flow_stat_create(fep); 1179 } 1180 } 1181 1182 /* 1183 * mac_link_flow_init() 1184 * Internal flow interface used for allocating SRSs and related 1185 * data structures. Not meant to be used by mac clients. 1186 */ 1187 int 1188 mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow) 1189 { 1190 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1191 mac_impl_t *mip = mcip->mci_mip; 1192 int err; 1193 1194 ASSERT(mch != NULL); 1195 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); 1196 1197 if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0) 1198 return (err); 1199 1200 sub_flow->fe_mcip = mcip; 1201 1202 return (0); 1203 } 1204 1205 /* 1206 * mac_link_flow_add() 1207 * Used by flowadm(1m) or kernel mac clients for creating flows. 1208 */ 1209 int 1210 mac_link_flow_add(datalink_id_t linkid, char *flow_name, 1211 flow_desc_t *flow_desc, mac_resource_props_t *mrp) 1212 { 1213 flow_entry_t *flent = NULL; 1214 int err; 1215 dls_dl_handle_t dlh; 1216 dls_link_t *dlp; 1217 boolean_t link_held = B_FALSE; 1218 boolean_t hash_added = B_FALSE; 1219 mac_perim_handle_t mph; 1220 1221 err = mac_flow_lookup_byname(flow_name, &flent); 1222 if (err == 0) { 1223 FLOW_USER_REFRELE(flent); 1224 return (EEXIST); 1225 } 1226 1227 /* 1228 * First create a flow entry given the description provided 1229 * by the caller. 1230 */ 1231 err = mac_flow_create(flow_desc, mrp, flow_name, NULL, 1232 FLOW_USER | FLOW_OTHER, &flent); 1233 1234 if (err != 0) 1235 return (err); 1236 1237 /* 1238 * We've got a local variable referencing this flow now, so we need 1239 * to hold it. We'll release this flow before returning. 1240 * All failures until we return will undo any action that may internally 1241 * held the flow, so the last REFRELE will assure a clean freeing 1242 * of resources. 1243 */ 1244 FLOW_REFHOLD(flent); 1245 1246 flent->fe_link_id = linkid; 1247 FLOW_MARK(flent, FE_INCIPIENT); 1248 1249 err = mac_perim_enter_by_linkid(linkid, &mph); 1250 if (err != 0) { 1251 FLOW_FINAL_REFRELE(flent); 1252 return (err); 1253 } 1254 1255 /* 1256 * dls will eventually be merged with mac so it's ok 1257 * to call dls' internal functions. 1258 */ 1259 err = dls_devnet_hold_link(linkid, &dlh, &dlp); 1260 if (err != 0) 1261 goto bail; 1262 1263 link_held = B_TRUE; 1264 1265 /* 1266 * Add the flow to the global flow table, this table will be per 1267 * exclusive zone so each zone can have its own flow namespace. 1268 * RFE 6625651 will fix this. 1269 * 1270 */ 1271 if ((err = mac_flow_hash_add(flent)) != 0) 1272 goto bail; 1273 1274 hash_added = B_TRUE; 1275 1276 /* 1277 * do not allow flows to be configured on an anchor VNIC 1278 */ 1279 if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) { 1280 err = ENOTSUP; 1281 goto bail; 1282 } 1283 1284 /* 1285 * Add the subflow to the subflow table. Also instantiate the flow 1286 * in the mac if there is an active user (we check if the MAC client's 1287 * datapath has been setup). 1288 */ 1289 err = mac_flow_add_subflow(dlp->dl_mch, flent, 1290 MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch)); 1291 if (err != 0) 1292 goto bail; 1293 1294 FLOW_UNMARK(flent, FE_INCIPIENT); 1295 dls_devnet_rele_link(dlh, dlp); 1296 mac_perim_exit(mph); 1297 return (0); 1298 1299 bail: 1300 if (hash_added) 1301 mac_flow_hash_remove(flent); 1302 1303 if (link_held) 1304 dls_devnet_rele_link(dlh, dlp); 1305 1306 /* 1307 * Wait for any transient global flow hash refs to clear 1308 * and then release the creation reference on the flow 1309 */ 1310 mac_flow_wait(flent, FLOW_USER_REF); 1311 FLOW_FINAL_REFRELE(flent); 1312 mac_perim_exit(mph); 1313 return (err); 1314 } 1315 1316 /* 1317 * mac_link_flow_clean() 1318 * Internal flow interface used for freeing SRSs and related 1319 * data structures. Not meant to be used by mac clients. 1320 */ 1321 void 1322 mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow) 1323 { 1324 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1325 mac_impl_t *mip = mcip->mci_mip; 1326 boolean_t last_subflow; 1327 1328 ASSERT(mch != NULL); 1329 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); 1330 1331 /* 1332 * This sub flow entry may fail to be fully initialized by 1333 * mac_link_flow_init(). If so, simply return. 1334 */ 1335 if (sub_flow->fe_mcip == NULL) 1336 return; 1337 1338 last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab); 1339 /* 1340 * Tear down the data path 1341 */ 1342 mac_datapath_teardown(mcip, sub_flow, SRST_FLOW); 1343 sub_flow->fe_mcip = NULL; 1344 1345 /* 1346 * Delete the SRSs associated with this subflow. If this is being 1347 * driven by flowadm(1M) then the subflow will be deleted by 1348 * dls_rem_flow. However if this is a result of the interface being 1349 * unplumbed then the subflow itself won't be deleted. 1350 */ 1351 mac_flow_cleanup(sub_flow); 1352 1353 /* 1354 * If all the subflows are gone, renable some of the stuff 1355 * we disabled when adding a subflow, polling etc. 1356 */ 1357 if (last_subflow) { 1358 /* 1359 * The subflow table itself is not protected by any locks or 1360 * refcnts. Hence quiesce the client upfront before clearing 1361 * mci_subflow_tab. 1362 */ 1363 mac_client_quiesce(mcip); 1364 mac_client_update_classifier(mcip, B_FALSE); 1365 mac_flow_tab_destroy(mcip->mci_subflow_tab); 1366 mcip->mci_subflow_tab = NULL; 1367 mac_client_restart(mcip); 1368 } 1369 } 1370 1371 /* 1372 * mac_link_flow_remove() 1373 * Used by flowadm(1m) or kernel mac clients for removing flows. 1374 */ 1375 int 1376 mac_link_flow_remove(char *flow_name) 1377 { 1378 flow_entry_t *flent; 1379 mac_perim_handle_t mph; 1380 int err; 1381 datalink_id_t linkid; 1382 1383 err = mac_flow_lookup_byname(flow_name, &flent); 1384 if (err != 0) 1385 return (err); 1386 1387 linkid = flent->fe_link_id; 1388 FLOW_USER_REFRELE(flent); 1389 1390 /* 1391 * The perim must be acquired before acquiring any other references 1392 * to maintain the lock and perimeter hierarchy. Please note the 1393 * FLOW_REFRELE above. 1394 */ 1395 err = mac_perim_enter_by_linkid(linkid, &mph); 1396 if (err != 0) 1397 return (err); 1398 1399 /* 1400 * Note the second lookup of the flow, because a concurrent thread 1401 * may have removed it already while we were waiting to enter the 1402 * link's perimeter. 1403 */ 1404 err = mac_flow_lookup_byname(flow_name, &flent); 1405 if (err != 0) { 1406 mac_perim_exit(mph); 1407 return (err); 1408 } 1409 FLOW_USER_REFRELE(flent); 1410 1411 /* 1412 * Remove the flow from the subflow table and deactivate the flow 1413 * by quiescing and removings its SRSs 1414 */ 1415 mac_flow_rem_subflow(flent); 1416 1417 /* 1418 * Finally, remove the flow from the global table. 1419 */ 1420 mac_flow_hash_remove(flent); 1421 1422 /* 1423 * Wait for any transient global flow hash refs to clear 1424 * and then release the creation reference on the flow 1425 */ 1426 mac_flow_wait(flent, FLOW_USER_REF); 1427 FLOW_FINAL_REFRELE(flent); 1428 1429 mac_perim_exit(mph); 1430 1431 return (0); 1432 } 1433 1434 /* 1435 * mac_link_flow_modify() 1436 * Modifies the properties of a flow identified by its name. 1437 */ 1438 int 1439 mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp) 1440 { 1441 flow_entry_t *flent; 1442 mac_client_impl_t *mcip; 1443 int err = 0; 1444 mac_perim_handle_t mph; 1445 datalink_id_t linkid; 1446 flow_tab_t *flow_tab; 1447 1448 err = mac_validate_props(NULL, mrp); 1449 if (err != 0) 1450 return (err); 1451 1452 err = mac_flow_lookup_byname(flow_name, &flent); 1453 if (err != 0) 1454 return (err); 1455 1456 linkid = flent->fe_link_id; 1457 FLOW_USER_REFRELE(flent); 1458 1459 /* 1460 * The perim must be acquired before acquiring any other references 1461 * to maintain the lock and perimeter hierarchy. Please note the 1462 * FLOW_REFRELE above. 1463 */ 1464 err = mac_perim_enter_by_linkid(linkid, &mph); 1465 if (err != 0) 1466 return (err); 1467 1468 /* 1469 * Note the second lookup of the flow, because a concurrent thread 1470 * may have removed it already while we were waiting to enter the 1471 * link's perimeter. 1472 */ 1473 err = mac_flow_lookup_byname(flow_name, &flent); 1474 if (err != 0) { 1475 mac_perim_exit(mph); 1476 return (err); 1477 } 1478 FLOW_USER_REFRELE(flent); 1479 1480 /* 1481 * If this flow is attached to a MAC client, then pass the request 1482 * along to the client. 1483 * Otherwise, just update the cached values. 1484 */ 1485 mcip = flent->fe_mcip; 1486 mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE); 1487 if (mcip != NULL) { 1488 if ((flow_tab = mcip->mci_subflow_tab) == NULL) { 1489 err = ENOENT; 1490 } else { 1491 mac_flow_modify(flow_tab, flent, mrp); 1492 } 1493 } else { 1494 (void) mac_flow_modify_props(flent, mrp); 1495 } 1496 1497 done: 1498 mac_perim_exit(mph); 1499 return (err); 1500 } 1501 1502 1503 /* 1504 * State structure and misc functions used by mac_link_flow_walk(). 1505 */ 1506 typedef struct { 1507 int (*ws_func)(mac_flowinfo_t *, void *); 1508 void *ws_arg; 1509 } flow_walk_state_t; 1510 1511 static void 1512 mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent) 1513 { 1514 (void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name, 1515 MAXFLOWNAMELEN); 1516 finfop->fi_link_id = flent->fe_link_id; 1517 finfop->fi_flow_desc = flent->fe_flow_desc; 1518 finfop->fi_resource_props = flent->fe_resource_props; 1519 } 1520 1521 static int 1522 mac_link_flow_walk_cb(flow_entry_t *flent, void *arg) 1523 { 1524 flow_walk_state_t *statep = arg; 1525 mac_flowinfo_t *finfo; 1526 int err; 1527 1528 finfo = kmem_zalloc(sizeof (*finfo), KM_SLEEP); 1529 mac_link_flowinfo_copy(finfo, flent); 1530 err = statep->ws_func(finfo, statep->ws_arg); 1531 kmem_free(finfo, sizeof (*finfo)); 1532 return (err); 1533 } 1534 1535 /* 1536 * mac_link_flow_walk() 1537 * Invokes callback 'func' for all flows belonging to the specified link. 1538 */ 1539 int 1540 mac_link_flow_walk(datalink_id_t linkid, 1541 int (*func)(mac_flowinfo_t *, void *), void *arg) 1542 { 1543 mac_client_impl_t *mcip; 1544 mac_perim_handle_t mph; 1545 flow_walk_state_t state; 1546 dls_dl_handle_t dlh; 1547 dls_link_t *dlp; 1548 int err; 1549 1550 err = mac_perim_enter_by_linkid(linkid, &mph); 1551 if (err != 0) 1552 return (err); 1553 1554 err = dls_devnet_hold_link(linkid, &dlh, &dlp); 1555 if (err != 0) { 1556 mac_perim_exit(mph); 1557 return (err); 1558 } 1559 1560 mcip = (mac_client_impl_t *)dlp->dl_mch; 1561 state.ws_func = func; 1562 state.ws_arg = arg; 1563 1564 err = mac_flow_walk_nolock(mcip->mci_subflow_tab, 1565 mac_link_flow_walk_cb, &state); 1566 1567 dls_devnet_rele_link(dlh, dlp); 1568 mac_perim_exit(mph); 1569 return (err); 1570 } 1571 1572 /* 1573 * mac_link_flow_info() 1574 * Retrieves information about a specific flow. 1575 */ 1576 int 1577 mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo) 1578 { 1579 flow_entry_t *flent; 1580 int err; 1581 1582 err = mac_flow_lookup_byname(flow_name, &flent); 1583 if (err != 0) 1584 return (err); 1585 1586 mac_link_flowinfo_copy(finfo, flent); 1587 FLOW_USER_REFRELE(flent); 1588 return (0); 1589 } 1590 1591 /* 1592 * Hash function macro that takes an Ethernet address and VLAN id as input. 1593 */ 1594 #define HASH_ETHER_VID(a, v, s) \ 1595 ((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s)) 1596 1597 /* 1598 * Generic layer-2 address hashing function that takes an address and address 1599 * length as input. This is the DJB hash function. 1600 */ 1601 static uint32_t 1602 flow_l2_addrhash(uint8_t *addr, size_t addrlen, size_t htsize) 1603 { 1604 uint32_t hash = 5381; 1605 size_t i; 1606 1607 for (i = 0; i < addrlen; i++) 1608 hash = ((hash << 5) + hash) + addr[i]; 1609 return (hash % htsize); 1610 } 1611 1612 #define PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end)) 1613 1614 #define CHECK_AND_ADJUST_START_PTR(s, start) { \ 1615 if ((s)->fs_mp->b_wptr == (start)) { \ 1616 mblk_t *next = (s)->fs_mp->b_cont; \ 1617 if (next == NULL) \ 1618 return (EINVAL); \ 1619 \ 1620 (s)->fs_mp = next; \ 1621 (start) = next->b_rptr; \ 1622 } \ 1623 } 1624 1625 /* ARGSUSED */ 1626 static boolean_t 1627 flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1628 { 1629 flow_l2info_t *l2 = &s->fs_l2info; 1630 flow_desc_t *fd = &flent->fe_flow_desc; 1631 1632 return (l2->l2_vid == fd->fd_vid && 1633 bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0); 1634 } 1635 1636 /* 1637 * Layer 2 hash function. 1638 * Must be paired with flow_l2_accept() within a set of flow_ops 1639 * because it assumes the dest address is already extracted. 1640 */ 1641 static uint32_t 1642 flow_l2_hash(flow_tab_t *ft, flow_state_t *s) 1643 { 1644 return (flow_l2_addrhash(s->fs_l2info.l2_daddr, 1645 ft->ft_mip->mi_type->mt_addr_length, ft->ft_size)); 1646 } 1647 1648 /* 1649 * This is the generic layer 2 accept function. 1650 * It makes use of mac_header_info() to extract the header length, 1651 * sap, vlan ID and destination address. 1652 */ 1653 static int 1654 flow_l2_accept(flow_tab_t *ft, flow_state_t *s) 1655 { 1656 boolean_t is_ether; 1657 flow_l2info_t *l2 = &s->fs_l2info; 1658 mac_header_info_t mhi; 1659 int err; 1660 1661 is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER); 1662 if ((err = mac_header_info((mac_handle_t)ft->ft_mip, 1663 s->fs_mp, &mhi)) != 0) { 1664 if (err == EINVAL) 1665 err = ENOBUFS; 1666 1667 return (err); 1668 } 1669 1670 l2->l2_start = s->fs_mp->b_rptr; 1671 l2->l2_daddr = (uint8_t *)mhi.mhi_daddr; 1672 1673 if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN && 1674 ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) { 1675 struct ether_vlan_header *evhp = 1676 (struct ether_vlan_header *)l2->l2_start; 1677 1678 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp))) 1679 return (ENOBUFS); 1680 1681 l2->l2_sap = ntohs(evhp->ether_type); 1682 l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci)); 1683 l2->l2_hdrsize = sizeof (*evhp); 1684 } else { 1685 l2->l2_sap = mhi.mhi_bindsap; 1686 l2->l2_vid = 0; 1687 l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize; 1688 } 1689 return (0); 1690 } 1691 1692 /* 1693 * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/ 1694 * accept(). The notable difference is that dest address is now extracted 1695 * by hash() rather than by accept(). This saves a few memory references 1696 * for flow tables that do not care about mac addresses. 1697 */ 1698 static uint32_t 1699 flow_ether_hash(flow_tab_t *ft, flow_state_t *s) 1700 { 1701 flow_l2info_t *l2 = &s->fs_l2info; 1702 struct ether_vlan_header *evhp; 1703 1704 evhp = (struct ether_vlan_header *)l2->l2_start; 1705 l2->l2_daddr = evhp->ether_dhost.ether_addr_octet; 1706 return (HASH_ETHER_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size)); 1707 } 1708 1709 static uint32_t 1710 flow_ether_hash_fe(flow_tab_t *ft, flow_entry_t *flent) 1711 { 1712 flow_desc_t *fd = &flent->fe_flow_desc; 1713 1714 ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0); 1715 return (HASH_ETHER_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size)); 1716 } 1717 1718 /* ARGSUSED */ 1719 static int 1720 flow_ether_accept(flow_tab_t *ft, flow_state_t *s) 1721 { 1722 flow_l2info_t *l2 = &s->fs_l2info; 1723 struct ether_vlan_header *evhp; 1724 uint16_t sap; 1725 1726 evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr; 1727 l2->l2_start = (uchar_t *)evhp; 1728 1729 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header))) 1730 return (ENOBUFS); 1731 1732 if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN && 1733 ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) { 1734 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp))) 1735 return (ENOBUFS); 1736 1737 l2->l2_sap = ntohs(evhp->ether_type); 1738 l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci)); 1739 l2->l2_hdrsize = sizeof (struct ether_vlan_header); 1740 } else { 1741 l2->l2_sap = sap; 1742 l2->l2_vid = 0; 1743 l2->l2_hdrsize = sizeof (struct ether_header); 1744 } 1745 return (0); 1746 } 1747 1748 /* 1749 * Validates a layer 2 flow entry. 1750 */ 1751 static int 1752 flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent) 1753 { 1754 flow_desc_t *fd = &flent->fe_flow_desc; 1755 1756 /* 1757 * Dest address is mandatory, and 0 length addresses are not yet 1758 * supported. 1759 */ 1760 if ((fd->fd_mask & FLOW_LINK_DST) == 0 || fd->fd_mac_len == 0) 1761 return (EINVAL); 1762 1763 if ((fd->fd_mask & FLOW_LINK_VID) != 0) { 1764 /* 1765 * VLAN flows are only supported over ethernet macs. 1766 */ 1767 if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER) 1768 return (EINVAL); 1769 1770 if (fd->fd_vid == 0) 1771 return (EINVAL); 1772 1773 } 1774 flent->fe_match = flow_l2_match; 1775 return (0); 1776 } 1777 1778 /* 1779 * Calculates hash index of flow entry. 1780 */ 1781 static uint32_t 1782 flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent) 1783 { 1784 flow_desc_t *fd = &flent->fe_flow_desc; 1785 1786 ASSERT((fd->fd_mask & FLOW_LINK_VID) == 0 && fd->fd_vid == 0); 1787 return (flow_l2_addrhash(fd->fd_dst_mac, 1788 ft->ft_mip->mi_type->mt_addr_length, ft->ft_size)); 1789 } 1790 1791 /* 1792 * This is used for duplicate flow checking. 1793 */ 1794 /* ARGSUSED */ 1795 static boolean_t 1796 flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) 1797 { 1798 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; 1799 1800 ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0); 1801 return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac, 1802 fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid); 1803 } 1804 1805 /* 1806 * Generic flow entry insertion function. 1807 * Used by flow tables that do not have ordering requirements. 1808 */ 1809 /* ARGSUSED */ 1810 static int 1811 flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp, 1812 flow_entry_t *flent) 1813 { 1814 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 1815 1816 if (*headp != NULL) { 1817 ASSERT(flent->fe_next == NULL); 1818 flent->fe_next = *headp; 1819 } 1820 *headp = flent; 1821 return (0); 1822 } 1823 1824 /* 1825 * IP version independent DSField matching function. 1826 */ 1827 /* ARGSUSED */ 1828 static boolean_t 1829 flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1830 { 1831 flow_l3info_t *l3info = &s->fs_l3info; 1832 flow_desc_t *fd = &flent->fe_flow_desc; 1833 1834 switch (l3info->l3_version) { 1835 case IPV4_VERSION: { 1836 ipha_t *ipha = (ipha_t *)l3info->l3_start; 1837 1838 return ((ipha->ipha_type_of_service & 1839 fd->fd_dsfield_mask) == fd->fd_dsfield); 1840 } 1841 case IPV6_VERSION: { 1842 ip6_t *ip6h = (ip6_t *)l3info->l3_start; 1843 1844 return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) & 1845 fd->fd_dsfield_mask) == fd->fd_dsfield); 1846 } 1847 default: 1848 return (B_FALSE); 1849 } 1850 } 1851 1852 /* 1853 * IP v4 and v6 address matching. 1854 * The netmask only needs to be applied on the packet but not on the 1855 * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets. 1856 */ 1857 1858 /* ARGSUSED */ 1859 static boolean_t 1860 flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1861 { 1862 flow_l3info_t *l3info = &s->fs_l3info; 1863 flow_desc_t *fd = &flent->fe_flow_desc; 1864 ipha_t *ipha = (ipha_t *)l3info->l3_start; 1865 in_addr_t addr; 1866 1867 addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src); 1868 if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) { 1869 return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) == 1870 V4_PART_OF_V6(fd->fd_local_addr)); 1871 } 1872 return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) == 1873 V4_PART_OF_V6(fd->fd_remote_addr)); 1874 } 1875 1876 /* ARGSUSED */ 1877 static boolean_t 1878 flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1879 { 1880 flow_l3info_t *l3info = &s->fs_l3info; 1881 flow_desc_t *fd = &flent->fe_flow_desc; 1882 ip6_t *ip6h = (ip6_t *)l3info->l3_start; 1883 in6_addr_t *addrp; 1884 1885 addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src); 1886 if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) { 1887 return (V6_MASK_EQ(*addrp, fd->fd_local_netmask, 1888 fd->fd_local_addr)); 1889 } 1890 return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr)); 1891 } 1892 1893 /* ARGSUSED */ 1894 static boolean_t 1895 flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1896 { 1897 flow_l3info_t *l3info = &s->fs_l3info; 1898 flow_desc_t *fd = &flent->fe_flow_desc; 1899 1900 return (l3info->l3_protocol == fd->fd_protocol); 1901 } 1902 1903 static uint32_t 1904 flow_ip_hash(flow_tab_t *ft, flow_state_t *s) 1905 { 1906 flow_l3info_t *l3info = &s->fs_l3info; 1907 flow_mask_t mask = ft->ft_mask; 1908 1909 if ((mask & FLOW_IP_LOCAL) != 0) { 1910 l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0); 1911 } else if ((mask & FLOW_IP_REMOTE) != 0) { 1912 l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0); 1913 } else if ((mask & FLOW_IP_DSFIELD) != 0) { 1914 /* 1915 * DSField flents are arranged as a single list. 1916 */ 1917 return (0); 1918 } 1919 /* 1920 * IP addr flents are hashed into two lists, v4 or v6. 1921 */ 1922 ASSERT(ft->ft_size >= 2); 1923 return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1); 1924 } 1925 1926 static uint32_t 1927 flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s) 1928 { 1929 flow_l3info_t *l3info = &s->fs_l3info; 1930 1931 return (l3info->l3_protocol % ft->ft_size); 1932 } 1933 1934 /* ARGSUSED */ 1935 static int 1936 flow_ip_accept(flow_tab_t *ft, flow_state_t *s) 1937 { 1938 flow_l2info_t *l2info = &s->fs_l2info; 1939 flow_l3info_t *l3info = &s->fs_l3info; 1940 uint16_t sap = l2info->l2_sap; 1941 uchar_t *l3_start; 1942 1943 l3_start = l2info->l2_start + l2info->l2_hdrsize; 1944 1945 /* 1946 * Adjust start pointer if we're at the end of an mblk. 1947 */ 1948 CHECK_AND_ADJUST_START_PTR(s, l3_start); 1949 1950 l3info->l3_start = l3_start; 1951 if (!OK_32PTR(l3_start)) 1952 return (EINVAL); 1953 1954 switch (sap) { 1955 case ETHERTYPE_IP: { 1956 ipha_t *ipha = (ipha_t *)l3_start; 1957 1958 if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH)) 1959 return (ENOBUFS); 1960 1961 l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha); 1962 l3info->l3_protocol = ipha->ipha_protocol; 1963 l3info->l3_version = IPV4_VERSION; 1964 l3info->l3_fragmented = 1965 IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags); 1966 break; 1967 } 1968 case ETHERTYPE_IPV6: { 1969 ip6_t *ip6h = (ip6_t *)l3_start; 1970 ip6_frag_t *frag = NULL; 1971 uint16_t ip6_hdrlen; 1972 uint8_t nexthdr; 1973 1974 if (!mac_ip_hdr_length_v6(ip6h, s->fs_mp->b_wptr, &ip6_hdrlen, 1975 &nexthdr, &frag)) { 1976 return (ENOBUFS); 1977 } 1978 l3info->l3_hdrsize = ip6_hdrlen; 1979 l3info->l3_protocol = nexthdr; 1980 l3info->l3_version = IPV6_VERSION; 1981 l3info->l3_fragmented = (frag != NULL); 1982 break; 1983 } 1984 default: 1985 return (EINVAL); 1986 } 1987 return (0); 1988 } 1989 1990 /* ARGSUSED */ 1991 static int 1992 flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent) 1993 { 1994 flow_desc_t *fd = &flent->fe_flow_desc; 1995 1996 switch (fd->fd_protocol) { 1997 case IPPROTO_TCP: 1998 case IPPROTO_UDP: 1999 case IPPROTO_SCTP: 2000 case IPPROTO_ICMP: 2001 case IPPROTO_ICMPV6: 2002 flent->fe_match = flow_ip_proto_match; 2003 return (0); 2004 default: 2005 return (EINVAL); 2006 } 2007 } 2008 2009 /* ARGSUSED */ 2010 static int 2011 flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent) 2012 { 2013 flow_desc_t *fd = &flent->fe_flow_desc; 2014 flow_mask_t mask; 2015 uint8_t version; 2016 in6_addr_t *addr, *netmask; 2017 2018 /* 2019 * DSField does not require a IP version. 2020 */ 2021 if (fd->fd_mask == FLOW_IP_DSFIELD) { 2022 if (fd->fd_dsfield_mask == 0) 2023 return (EINVAL); 2024 2025 flent->fe_match = flow_ip_dsfield_match; 2026 return (0); 2027 } 2028 2029 /* 2030 * IP addresses must come with a version to avoid ambiguity. 2031 */ 2032 if ((fd->fd_mask & FLOW_IP_VERSION) == 0) 2033 return (EINVAL); 2034 2035 version = fd->fd_ipversion; 2036 if (version != IPV4_VERSION && version != IPV6_VERSION) 2037 return (EINVAL); 2038 2039 mask = fd->fd_mask & ~FLOW_IP_VERSION; 2040 switch (mask) { 2041 case FLOW_IP_LOCAL: 2042 addr = &fd->fd_local_addr; 2043 netmask = &fd->fd_local_netmask; 2044 break; 2045 case FLOW_IP_REMOTE: 2046 addr = &fd->fd_remote_addr; 2047 netmask = &fd->fd_remote_netmask; 2048 break; 2049 default: 2050 return (EINVAL); 2051 } 2052 2053 /* 2054 * Apply netmask onto specified address. 2055 */ 2056 V6_MASK_COPY(*addr, *netmask, *addr); 2057 if (version == IPV4_VERSION) { 2058 ipaddr_t v4addr = V4_PART_OF_V6((*addr)); 2059 ipaddr_t v4mask = V4_PART_OF_V6((*netmask)); 2060 2061 if (v4addr == 0 || v4mask == 0) 2062 return (EINVAL); 2063 flent->fe_match = flow_ip_v4_match; 2064 } else { 2065 if (IN6_IS_ADDR_UNSPECIFIED(addr) || 2066 IN6_IS_ADDR_UNSPECIFIED(netmask)) 2067 return (EINVAL); 2068 flent->fe_match = flow_ip_v6_match; 2069 } 2070 return (0); 2071 } 2072 2073 static uint32_t 2074 flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent) 2075 { 2076 flow_desc_t *fd = &flent->fe_flow_desc; 2077 2078 return (fd->fd_protocol % ft->ft_size); 2079 } 2080 2081 static uint32_t 2082 flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent) 2083 { 2084 flow_desc_t *fd = &flent->fe_flow_desc; 2085 2086 /* 2087 * DSField flents are arranged as a single list. 2088 */ 2089 if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0) 2090 return (0); 2091 2092 /* 2093 * IP addr flents are hashed into two lists, v4 or v6. 2094 */ 2095 ASSERT(ft->ft_size >= 2); 2096 return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1); 2097 } 2098 2099 /* ARGSUSED */ 2100 static boolean_t 2101 flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) 2102 { 2103 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; 2104 2105 return (fd1->fd_protocol == fd2->fd_protocol); 2106 } 2107 2108 /* ARGSUSED */ 2109 static boolean_t 2110 flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) 2111 { 2112 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; 2113 in6_addr_t *a1, *m1, *a2, *m2; 2114 2115 ASSERT(fd1->fd_mask == fd2->fd_mask); 2116 if (fd1->fd_mask == FLOW_IP_DSFIELD) { 2117 return (fd1->fd_dsfield == fd2->fd_dsfield && 2118 fd1->fd_dsfield_mask == fd2->fd_dsfield_mask); 2119 } 2120 2121 /* 2122 * flow_ip_accept_fe() already validated the version. 2123 */ 2124 ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0); 2125 if (fd1->fd_ipversion != fd2->fd_ipversion) 2126 return (B_FALSE); 2127 2128 switch (fd1->fd_mask & ~FLOW_IP_VERSION) { 2129 case FLOW_IP_LOCAL: 2130 a1 = &fd1->fd_local_addr; 2131 m1 = &fd1->fd_local_netmask; 2132 a2 = &fd2->fd_local_addr; 2133 m2 = &fd2->fd_local_netmask; 2134 break; 2135 case FLOW_IP_REMOTE: 2136 a1 = &fd1->fd_remote_addr; 2137 m1 = &fd1->fd_remote_netmask; 2138 a2 = &fd2->fd_remote_addr; 2139 m2 = &fd2->fd_remote_netmask; 2140 break; 2141 default: 2142 /* 2143 * This is unreachable given the checks in 2144 * flow_ip_accept_fe(). 2145 */ 2146 return (B_FALSE); 2147 } 2148 2149 if (fd1->fd_ipversion == IPV4_VERSION) { 2150 return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) && 2151 V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2))); 2152 2153 } else { 2154 return (IN6_ARE_ADDR_EQUAL(a1, a2) && 2155 IN6_ARE_ADDR_EQUAL(m1, m2)); 2156 } 2157 } 2158 2159 static int 2160 flow_ip_mask2plen(in6_addr_t *v6mask) 2161 { 2162 int bits; 2163 int plen = IPV6_ABITS; 2164 int i; 2165 2166 for (i = 3; i >= 0; i--) { 2167 if (v6mask->s6_addr32[i] == 0) { 2168 plen -= 32; 2169 continue; 2170 } 2171 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1; 2172 if (bits == 0) 2173 break; 2174 plen -= bits; 2175 } 2176 return (plen); 2177 } 2178 2179 /* ARGSUSED */ 2180 static int 2181 flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp, 2182 flow_entry_t *flent) 2183 { 2184 flow_entry_t **p = headp; 2185 flow_desc_t *fd0, *fd; 2186 in6_addr_t *m0, *m; 2187 int plen0, plen; 2188 2189 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 2190 2191 /* 2192 * No special ordering needed for dsfield. 2193 */ 2194 fd0 = &flent->fe_flow_desc; 2195 if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) { 2196 if (*p != NULL) { 2197 ASSERT(flent->fe_next == NULL); 2198 flent->fe_next = *p; 2199 } 2200 *p = flent; 2201 return (0); 2202 } 2203 2204 /* 2205 * IP address flows are arranged in descending prefix length order. 2206 */ 2207 m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ? 2208 &fd0->fd_local_netmask : &fd0->fd_remote_netmask; 2209 plen0 = flow_ip_mask2plen(m0); 2210 ASSERT(plen0 != 0); 2211 2212 for (; *p != NULL; p = &(*p)->fe_next) { 2213 fd = &(*p)->fe_flow_desc; 2214 2215 /* 2216 * Normally a dsfield flent shouldn't end up on the same 2217 * list as an IP address because flow tables are (for now) 2218 * disjoint. If we decide to support both IP and dsfield 2219 * in the same table in the future, this check will allow 2220 * for that. 2221 */ 2222 if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0) 2223 continue; 2224 2225 /* 2226 * We also allow for the mixing of local and remote address 2227 * flents within one list. 2228 */ 2229 m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ? 2230 &fd->fd_local_netmask : &fd->fd_remote_netmask; 2231 plen = flow_ip_mask2plen(m); 2232 2233 if (plen <= plen0) 2234 break; 2235 } 2236 if (*p != NULL) { 2237 ASSERT(flent->fe_next == NULL); 2238 flent->fe_next = *p; 2239 } 2240 *p = flent; 2241 return (0); 2242 } 2243 2244 /* 2245 * Transport layer protocol and port matching functions. 2246 */ 2247 2248 /* ARGSUSED */ 2249 static boolean_t 2250 flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 2251 { 2252 flow_l3info_t *l3info = &s->fs_l3info; 2253 flow_l4info_t *l4info = &s->fs_l4info; 2254 flow_desc_t *fd = &flent->fe_flow_desc; 2255 2256 return (fd->fd_protocol == l3info->l3_protocol && 2257 fd->fd_local_port == l4info->l4_hash_port); 2258 } 2259 2260 /* ARGSUSED */ 2261 static boolean_t 2262 flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 2263 { 2264 flow_l3info_t *l3info = &s->fs_l3info; 2265 flow_l4info_t *l4info = &s->fs_l4info; 2266 flow_desc_t *fd = &flent->fe_flow_desc; 2267 2268 return (fd->fd_protocol == l3info->l3_protocol && 2269 fd->fd_remote_port == l4info->l4_hash_port); 2270 } 2271 2272 /* 2273 * Transport hash function. 2274 * Since we only support either local or remote port flows, 2275 * we only need to extract one of the ports to be used for 2276 * matching. 2277 */ 2278 static uint32_t 2279 flow_transport_hash(flow_tab_t *ft, flow_state_t *s) 2280 { 2281 flow_l3info_t *l3info = &s->fs_l3info; 2282 flow_l4info_t *l4info = &s->fs_l4info; 2283 uint8_t proto = l3info->l3_protocol; 2284 boolean_t dst_or_src; 2285 2286 if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) { 2287 dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0); 2288 } else { 2289 dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0); 2290 } 2291 2292 l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port : 2293 l4info->l4_src_port; 2294 2295 return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size); 2296 } 2297 2298 /* 2299 * Unlike other accept() functions above, we do not need to get the header 2300 * size because this is our highest layer so far. If we want to do support 2301 * other higher layer protocols, we would need to save the l4_hdrsize 2302 * in the code below. 2303 */ 2304 2305 /* ARGSUSED */ 2306 static int 2307 flow_transport_accept(flow_tab_t *ft, flow_state_t *s) 2308 { 2309 flow_l3info_t *l3info = &s->fs_l3info; 2310 flow_l4info_t *l4info = &s->fs_l4info; 2311 uint8_t proto = l3info->l3_protocol; 2312 uchar_t *l4_start; 2313 2314 l4_start = l3info->l3_start + l3info->l3_hdrsize; 2315 2316 /* 2317 * Adjust start pointer if we're at the end of an mblk. 2318 */ 2319 CHECK_AND_ADJUST_START_PTR(s, l4_start); 2320 2321 l4info->l4_start = l4_start; 2322 if (!OK_32PTR(l4_start)) 2323 return (EINVAL); 2324 2325 if (l3info->l3_fragmented == B_TRUE) 2326 return (EINVAL); 2327 2328 switch (proto) { 2329 case IPPROTO_TCP: { 2330 struct tcphdr *tcph = (struct tcphdr *)l4_start; 2331 2332 if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph))) 2333 return (ENOBUFS); 2334 2335 l4info->l4_src_port = tcph->th_sport; 2336 l4info->l4_dst_port = tcph->th_dport; 2337 break; 2338 } 2339 case IPPROTO_UDP: { 2340 struct udphdr *udph = (struct udphdr *)l4_start; 2341 2342 if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph))) 2343 return (ENOBUFS); 2344 2345 l4info->l4_src_port = udph->uh_sport; 2346 l4info->l4_dst_port = udph->uh_dport; 2347 break; 2348 } 2349 case IPPROTO_SCTP: { 2350 sctp_hdr_t *sctph = (sctp_hdr_t *)l4_start; 2351 2352 if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph))) 2353 return (ENOBUFS); 2354 2355 l4info->l4_src_port = sctph->sh_sport; 2356 l4info->l4_dst_port = sctph->sh_dport; 2357 break; 2358 } 2359 default: 2360 return (EINVAL); 2361 } 2362 2363 return (0); 2364 } 2365 2366 /* 2367 * Validates transport flow entry. 2368 * The protocol field must be present. 2369 */ 2370 2371 /* ARGSUSED */ 2372 static int 2373 flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent) 2374 { 2375 flow_desc_t *fd = &flent->fe_flow_desc; 2376 flow_mask_t mask = fd->fd_mask; 2377 2378 if ((mask & FLOW_IP_PROTOCOL) == 0) 2379 return (EINVAL); 2380 2381 switch (fd->fd_protocol) { 2382 case IPPROTO_TCP: 2383 case IPPROTO_UDP: 2384 case IPPROTO_SCTP: 2385 break; 2386 default: 2387 return (EINVAL); 2388 } 2389 2390 switch (mask & ~FLOW_IP_PROTOCOL) { 2391 case FLOW_ULP_PORT_LOCAL: 2392 if (fd->fd_local_port == 0) 2393 return (EINVAL); 2394 2395 flent->fe_match = flow_transport_lport_match; 2396 break; 2397 case FLOW_ULP_PORT_REMOTE: 2398 if (fd->fd_remote_port == 0) 2399 return (EINVAL); 2400 2401 flent->fe_match = flow_transport_rport_match; 2402 break; 2403 case 0: 2404 /* 2405 * transport-only flows conflicts with our table type. 2406 */ 2407 return (EOPNOTSUPP); 2408 default: 2409 return (EINVAL); 2410 } 2411 2412 return (0); 2413 } 2414 2415 static uint32_t 2416 flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent) 2417 { 2418 flow_desc_t *fd = &flent->fe_flow_desc; 2419 uint16_t port = 0; 2420 2421 port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ? 2422 fd->fd_local_port : fd->fd_remote_port; 2423 2424 return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size); 2425 } 2426 2427 /* ARGSUSED */ 2428 static boolean_t 2429 flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) 2430 { 2431 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; 2432 2433 if (fd1->fd_protocol != fd2->fd_protocol) 2434 return (B_FALSE); 2435 2436 if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) 2437 return (fd1->fd_local_port == fd2->fd_local_port); 2438 2439 if ((fd1->fd_mask & FLOW_ULP_PORT_REMOTE) != 0) 2440 return (fd1->fd_remote_port == fd2->fd_remote_port); 2441 2442 return (B_TRUE); 2443 } 2444 2445 static flow_ops_t flow_l2_ops = { 2446 flow_l2_accept_fe, 2447 flow_l2_hash_fe, 2448 flow_l2_match_fe, 2449 flow_generic_insert_fe, 2450 flow_l2_hash, 2451 {flow_l2_accept} 2452 }; 2453 2454 static flow_ops_t flow_ip_ops = { 2455 flow_ip_accept_fe, 2456 flow_ip_hash_fe, 2457 flow_ip_match_fe, 2458 flow_ip_insert_fe, 2459 flow_ip_hash, 2460 {flow_l2_accept, flow_ip_accept} 2461 }; 2462 2463 static flow_ops_t flow_ip_proto_ops = { 2464 flow_ip_proto_accept_fe, 2465 flow_ip_proto_hash_fe, 2466 flow_ip_proto_match_fe, 2467 flow_generic_insert_fe, 2468 flow_ip_proto_hash, 2469 {flow_l2_accept, flow_ip_accept} 2470 }; 2471 2472 static flow_ops_t flow_transport_ops = { 2473 flow_transport_accept_fe, 2474 flow_transport_hash_fe, 2475 flow_transport_match_fe, 2476 flow_generic_insert_fe, 2477 flow_transport_hash, 2478 {flow_l2_accept, flow_ip_accept, flow_transport_accept} 2479 }; 2480 2481 static flow_tab_info_t flow_tab_info_list[] = { 2482 {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2}, 2483 {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2}, 2484 {&flow_ip_ops, FLOW_IP_DSFIELD, 1}, 2485 {&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256}, 2486 {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024}, 2487 {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_REMOTE, 1024} 2488 }; 2489 2490 #define FLOW_MAX_TAB_INFO \ 2491 ((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t)) 2492 2493 static flow_tab_info_t * 2494 mac_flow_tab_info_get(flow_mask_t mask) 2495 { 2496 int i; 2497 2498 for (i = 0; i < FLOW_MAX_TAB_INFO; i++) { 2499 if (mask == flow_tab_info_list[i].fti_mask) 2500 return (&flow_tab_info_list[i]); 2501 } 2502 return (NULL); 2503 } 2504