1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/strsun.h> 28 #include <sys/sdt.h> 29 #include <sys/mac.h> 30 #include <sys/mac_impl.h> 31 #include <sys/mac_client_impl.h> 32 #include <sys/mac_stat.h> 33 #include <sys/dls.h> 34 #include <sys/dls_impl.h> 35 #include <sys/mac_soft_ring.h> 36 #include <sys/ethernet.h> 37 #include <sys/cpupart.h> 38 #include <sys/pool.h> 39 #include <sys/pool_pset.h> 40 #include <sys/vlan.h> 41 #include <inet/ip.h> 42 #include <inet/ip6.h> 43 #include <netinet/tcp.h> 44 #include <netinet/udp.h> 45 #include <netinet/sctp.h> 46 47 typedef struct flow_stats_s { 48 uint64_t fs_obytes; 49 uint64_t fs_opackets; 50 uint64_t fs_oerrors; 51 uint64_t fs_ibytes; 52 uint64_t fs_ipackets; 53 uint64_t fs_ierrors; 54 } flow_stats_t; 55 56 57 /* global flow table, will be a per exclusive-zone table later */ 58 static mod_hash_t *flow_hash; 59 static krwlock_t flow_tab_lock; 60 61 static kmem_cache_t *flow_cache; 62 static kmem_cache_t *flow_tab_cache; 63 static flow_ops_t flow_l2_ops; 64 65 typedef struct { 66 const char *fs_name; 67 uint_t fs_offset; 68 } flow_stats_info_t; 69 70 #define FS_OFF(f) (offsetof(flow_stats_t, f)) 71 static flow_stats_info_t flow_stats_list[] = { 72 {"rbytes", FS_OFF(fs_ibytes)}, 73 {"ipackets", FS_OFF(fs_ipackets)}, 74 {"ierrors", FS_OFF(fs_ierrors)}, 75 {"obytes", FS_OFF(fs_obytes)}, 76 {"opackets", FS_OFF(fs_opackets)}, 77 {"oerrors", FS_OFF(fs_oerrors)} 78 }; 79 #define FS_SIZE (sizeof (flow_stats_list) / sizeof (flow_stats_info_t)) 80 81 /* 82 * Checks whether a flow mask is legal. 83 */ 84 static flow_tab_info_t *mac_flow_tab_info_get(flow_mask_t); 85 86 static void 87 flow_stat_init(kstat_named_t *knp) 88 { 89 int i; 90 91 for (i = 0; i < FS_SIZE; i++, knp++) { 92 kstat_named_init(knp, flow_stats_list[i].fs_name, 93 KSTAT_DATA_UINT64); 94 } 95 } 96 97 static int 98 flow_stat_update(kstat_t *ksp, int rw) 99 { 100 flow_entry_t *fep = ksp->ks_private; 101 kstat_named_t *knp = ksp->ks_data; 102 uint64_t *statp; 103 int i; 104 mac_rx_stats_t *mac_rx_stat; 105 mac_tx_stats_t *mac_tx_stat; 106 flow_stats_t flow_stats; 107 mac_soft_ring_set_t *mac_srs; 108 109 if (rw != KSTAT_READ) 110 return (EACCES); 111 112 bzero(&flow_stats, sizeof (flow_stats_t)); 113 114 for (i = 0; i < fep->fe_rx_srs_cnt; i++) { 115 mac_srs = (mac_soft_ring_set_t *)fep->fe_rx_srs[i]; 116 if (mac_srs == NULL) /* Multicast flow */ 117 break; 118 mac_rx_stat = &mac_srs->srs_rx.sr_stat; 119 120 flow_stats.fs_ibytes += mac_rx_stat->mrs_intrbytes + 121 mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes; 122 123 flow_stats.fs_ipackets += mac_rx_stat->mrs_intrcnt + 124 mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt; 125 126 flow_stats.fs_ierrors += mac_rx_stat->mrs_ierrors; 127 } 128 129 mac_srs = (mac_soft_ring_set_t *)fep->fe_tx_srs; 130 if (mac_srs == NULL) /* Multicast flow */ 131 goto done; 132 mac_tx_stat = &mac_srs->srs_tx.st_stat; 133 134 flow_stats.fs_obytes = mac_tx_stat->mts_obytes; 135 flow_stats.fs_opackets = mac_tx_stat->mts_opackets; 136 flow_stats.fs_oerrors = mac_tx_stat->mts_oerrors; 137 138 done: 139 for (i = 0; i < FS_SIZE; i++, knp++) { 140 statp = (uint64_t *) 141 ((uchar_t *)&flow_stats + flow_stats_list[i].fs_offset); 142 knp->value.ui64 = *statp; 143 } 144 return (0); 145 } 146 147 static void 148 flow_stat_create(flow_entry_t *fep) 149 { 150 kstat_t *ksp; 151 kstat_named_t *knp; 152 uint_t nstats = FS_SIZE; 153 154 /* 155 * Fow now, flow entries are only manipulated and visible from the 156 * global zone. 157 */ 158 ksp = kstat_create_zone("unix", 0, (char *)fep->fe_flow_name, "flow", 159 KSTAT_TYPE_NAMED, nstats, 0, GLOBAL_ZONEID); 160 if (ksp == NULL) 161 return; 162 163 ksp->ks_update = flow_stat_update; 164 ksp->ks_private = fep; 165 fep->fe_ksp = ksp; 166 167 knp = (kstat_named_t *)ksp->ks_data; 168 flow_stat_init(knp); 169 kstat_install(ksp); 170 } 171 172 void 173 flow_stat_destroy(flow_entry_t *fep) 174 { 175 if (fep->fe_ksp != NULL) { 176 kstat_delete(fep->fe_ksp); 177 fep->fe_ksp = NULL; 178 } 179 } 180 181 /* 182 * Initialize the flow table 183 */ 184 void 185 mac_flow_init() 186 { 187 flow_cache = kmem_cache_create("flow_entry_cache", 188 sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 189 flow_tab_cache = kmem_cache_create("flow_tab_cache", 190 sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 191 flow_hash = mod_hash_create_extended("flow_hash", 192 100, mod_hash_null_keydtor, mod_hash_null_valdtor, 193 mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); 194 rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL); 195 } 196 197 /* 198 * Cleanup and release the flow table 199 */ 200 void 201 mac_flow_fini() 202 { 203 kmem_cache_destroy(flow_cache); 204 kmem_cache_destroy(flow_tab_cache); 205 mod_hash_destroy_hash(flow_hash); 206 rw_destroy(&flow_tab_lock); 207 } 208 209 /* 210 * mac_create_flow(): create a flow_entry_t. 211 */ 212 int 213 mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name, 214 void *client_cookie, uint_t type, flow_entry_t **flentp) 215 { 216 flow_entry_t *flent = *flentp; 217 int err = 0; 218 219 if (mrp != NULL) { 220 err = mac_validate_props(NULL, mrp); 221 if (err != 0) 222 return (err); 223 } 224 225 if (flent == NULL) { 226 flent = kmem_cache_alloc(flow_cache, KM_SLEEP); 227 bzero(flent, sizeof (*flent)); 228 mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL); 229 cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL); 230 231 /* Initialize the receiver function to a safe routine */ 232 flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; 233 flent->fe_index = -1; 234 } 235 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); 236 237 /* This is an initial flow, will be configured later */ 238 if (fd == NULL) { 239 *flentp = flent; 240 return (0); 241 } 242 243 flent->fe_client_cookie = client_cookie; 244 flent->fe_type = type; 245 246 /* Save flow desc */ 247 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); 248 249 if (mrp != NULL) { 250 /* 251 * We have already set fe_resource_props for a Link. 252 */ 253 if (type & FLOW_USER) { 254 bcopy(mrp, &flent->fe_resource_props, 255 sizeof (mac_resource_props_t)); 256 } 257 /* 258 * The effective resource list should reflect the priority 259 * that we set implicitly. 260 */ 261 if (!(mrp->mrp_mask & MRP_PRIORITY)) 262 mrp->mrp_mask |= MRP_PRIORITY; 263 if (type & FLOW_USER) 264 mrp->mrp_priority = MPL_SUBFLOW_DEFAULT; 265 else 266 mrp->mrp_priority = MPL_LINK_DEFAULT; 267 bzero(mrp->mrp_pool, MAXPATHLEN); 268 bzero(&mrp->mrp_cpus, sizeof (mac_cpus_t)); 269 bcopy(mrp, &flent->fe_effective_props, 270 sizeof (mac_resource_props_t)); 271 } 272 flow_stat_create(flent); 273 274 *flentp = flent; 275 return (0); 276 } 277 278 /* 279 * Validate flow entry and add it to a flow table. 280 */ 281 int 282 mac_flow_add(flow_tab_t *ft, flow_entry_t *flent) 283 { 284 flow_entry_t **headp, **p; 285 flow_ops_t *ops = &ft->ft_ops; 286 flow_mask_t mask; 287 uint32_t index; 288 int err; 289 290 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 291 292 /* 293 * Check for invalid bits in mask. 294 */ 295 mask = flent->fe_flow_desc.fd_mask; 296 if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0) 297 return (EOPNOTSUPP); 298 299 /* 300 * Validate flent. 301 */ 302 if ((err = ops->fo_accept_fe(ft, flent)) != 0) { 303 DTRACE_PROBE3(accept_failed, flow_tab_t *, ft, 304 flow_entry_t *, flent, int, err); 305 return (err); 306 } 307 308 /* 309 * Flent is valid. now calculate hash and insert it 310 * into hash table. 311 */ 312 index = ops->fo_hash_fe(ft, flent); 313 314 /* 315 * We do not need a lock up until now because we were 316 * not accessing the flow table. 317 */ 318 rw_enter(&ft->ft_lock, RW_WRITER); 319 headp = &ft->ft_table[index]; 320 321 /* 322 * Check for duplicate flow. 323 */ 324 for (p = headp; *p != NULL; p = &(*p)->fe_next) { 325 if ((*p)->fe_flow_desc.fd_mask != 326 flent->fe_flow_desc.fd_mask) 327 continue; 328 329 if (ft->ft_ops.fo_match_fe(ft, *p, flent)) { 330 rw_exit(&ft->ft_lock); 331 DTRACE_PROBE3(dup_flow, flow_tab_t *, ft, 332 flow_entry_t *, flent, int, err); 333 return (EALREADY); 334 } 335 } 336 337 /* 338 * Insert flow to hash list. 339 */ 340 err = ops->fo_insert_fe(ft, headp, flent); 341 if (err != 0) { 342 rw_exit(&ft->ft_lock); 343 DTRACE_PROBE3(insert_failed, flow_tab_t *, ft, 344 flow_entry_t *, flent, int, err); 345 return (err); 346 } 347 348 /* 349 * Save the hash index so it can be used by mac_flow_remove(). 350 */ 351 flent->fe_index = (int)index; 352 353 /* 354 * Save the flow tab back reference. 355 */ 356 flent->fe_flow_tab = ft; 357 FLOW_MARK(flent, FE_FLOW_TAB); 358 ft->ft_flow_count++; 359 rw_exit(&ft->ft_lock); 360 return (0); 361 } 362 363 /* 364 * Remove a flow from a mac client's subflow table 365 */ 366 void 367 mac_flow_rem_subflow(flow_entry_t *flent) 368 { 369 flow_tab_t *ft = flent->fe_flow_tab; 370 mac_client_impl_t *mcip = ft->ft_mcip; 371 mac_handle_t mh = (mac_handle_t)ft->ft_mip; 372 373 ASSERT(MAC_PERIM_HELD(mh)); 374 375 mac_flow_remove(ft, flent, B_FALSE); 376 if (flent->fe_mcip == NULL) { 377 /* 378 * The interface is not yet plumbed and mac_client_flow_add 379 * was not done. 380 */ 381 if (FLOW_TAB_EMPTY(ft)) { 382 mac_flow_tab_destroy(ft); 383 mcip->mci_subflow_tab = NULL; 384 } 385 } else { 386 mac_flow_wait(flent, FLOW_DRIVER_UPCALL); 387 mac_link_flow_clean((mac_client_handle_t)mcip, flent); 388 } 389 mac_fastpath_enable(mh); 390 } 391 392 /* 393 * Add a flow to a mac client's subflow table and instantiate the flow 394 * in the mac by creating the associated SRSs etc. 395 */ 396 int 397 mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent, 398 boolean_t instantiate_flow) 399 { 400 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 401 mac_handle_t mh = (mac_handle_t)mcip->mci_mip; 402 flow_tab_info_t *ftinfo; 403 flow_mask_t mask; 404 flow_tab_t *ft; 405 int err; 406 boolean_t ft_created = B_FALSE; 407 408 ASSERT(MAC_PERIM_HELD(mh)); 409 410 if ((err = mac_fastpath_disable(mh)) != 0) 411 return (err); 412 413 /* 414 * If the subflow table exists already just add the new subflow 415 * to the existing table, else we create a new subflow table below. 416 */ 417 ft = mcip->mci_subflow_tab; 418 if (ft == NULL) { 419 mask = flent->fe_flow_desc.fd_mask; 420 /* 421 * Try to create a new table and then add the subflow to the 422 * newly created subflow table 423 */ 424 if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) { 425 mac_fastpath_enable(mh); 426 return (EOPNOTSUPP); 427 } 428 429 mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size, 430 mcip->mci_mip, &ft); 431 ft_created = B_TRUE; 432 } 433 434 err = mac_flow_add(ft, flent); 435 if (err != 0) { 436 if (ft_created) 437 mac_flow_tab_destroy(ft); 438 mac_fastpath_enable(mh); 439 return (err); 440 } 441 442 if (instantiate_flow) { 443 /* Now activate the flow by creating its SRSs */ 444 ASSERT(MCIP_DATAPATH_SETUP(mcip)); 445 err = mac_link_flow_init((mac_client_handle_t)mcip, flent); 446 if (err != 0) { 447 mac_flow_remove(ft, flent, B_FALSE); 448 if (ft_created) 449 mac_flow_tab_destroy(ft); 450 mac_fastpath_enable(mh); 451 return (err); 452 } 453 } else { 454 FLOW_MARK(flent, FE_UF_NO_DATAPATH); 455 } 456 if (ft_created) { 457 ASSERT(mcip->mci_subflow_tab == NULL); 458 ft->ft_mcip = mcip; 459 mcip->mci_subflow_tab = ft; 460 if (instantiate_flow) 461 mac_client_update_classifier(mcip, B_TRUE); 462 } 463 return (0); 464 } 465 466 /* 467 * Remove flow entry from flow table. 468 */ 469 void 470 mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp) 471 { 472 flow_entry_t **fp; 473 474 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 475 if (!(flent->fe_flags & FE_FLOW_TAB)) 476 return; 477 478 rw_enter(&ft->ft_lock, RW_WRITER); 479 /* 480 * If this is a permanent removal from the flow table, mark it 481 * CONDEMNED to prevent future references. If this is a temporary 482 * removal from the table, say to update the flow descriptor then 483 * we don't mark it CONDEMNED 484 */ 485 if (!temp) 486 FLOW_MARK(flent, FE_CONDEMNED); 487 /* 488 * Locate the specified flent. 489 */ 490 fp = &ft->ft_table[flent->fe_index]; 491 while (*fp != flent) 492 fp = &(*fp)->fe_next; 493 494 /* 495 * The flent must exist. Otherwise it's a bug. 496 */ 497 ASSERT(fp != NULL); 498 *fp = flent->fe_next; 499 flent->fe_next = NULL; 500 501 /* 502 * Reset fe_index to -1 so any attempt to call mac_flow_remove() 503 * on a flent that is supposed to be in the table (FE_FLOW_TAB) 504 * will panic. 505 */ 506 flent->fe_index = -1; 507 FLOW_UNMARK(flent, FE_FLOW_TAB); 508 ft->ft_flow_count--; 509 rw_exit(&ft->ft_lock); 510 } 511 512 /* 513 * This is the flow lookup routine used by the mac sw classifier engine. 514 */ 515 int 516 mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp) 517 { 518 flow_state_t s; 519 flow_entry_t *flent; 520 flow_ops_t *ops = &ft->ft_ops; 521 boolean_t retried = B_FALSE; 522 int i, err; 523 524 s.fs_flags = flags; 525 retry: 526 s.fs_mp = mp; 527 528 /* 529 * Walk the list of predeclared accept functions. 530 * Each of these would accumulate enough state to allow the next 531 * accept routine to make progress. 532 */ 533 for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) { 534 if ((err = (ops->fo_accept[i])(ft, &s)) != 0) { 535 mblk_t *last; 536 537 /* 538 * ENOBUFS indicates that the mp could be too short 539 * and may need a pullup. 540 */ 541 if (err != ENOBUFS || retried) 542 return (err); 543 544 /* 545 * The pullup is done on the last processed mblk, not 546 * the starting one. pullup is not done if the mblk 547 * has references or if b_cont is NULL. 548 */ 549 last = s.fs_mp; 550 if (DB_REF(last) > 1 || last->b_cont == NULL || 551 pullupmsg(last, -1) == 0) 552 return (EINVAL); 553 554 retried = B_TRUE; 555 DTRACE_PROBE2(need_pullup, flow_tab_t *, ft, 556 flow_state_t *, &s); 557 goto retry; 558 } 559 } 560 561 /* 562 * The packet is considered sane. We may now attempt to 563 * find the corresponding flent. 564 */ 565 rw_enter(&ft->ft_lock, RW_READER); 566 flent = ft->ft_table[ops->fo_hash(ft, &s)]; 567 for (; flent != NULL; flent = flent->fe_next) { 568 if (flent->fe_match(ft, flent, &s)) { 569 FLOW_TRY_REFHOLD(flent, err); 570 if (err != 0) 571 continue; 572 *flentp = flent; 573 rw_exit(&ft->ft_lock); 574 return (0); 575 } 576 } 577 rw_exit(&ft->ft_lock); 578 return (ENOENT); 579 } 580 581 /* 582 * Walk flow table. 583 * The caller is assumed to have proper perimeter protection. 584 */ 585 int 586 mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *), 587 void *arg) 588 { 589 int err, i, cnt = 0; 590 flow_entry_t *flent; 591 592 if (ft == NULL) 593 return (0); 594 595 for (i = 0; i < ft->ft_size; i++) { 596 for (flent = ft->ft_table[i]; flent != NULL; 597 flent = flent->fe_next) { 598 cnt++; 599 err = (*fn)(flent, arg); 600 if (err != 0) 601 return (err); 602 } 603 } 604 VERIFY(cnt == ft->ft_flow_count); 605 return (0); 606 } 607 608 /* 609 * Same as the above except a mutex is used for protection here. 610 */ 611 int 612 mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *), 613 void *arg) 614 { 615 int err; 616 617 if (ft == NULL) 618 return (0); 619 620 rw_enter(&ft->ft_lock, RW_WRITER); 621 err = mac_flow_walk_nolock(ft, fn, arg); 622 rw_exit(&ft->ft_lock); 623 return (err); 624 } 625 626 static boolean_t mac_flow_clean(flow_entry_t *); 627 628 /* 629 * Destroy a flow entry. Called when the last reference on a flow is released. 630 */ 631 void 632 mac_flow_destroy(flow_entry_t *flent) 633 { 634 ASSERT(flent->fe_refcnt == 0); 635 636 if ((flent->fe_type & FLOW_USER) != 0) { 637 ASSERT(mac_flow_clean(flent)); 638 } else { 639 mac_flow_cleanup(flent); 640 } 641 mac_misc_stat_delete(flent); 642 mutex_destroy(&flent->fe_lock); 643 cv_destroy(&flent->fe_cv); 644 flow_stat_destroy(flent); 645 kmem_cache_free(flow_cache, flent); 646 } 647 648 /* 649 * XXX eric 650 * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and 651 * mac_link_flow_modify() should really be moved/reworked into the 652 * two functions below. This would consolidate all the mac property 653 * checking in one place. I'm leaving this alone for now since it's 654 * out of scope of the new flows work. 655 */ 656 /* ARGSUSED */ 657 uint32_t 658 mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp) 659 { 660 uint32_t changed_mask = 0; 661 mac_resource_props_t *fmrp = &flent->fe_effective_props; 662 int i; 663 664 if ((mrp->mrp_mask & MRP_MAXBW) != 0 && 665 (!(fmrp->mrp_mask & MRP_MAXBW) || 666 (fmrp->mrp_maxbw != mrp->mrp_maxbw))) { 667 changed_mask |= MRP_MAXBW; 668 if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) { 669 fmrp->mrp_mask &= ~MRP_MAXBW; 670 fmrp->mrp_maxbw = 0; 671 } else { 672 fmrp->mrp_mask |= MRP_MAXBW; 673 fmrp->mrp_maxbw = mrp->mrp_maxbw; 674 } 675 } 676 677 if ((mrp->mrp_mask & MRP_PRIORITY) != 0) { 678 if (fmrp->mrp_priority != mrp->mrp_priority) 679 changed_mask |= MRP_PRIORITY; 680 if (mrp->mrp_priority == MPL_RESET) { 681 fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT; 682 fmrp->mrp_mask &= ~MRP_PRIORITY; 683 } else { 684 fmrp->mrp_priority = mrp->mrp_priority; 685 fmrp->mrp_mask |= MRP_PRIORITY; 686 } 687 } 688 689 /* modify fanout */ 690 if ((mrp->mrp_mask & MRP_CPUS) != 0) { 691 if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) && 692 (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) { 693 for (i = 0; i < mrp->mrp_ncpus; i++) { 694 if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i]) 695 break; 696 } 697 if (i == mrp->mrp_ncpus) { 698 /* 699 * The new set of cpus passed is exactly 700 * the same as the existing set. 701 */ 702 return (changed_mask); 703 } 704 } 705 changed_mask |= MRP_CPUS; 706 MAC_COPY_CPUS(mrp, fmrp); 707 } 708 709 /* 710 * Modify the rings property. 711 */ 712 if (mrp->mrp_mask & MRP_RX_RINGS || mrp->mrp_mask & MRP_TX_RINGS) 713 mac_set_rings_effective(flent->fe_mcip); 714 715 if ((mrp->mrp_mask & MRP_POOL) != 0) { 716 if (strcmp(fmrp->mrp_pool, mrp->mrp_pool) != 0) 717 changed_mask |= MRP_POOL; 718 if (strlen(mrp->mrp_pool) == 0) 719 fmrp->mrp_mask &= ~MRP_POOL; 720 else 721 fmrp->mrp_mask |= MRP_POOL; 722 (void) strncpy(fmrp->mrp_pool, mrp->mrp_pool, MAXPATHLEN); 723 } 724 return (changed_mask); 725 } 726 727 void 728 mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp) 729 { 730 uint32_t changed_mask; 731 mac_client_impl_t *mcip = flent->fe_mcip; 732 mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip); 733 mac_resource_props_t *emrp = MCIP_EFFECTIVE_PROPS(mcip); 734 cpupart_t *cpupart = NULL; 735 boolean_t use_default = B_FALSE; 736 737 ASSERT(flent != NULL); 738 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 739 740 rw_enter(&ft->ft_lock, RW_WRITER); 741 742 /* Update the cached values inside the subflow entry */ 743 changed_mask = mac_flow_modify_props(flent, mrp); 744 rw_exit(&ft->ft_lock); 745 /* 746 * Push the changed parameters to the scheduling code in the 747 * SRS's, to take effect right away. 748 */ 749 if (changed_mask & MRP_MAXBW) { 750 mac_srs_update_bwlimit(flent, mrp); 751 /* 752 * If bandwidth is changed, we may have to change 753 * the number of soft ring to be used for fanout. 754 * Call mac_flow_update_fanout() if MAC_BIND_CPU 755 * is not set and there is no user supplied cpu 756 * info. This applies only to link at this time. 757 */ 758 if (!(flent->fe_type & FLOW_USER) && 759 !(changed_mask & MRP_CPUS) && 760 !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) { 761 mac_fanout_setup(mcip, flent, mcip_mrp, 762 mac_rx_deliver, mcip, NULL, NULL); 763 } 764 } 765 if (mrp->mrp_mask & MRP_PRIORITY) 766 mac_flow_update_priority(mcip, flent); 767 768 if (changed_mask & MRP_CPUS) 769 mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL, 770 NULL); 771 772 if (mrp->mrp_mask & MRP_POOL) { 773 pool_lock(); 774 cpupart = mac_pset_find(mrp, &use_default); 775 mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL, 776 cpupart); 777 mac_set_pool_effective(use_default, cpupart, mrp, emrp); 778 pool_unlock(); 779 } 780 } 781 782 /* 783 * This function waits for a certain condition to be met and is generally 784 * used before a destructive or quiescing operation. 785 */ 786 void 787 mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event) 788 { 789 mutex_enter(&flent->fe_lock); 790 flent->fe_flags |= FE_WAITER; 791 792 switch (event) { 793 case FLOW_DRIVER_UPCALL: 794 /* 795 * We want to make sure the driver upcalls have finished before 796 * we signal the Rx SRS worker to quit. 797 */ 798 while (flent->fe_refcnt != 1) 799 cv_wait(&flent->fe_cv, &flent->fe_lock); 800 break; 801 802 case FLOW_USER_REF: 803 /* 804 * Wait for the fe_user_refcnt to drop to 0. The flow has 805 * been removed from the global flow hash. 806 */ 807 ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH)); 808 while (flent->fe_user_refcnt != 0) 809 cv_wait(&flent->fe_cv, &flent->fe_lock); 810 break; 811 812 default: 813 ASSERT(0); 814 } 815 816 flent->fe_flags &= ~FE_WAITER; 817 mutex_exit(&flent->fe_lock); 818 } 819 820 static boolean_t 821 mac_flow_clean(flow_entry_t *flent) 822 { 823 ASSERT(flent->fe_next == NULL); 824 ASSERT(flent->fe_tx_srs == NULL); 825 ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL); 826 ASSERT(flent->fe_mbg == NULL); 827 828 return (B_TRUE); 829 } 830 831 void 832 mac_flow_cleanup(flow_entry_t *flent) 833 { 834 if ((flent->fe_type & FLOW_USER) == 0) { 835 ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) || 836 (flent->fe_mbg != NULL && flent->fe_mcip == NULL)); 837 ASSERT(flent->fe_refcnt == 0); 838 } else { 839 ASSERT(flent->fe_refcnt == 1); 840 } 841 842 if (flent->fe_mbg != NULL) { 843 ASSERT(flent->fe_tx_srs == NULL); 844 /* This is a multicast or broadcast flow entry */ 845 mac_bcast_grp_free(flent->fe_mbg); 846 flent->fe_mbg = NULL; 847 } 848 849 if (flent->fe_tx_srs != NULL) { 850 ASSERT(flent->fe_mbg == NULL); 851 mac_srs_free(flent->fe_tx_srs); 852 flent->fe_tx_srs = NULL; 853 } 854 855 /* 856 * In the normal case fe_rx_srs_cnt is 1. However in the error case 857 * when mac_unicast_add fails we may not have set up any SRS 858 * in which case fe_rx_srs_cnt will be zero. 859 */ 860 if (flent->fe_rx_srs_cnt != 0) { 861 ASSERT(flent->fe_rx_srs_cnt == 1); 862 mac_srs_free(flent->fe_rx_srs[0]); 863 flent->fe_rx_srs[0] = NULL; 864 flent->fe_rx_srs_cnt = 0; 865 } 866 ASSERT(flent->fe_rx_srs[0] == NULL); 867 } 868 869 void 870 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd) 871 { 872 /* 873 * Grab the fe_lock to see a self-consistent fe_flow_desc. 874 * Updates to the fe_flow_desc happen under the fe_lock 875 * after removing the flent from the flow table 876 */ 877 mutex_enter(&flent->fe_lock); 878 bcopy(&flent->fe_flow_desc, fd, sizeof (*fd)); 879 mutex_exit(&flent->fe_lock); 880 } 881 882 /* 883 * Update a field of a flow entry. The mac perimeter ensures that 884 * this is the only thread doing a modify operation on this mac end point. 885 * So the flow table can't change or disappear. The ft_lock protects access 886 * to the flow entry, and holding the lock ensures that there isn't any thread 887 * accessing the flow entry or attempting a flow table lookup. However 888 * data threads that are using the flow entry based on the old descriptor 889 * will continue to use the flow entry. If strong coherence is required 890 * then the flow will have to be quiesced before the descriptor can be 891 * changed. 892 */ 893 void 894 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd) 895 { 896 flow_tab_t *ft = flent->fe_flow_tab; 897 flow_desc_t old_desc; 898 int err; 899 900 if (ft == NULL) { 901 /* 902 * The flow hasn't yet been inserted into the table, 903 * so only the caller knows about this flow, however for 904 * uniformity we grab the fe_lock here. 905 */ 906 mutex_enter(&flent->fe_lock); 907 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); 908 mutex_exit(&flent->fe_lock); 909 } 910 911 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 912 913 /* 914 * Need to remove the flow entry from the table and reinsert it, 915 * into a potentially diference hash line. The hash depends on 916 * the new descriptor fields. However access to fe_desc itself 917 * is always under the fe_lock. This helps log and stat functions 918 * see a self-consistent fe_flow_desc. 919 */ 920 mac_flow_remove(ft, flent, B_TRUE); 921 old_desc = flent->fe_flow_desc; 922 923 mutex_enter(&flent->fe_lock); 924 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); 925 mutex_exit(&flent->fe_lock); 926 927 if (mac_flow_add(ft, flent) != 0) { 928 /* 929 * The add failed say due to an invalid flow descriptor. 930 * Undo the update 931 */ 932 flent->fe_flow_desc = old_desc; 933 err = mac_flow_add(ft, flent); 934 ASSERT(err == 0); 935 } 936 } 937 938 void 939 mac_flow_set_name(flow_entry_t *flent, const char *name) 940 { 941 flow_tab_t *ft = flent->fe_flow_tab; 942 943 if (ft == NULL) { 944 /* 945 * The flow hasn't yet been inserted into the table, 946 * so only the caller knows about this flow 947 */ 948 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); 949 } else { 950 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 951 } 952 953 mutex_enter(&flent->fe_lock); 954 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); 955 mutex_exit(&flent->fe_lock); 956 } 957 958 /* 959 * Return the client-private cookie that was associated with 960 * the flow when it was created. 961 */ 962 void * 963 mac_flow_get_client_cookie(flow_entry_t *flent) 964 { 965 return (flent->fe_client_cookie); 966 } 967 968 /* 969 * Forward declarations. 970 */ 971 static uint32_t flow_l2_hash(flow_tab_t *, flow_state_t *); 972 static uint32_t flow_l2_hash_fe(flow_tab_t *, flow_entry_t *); 973 static int flow_l2_accept(flow_tab_t *, flow_state_t *); 974 static uint32_t flow_ether_hash(flow_tab_t *, flow_state_t *); 975 static uint32_t flow_ether_hash_fe(flow_tab_t *, flow_entry_t *); 976 static int flow_ether_accept(flow_tab_t *, flow_state_t *); 977 978 /* 979 * Create flow table. 980 */ 981 void 982 mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size, 983 mac_impl_t *mip, flow_tab_t **ftp) 984 { 985 flow_tab_t *ft; 986 flow_ops_t *new_ops; 987 988 ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP); 989 bzero(ft, sizeof (*ft)); 990 991 ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP); 992 993 /* 994 * We make a copy of the ops vector instead of just pointing to it 995 * because we might want to customize the ops vector on a per table 996 * basis (e.g. for optimization). 997 */ 998 new_ops = &ft->ft_ops; 999 bcopy(ops, new_ops, sizeof (*ops)); 1000 ft->ft_mask = mask; 1001 ft->ft_size = size; 1002 ft->ft_mip = mip; 1003 1004 /* 1005 * Optimizations for DL_ETHER media. 1006 */ 1007 if (mip->mi_info.mi_nativemedia == DL_ETHER) { 1008 if (new_ops->fo_hash == flow_l2_hash) 1009 new_ops->fo_hash = flow_ether_hash; 1010 if (new_ops->fo_hash_fe == flow_l2_hash_fe) 1011 new_ops->fo_hash_fe = flow_ether_hash_fe; 1012 if (new_ops->fo_accept[0] == flow_l2_accept) 1013 new_ops->fo_accept[0] = flow_ether_accept; 1014 } 1015 *ftp = ft; 1016 } 1017 1018 void 1019 mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp) 1020 { 1021 mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID, 1022 1024, mip, ftp); 1023 } 1024 1025 /* 1026 * Destroy flow table. 1027 */ 1028 void 1029 mac_flow_tab_destroy(flow_tab_t *ft) 1030 { 1031 if (ft == NULL) 1032 return; 1033 1034 ASSERT(ft->ft_flow_count == 0); 1035 kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *)); 1036 bzero(ft, sizeof (*ft)); 1037 kmem_cache_free(flow_tab_cache, ft); 1038 } 1039 1040 /* 1041 * Add a new flow entry to the global flow hash table 1042 */ 1043 int 1044 mac_flow_hash_add(flow_entry_t *flent) 1045 { 1046 int err; 1047 1048 rw_enter(&flow_tab_lock, RW_WRITER); 1049 err = mod_hash_insert(flow_hash, 1050 (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent); 1051 if (err != 0) { 1052 rw_exit(&flow_tab_lock); 1053 return (EEXIST); 1054 } 1055 /* Mark as inserted into the global flow hash table */ 1056 FLOW_MARK(flent, FE_G_FLOW_HASH); 1057 rw_exit(&flow_tab_lock); 1058 return (err); 1059 } 1060 1061 /* 1062 * Remove a flow entry from the global flow hash table 1063 */ 1064 void 1065 mac_flow_hash_remove(flow_entry_t *flent) 1066 { 1067 mod_hash_val_t val; 1068 1069 rw_enter(&flow_tab_lock, RW_WRITER); 1070 VERIFY(mod_hash_remove(flow_hash, 1071 (mod_hash_key_t)flent->fe_flow_name, &val) == 0); 1072 1073 /* Clear the mark that says inserted into the global flow hash table */ 1074 FLOW_UNMARK(flent, FE_G_FLOW_HASH); 1075 rw_exit(&flow_tab_lock); 1076 } 1077 1078 /* 1079 * Retrieve a flow entry from the global flow hash table. 1080 */ 1081 int 1082 mac_flow_lookup_byname(char *name, flow_entry_t **flentp) 1083 { 1084 int err; 1085 flow_entry_t *flent; 1086 1087 rw_enter(&flow_tab_lock, RW_READER); 1088 err = mod_hash_find(flow_hash, (mod_hash_key_t)name, 1089 (mod_hash_val_t *)&flent); 1090 if (err != 0) { 1091 rw_exit(&flow_tab_lock); 1092 return (ENOENT); 1093 } 1094 ASSERT(flent != NULL); 1095 FLOW_USER_REFHOLD(flent); 1096 rw_exit(&flow_tab_lock); 1097 1098 *flentp = flent; 1099 return (0); 1100 } 1101 1102 /* 1103 * Initialize or release mac client flows by walking the subflow table. 1104 * These are typically invoked during plumb/unplumb of links. 1105 */ 1106 1107 static int 1108 mac_link_init_flows_cb(flow_entry_t *flent, void *arg) 1109 { 1110 mac_client_impl_t *mcip = arg; 1111 1112 if (mac_link_flow_init(arg, flent) != 0) { 1113 cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'", 1114 flent->fe_flow_name, mcip->mci_name); 1115 } else { 1116 FLOW_UNMARK(flent, FE_UF_NO_DATAPATH); 1117 } 1118 return (0); 1119 } 1120 1121 void 1122 mac_link_init_flows(mac_client_handle_t mch) 1123 { 1124 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1125 1126 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab, 1127 mac_link_init_flows_cb, mcip); 1128 /* 1129 * If mac client had subflow(s) configured before plumb, change 1130 * function to mac_rx_srs_subflow_process and in case of hardware 1131 * classification, disable polling. 1132 */ 1133 mac_client_update_classifier(mcip, B_TRUE); 1134 1135 } 1136 1137 boolean_t 1138 mac_link_has_flows(mac_client_handle_t mch) 1139 { 1140 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1141 1142 if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab)) 1143 return (B_TRUE); 1144 1145 return (B_FALSE); 1146 } 1147 1148 static int 1149 mac_link_release_flows_cb(flow_entry_t *flent, void *arg) 1150 { 1151 FLOW_MARK(flent, FE_UF_NO_DATAPATH); 1152 mac_flow_wait(flent, FLOW_DRIVER_UPCALL); 1153 mac_link_flow_clean(arg, flent); 1154 return (0); 1155 } 1156 1157 void 1158 mac_link_release_flows(mac_client_handle_t mch) 1159 { 1160 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1161 1162 /* 1163 * Change the mci_flent callback back to mac_rx_srs_process() 1164 * because flows are about to be deactivated. 1165 */ 1166 mac_client_update_classifier(mcip, B_FALSE); 1167 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab, 1168 mac_link_release_flows_cb, mcip); 1169 } 1170 1171 void 1172 mac_rename_flow(flow_entry_t *fep, const char *new_name) 1173 { 1174 mac_flow_set_name(fep, new_name); 1175 if (fep->fe_ksp != NULL) { 1176 flow_stat_destroy(fep); 1177 flow_stat_create(fep); 1178 } 1179 } 1180 1181 /* 1182 * mac_link_flow_init() 1183 * Internal flow interface used for allocating SRSs and related 1184 * data structures. Not meant to be used by mac clients. 1185 */ 1186 int 1187 mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow) 1188 { 1189 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1190 mac_impl_t *mip = mcip->mci_mip; 1191 int err; 1192 1193 ASSERT(mch != NULL); 1194 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); 1195 1196 if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0) 1197 return (err); 1198 1199 sub_flow->fe_mcip = mcip; 1200 1201 return (0); 1202 } 1203 1204 /* 1205 * mac_link_flow_add() 1206 * Used by flowadm(1m) or kernel mac clients for creating flows. 1207 */ 1208 int 1209 mac_link_flow_add(datalink_id_t linkid, char *flow_name, 1210 flow_desc_t *flow_desc, mac_resource_props_t *mrp) 1211 { 1212 flow_entry_t *flent = NULL; 1213 int err; 1214 dls_dl_handle_t dlh; 1215 dls_link_t *dlp; 1216 boolean_t link_held = B_FALSE; 1217 boolean_t hash_added = B_FALSE; 1218 mac_perim_handle_t mph; 1219 1220 err = mac_flow_lookup_byname(flow_name, &flent); 1221 if (err == 0) { 1222 FLOW_USER_REFRELE(flent); 1223 return (EEXIST); 1224 } 1225 1226 /* 1227 * First create a flow entry given the description provided 1228 * by the caller. 1229 */ 1230 err = mac_flow_create(flow_desc, mrp, flow_name, NULL, 1231 FLOW_USER | FLOW_OTHER, &flent); 1232 1233 if (err != 0) 1234 return (err); 1235 1236 /* 1237 * We've got a local variable referencing this flow now, so we need 1238 * to hold it. We'll release this flow before returning. 1239 * All failures until we return will undo any action that may internally 1240 * held the flow, so the last REFRELE will assure a clean freeing 1241 * of resources. 1242 */ 1243 FLOW_REFHOLD(flent); 1244 1245 flent->fe_link_id = linkid; 1246 FLOW_MARK(flent, FE_INCIPIENT); 1247 1248 err = mac_perim_enter_by_linkid(linkid, &mph); 1249 if (err != 0) { 1250 FLOW_FINAL_REFRELE(flent); 1251 return (err); 1252 } 1253 1254 /* 1255 * dls will eventually be merged with mac so it's ok 1256 * to call dls' internal functions. 1257 */ 1258 err = dls_devnet_hold_link(linkid, &dlh, &dlp); 1259 if (err != 0) 1260 goto bail; 1261 1262 link_held = B_TRUE; 1263 1264 /* 1265 * Add the flow to the global flow table, this table will be per 1266 * exclusive zone so each zone can have its own flow namespace. 1267 * RFE 6625651 will fix this. 1268 * 1269 */ 1270 if ((err = mac_flow_hash_add(flent)) != 0) 1271 goto bail; 1272 1273 hash_added = B_TRUE; 1274 1275 /* 1276 * do not allow flows to be configured on an anchor VNIC 1277 */ 1278 if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) { 1279 err = ENOTSUP; 1280 goto bail; 1281 } 1282 1283 /* 1284 * Add the subflow to the subflow table. Also instantiate the flow 1285 * in the mac if there is an active user (we check if the MAC client's 1286 * datapath has been setup). 1287 */ 1288 err = mac_flow_add_subflow(dlp->dl_mch, flent, 1289 MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch)); 1290 if (err != 0) 1291 goto bail; 1292 1293 FLOW_UNMARK(flent, FE_INCIPIENT); 1294 dls_devnet_rele_link(dlh, dlp); 1295 mac_perim_exit(mph); 1296 return (0); 1297 1298 bail: 1299 if (hash_added) 1300 mac_flow_hash_remove(flent); 1301 1302 if (link_held) 1303 dls_devnet_rele_link(dlh, dlp); 1304 1305 /* 1306 * Wait for any transient global flow hash refs to clear 1307 * and then release the creation reference on the flow 1308 */ 1309 mac_flow_wait(flent, FLOW_USER_REF); 1310 FLOW_FINAL_REFRELE(flent); 1311 mac_perim_exit(mph); 1312 return (err); 1313 } 1314 1315 /* 1316 * mac_link_flow_clean() 1317 * Internal flow interface used for freeing SRSs and related 1318 * data structures. Not meant to be used by mac clients. 1319 */ 1320 void 1321 mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow) 1322 { 1323 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 1324 mac_impl_t *mip = mcip->mci_mip; 1325 boolean_t last_subflow; 1326 1327 ASSERT(mch != NULL); 1328 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); 1329 1330 /* 1331 * This sub flow entry may fail to be fully initialized by 1332 * mac_link_flow_init(). If so, simply return. 1333 */ 1334 if (sub_flow->fe_mcip == NULL) 1335 return; 1336 1337 last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab); 1338 /* 1339 * Tear down the data path 1340 */ 1341 mac_datapath_teardown(mcip, sub_flow, SRST_FLOW); 1342 sub_flow->fe_mcip = NULL; 1343 1344 /* 1345 * Delete the SRSs associated with this subflow. If this is being 1346 * driven by flowadm(1M) then the subflow will be deleted by 1347 * dls_rem_flow. However if this is a result of the interface being 1348 * unplumbed then the subflow itself won't be deleted. 1349 */ 1350 mac_flow_cleanup(sub_flow); 1351 1352 /* 1353 * If all the subflows are gone, renable some of the stuff 1354 * we disabled when adding a subflow, polling etc. 1355 */ 1356 if (last_subflow) { 1357 /* 1358 * The subflow table itself is not protected by any locks or 1359 * refcnts. Hence quiesce the client upfront before clearing 1360 * mci_subflow_tab. 1361 */ 1362 mac_client_quiesce(mcip); 1363 mac_client_update_classifier(mcip, B_FALSE); 1364 mac_flow_tab_destroy(mcip->mci_subflow_tab); 1365 mcip->mci_subflow_tab = NULL; 1366 mac_client_restart(mcip); 1367 } 1368 } 1369 1370 /* 1371 * mac_link_flow_remove() 1372 * Used by flowadm(1m) or kernel mac clients for removing flows. 1373 */ 1374 int 1375 mac_link_flow_remove(char *flow_name) 1376 { 1377 flow_entry_t *flent; 1378 mac_perim_handle_t mph; 1379 int err; 1380 datalink_id_t linkid; 1381 1382 err = mac_flow_lookup_byname(flow_name, &flent); 1383 if (err != 0) 1384 return (err); 1385 1386 linkid = flent->fe_link_id; 1387 FLOW_USER_REFRELE(flent); 1388 1389 /* 1390 * The perim must be acquired before acquiring any other references 1391 * to maintain the lock and perimeter hierarchy. Please note the 1392 * FLOW_REFRELE above. 1393 */ 1394 err = mac_perim_enter_by_linkid(linkid, &mph); 1395 if (err != 0) 1396 return (err); 1397 1398 /* 1399 * Note the second lookup of the flow, because a concurrent thread 1400 * may have removed it already while we were waiting to enter the 1401 * link's perimeter. 1402 */ 1403 err = mac_flow_lookup_byname(flow_name, &flent); 1404 if (err != 0) { 1405 mac_perim_exit(mph); 1406 return (err); 1407 } 1408 FLOW_USER_REFRELE(flent); 1409 1410 /* 1411 * Remove the flow from the subflow table and deactivate the flow 1412 * by quiescing and removings its SRSs 1413 */ 1414 mac_flow_rem_subflow(flent); 1415 1416 /* 1417 * Finally, remove the flow from the global table. 1418 */ 1419 mac_flow_hash_remove(flent); 1420 1421 /* 1422 * Wait for any transient global flow hash refs to clear 1423 * and then release the creation reference on the flow 1424 */ 1425 mac_flow_wait(flent, FLOW_USER_REF); 1426 FLOW_FINAL_REFRELE(flent); 1427 1428 mac_perim_exit(mph); 1429 1430 return (0); 1431 } 1432 1433 /* 1434 * mac_link_flow_modify() 1435 * Modifies the properties of a flow identified by its name. 1436 */ 1437 int 1438 mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp) 1439 { 1440 flow_entry_t *flent; 1441 mac_client_impl_t *mcip; 1442 int err = 0; 1443 mac_perim_handle_t mph; 1444 datalink_id_t linkid; 1445 flow_tab_t *flow_tab; 1446 1447 err = mac_validate_props(NULL, mrp); 1448 if (err != 0) 1449 return (err); 1450 1451 err = mac_flow_lookup_byname(flow_name, &flent); 1452 if (err != 0) 1453 return (err); 1454 1455 linkid = flent->fe_link_id; 1456 FLOW_USER_REFRELE(flent); 1457 1458 /* 1459 * The perim must be acquired before acquiring any other references 1460 * to maintain the lock and perimeter hierarchy. Please note the 1461 * FLOW_REFRELE above. 1462 */ 1463 err = mac_perim_enter_by_linkid(linkid, &mph); 1464 if (err != 0) 1465 return (err); 1466 1467 /* 1468 * Note the second lookup of the flow, because a concurrent thread 1469 * may have removed it already while we were waiting to enter the 1470 * link's perimeter. 1471 */ 1472 err = mac_flow_lookup_byname(flow_name, &flent); 1473 if (err != 0) { 1474 mac_perim_exit(mph); 1475 return (err); 1476 } 1477 FLOW_USER_REFRELE(flent); 1478 1479 /* 1480 * If this flow is attached to a MAC client, then pass the request 1481 * along to the client. 1482 * Otherwise, just update the cached values. 1483 */ 1484 mcip = flent->fe_mcip; 1485 mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE); 1486 if (mcip != NULL) { 1487 if ((flow_tab = mcip->mci_subflow_tab) == NULL) { 1488 err = ENOENT; 1489 } else { 1490 mac_flow_modify(flow_tab, flent, mrp); 1491 } 1492 } else { 1493 (void) mac_flow_modify_props(flent, mrp); 1494 } 1495 1496 done: 1497 mac_perim_exit(mph); 1498 return (err); 1499 } 1500 1501 1502 /* 1503 * State structure and misc functions used by mac_link_flow_walk(). 1504 */ 1505 typedef struct { 1506 int (*ws_func)(mac_flowinfo_t *, void *); 1507 void *ws_arg; 1508 } flow_walk_state_t; 1509 1510 static void 1511 mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent) 1512 { 1513 (void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name, 1514 MAXFLOWNAMELEN); 1515 finfop->fi_link_id = flent->fe_link_id; 1516 finfop->fi_flow_desc = flent->fe_flow_desc; 1517 finfop->fi_resource_props = flent->fe_resource_props; 1518 } 1519 1520 static int 1521 mac_link_flow_walk_cb(flow_entry_t *flent, void *arg) 1522 { 1523 flow_walk_state_t *statep = arg; 1524 mac_flowinfo_t *finfo; 1525 int err; 1526 1527 finfo = kmem_zalloc(sizeof (*finfo), KM_SLEEP); 1528 mac_link_flowinfo_copy(finfo, flent); 1529 err = statep->ws_func(finfo, statep->ws_arg); 1530 kmem_free(finfo, sizeof (*finfo)); 1531 return (err); 1532 } 1533 1534 /* 1535 * mac_link_flow_walk() 1536 * Invokes callback 'func' for all flows belonging to the specified link. 1537 */ 1538 int 1539 mac_link_flow_walk(datalink_id_t linkid, 1540 int (*func)(mac_flowinfo_t *, void *), void *arg) 1541 { 1542 mac_client_impl_t *mcip; 1543 mac_perim_handle_t mph; 1544 flow_walk_state_t state; 1545 dls_dl_handle_t dlh; 1546 dls_link_t *dlp; 1547 int err; 1548 1549 err = mac_perim_enter_by_linkid(linkid, &mph); 1550 if (err != 0) 1551 return (err); 1552 1553 err = dls_devnet_hold_link(linkid, &dlh, &dlp); 1554 if (err != 0) { 1555 mac_perim_exit(mph); 1556 return (err); 1557 } 1558 1559 mcip = (mac_client_impl_t *)dlp->dl_mch; 1560 state.ws_func = func; 1561 state.ws_arg = arg; 1562 1563 err = mac_flow_walk_nolock(mcip->mci_subflow_tab, 1564 mac_link_flow_walk_cb, &state); 1565 1566 dls_devnet_rele_link(dlh, dlp); 1567 mac_perim_exit(mph); 1568 return (err); 1569 } 1570 1571 /* 1572 * mac_link_flow_info() 1573 * Retrieves information about a specific flow. 1574 */ 1575 int 1576 mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo) 1577 { 1578 flow_entry_t *flent; 1579 int err; 1580 1581 err = mac_flow_lookup_byname(flow_name, &flent); 1582 if (err != 0) 1583 return (err); 1584 1585 mac_link_flowinfo_copy(finfo, flent); 1586 FLOW_USER_REFRELE(flent); 1587 return (0); 1588 } 1589 1590 /* 1591 * Hash function macro that takes an Ethernet address and VLAN id as input. 1592 */ 1593 #define HASH_ETHER_VID(a, v, s) \ 1594 ((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s)) 1595 1596 /* 1597 * Generic layer-2 address hashing function that takes an address and address 1598 * length as input. This is the DJB hash function. 1599 */ 1600 static uint32_t 1601 flow_l2_addrhash(uint8_t *addr, size_t addrlen, size_t htsize) 1602 { 1603 uint32_t hash = 5381; 1604 size_t i; 1605 1606 for (i = 0; i < addrlen; i++) 1607 hash = ((hash << 5) + hash) + addr[i]; 1608 return (hash % htsize); 1609 } 1610 1611 #define PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end)) 1612 1613 #define CHECK_AND_ADJUST_START_PTR(s, start) { \ 1614 if ((s)->fs_mp->b_wptr == (start)) { \ 1615 mblk_t *next = (s)->fs_mp->b_cont; \ 1616 if (next == NULL) \ 1617 return (EINVAL); \ 1618 \ 1619 (s)->fs_mp = next; \ 1620 (start) = next->b_rptr; \ 1621 } \ 1622 } 1623 1624 /* ARGSUSED */ 1625 static boolean_t 1626 flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1627 { 1628 flow_l2info_t *l2 = &s->fs_l2info; 1629 flow_desc_t *fd = &flent->fe_flow_desc; 1630 1631 return (l2->l2_vid == fd->fd_vid && 1632 bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0); 1633 } 1634 1635 /* 1636 * Layer 2 hash function. 1637 * Must be paired with flow_l2_accept() within a set of flow_ops 1638 * because it assumes the dest address is already extracted. 1639 */ 1640 static uint32_t 1641 flow_l2_hash(flow_tab_t *ft, flow_state_t *s) 1642 { 1643 return (flow_l2_addrhash(s->fs_l2info.l2_daddr, 1644 ft->ft_mip->mi_type->mt_addr_length, ft->ft_size)); 1645 } 1646 1647 /* 1648 * This is the generic layer 2 accept function. 1649 * It makes use of mac_header_info() to extract the header length, 1650 * sap, vlan ID and destination address. 1651 */ 1652 static int 1653 flow_l2_accept(flow_tab_t *ft, flow_state_t *s) 1654 { 1655 boolean_t is_ether; 1656 flow_l2info_t *l2 = &s->fs_l2info; 1657 mac_header_info_t mhi; 1658 int err; 1659 1660 is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER); 1661 if ((err = mac_header_info((mac_handle_t)ft->ft_mip, 1662 s->fs_mp, &mhi)) != 0) { 1663 if (err == EINVAL) 1664 err = ENOBUFS; 1665 1666 return (err); 1667 } 1668 1669 l2->l2_start = s->fs_mp->b_rptr; 1670 l2->l2_daddr = (uint8_t *)mhi.mhi_daddr; 1671 1672 if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN && 1673 ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) { 1674 struct ether_vlan_header *evhp = 1675 (struct ether_vlan_header *)l2->l2_start; 1676 1677 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp))) 1678 return (ENOBUFS); 1679 1680 l2->l2_sap = ntohs(evhp->ether_type); 1681 l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci)); 1682 l2->l2_hdrsize = sizeof (*evhp); 1683 } else { 1684 l2->l2_sap = mhi.mhi_bindsap; 1685 l2->l2_vid = 0; 1686 l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize; 1687 } 1688 return (0); 1689 } 1690 1691 /* 1692 * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/ 1693 * accept(). The notable difference is that dest address is now extracted 1694 * by hash() rather than by accept(). This saves a few memory references 1695 * for flow tables that do not care about mac addresses. 1696 */ 1697 static uint32_t 1698 flow_ether_hash(flow_tab_t *ft, flow_state_t *s) 1699 { 1700 flow_l2info_t *l2 = &s->fs_l2info; 1701 struct ether_vlan_header *evhp; 1702 1703 evhp = (struct ether_vlan_header *)l2->l2_start; 1704 l2->l2_daddr = evhp->ether_dhost.ether_addr_octet; 1705 return (HASH_ETHER_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size)); 1706 } 1707 1708 static uint32_t 1709 flow_ether_hash_fe(flow_tab_t *ft, flow_entry_t *flent) 1710 { 1711 flow_desc_t *fd = &flent->fe_flow_desc; 1712 1713 ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0); 1714 return (HASH_ETHER_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size)); 1715 } 1716 1717 /* ARGSUSED */ 1718 static int 1719 flow_ether_accept(flow_tab_t *ft, flow_state_t *s) 1720 { 1721 flow_l2info_t *l2 = &s->fs_l2info; 1722 struct ether_vlan_header *evhp; 1723 uint16_t sap; 1724 1725 evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr; 1726 l2->l2_start = (uchar_t *)evhp; 1727 1728 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header))) 1729 return (ENOBUFS); 1730 1731 if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN && 1732 ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) { 1733 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp))) 1734 return (ENOBUFS); 1735 1736 l2->l2_sap = ntohs(evhp->ether_type); 1737 l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci)); 1738 l2->l2_hdrsize = sizeof (struct ether_vlan_header); 1739 } else { 1740 l2->l2_sap = sap; 1741 l2->l2_vid = 0; 1742 l2->l2_hdrsize = sizeof (struct ether_header); 1743 } 1744 return (0); 1745 } 1746 1747 /* 1748 * Validates a layer 2 flow entry. 1749 */ 1750 static int 1751 flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent) 1752 { 1753 flow_desc_t *fd = &flent->fe_flow_desc; 1754 1755 /* 1756 * Dest address is mandatory, and 0 length addresses are not yet 1757 * supported. 1758 */ 1759 if ((fd->fd_mask & FLOW_LINK_DST) == 0 || fd->fd_mac_len == 0) 1760 return (EINVAL); 1761 1762 if ((fd->fd_mask & FLOW_LINK_VID) != 0) { 1763 /* 1764 * VLAN flows are only supported over ethernet macs. 1765 */ 1766 if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER) 1767 return (EINVAL); 1768 1769 if (fd->fd_vid == 0) 1770 return (EINVAL); 1771 1772 } 1773 flent->fe_match = flow_l2_match; 1774 return (0); 1775 } 1776 1777 /* 1778 * Calculates hash index of flow entry. 1779 */ 1780 static uint32_t 1781 flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent) 1782 { 1783 flow_desc_t *fd = &flent->fe_flow_desc; 1784 1785 ASSERT((fd->fd_mask & FLOW_LINK_VID) == 0 && fd->fd_vid == 0); 1786 return (flow_l2_addrhash(fd->fd_dst_mac, 1787 ft->ft_mip->mi_type->mt_addr_length, ft->ft_size)); 1788 } 1789 1790 /* 1791 * This is used for duplicate flow checking. 1792 */ 1793 /* ARGSUSED */ 1794 static boolean_t 1795 flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) 1796 { 1797 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; 1798 1799 ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0); 1800 return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac, 1801 fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid); 1802 } 1803 1804 /* 1805 * Generic flow entry insertion function. 1806 * Used by flow tables that do not have ordering requirements. 1807 */ 1808 /* ARGSUSED */ 1809 static int 1810 flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp, 1811 flow_entry_t *flent) 1812 { 1813 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 1814 1815 if (*headp != NULL) { 1816 ASSERT(flent->fe_next == NULL); 1817 flent->fe_next = *headp; 1818 } 1819 *headp = flent; 1820 return (0); 1821 } 1822 1823 /* 1824 * IP version independent DSField matching function. 1825 */ 1826 /* ARGSUSED */ 1827 static boolean_t 1828 flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1829 { 1830 flow_l3info_t *l3info = &s->fs_l3info; 1831 flow_desc_t *fd = &flent->fe_flow_desc; 1832 1833 switch (l3info->l3_version) { 1834 case IPV4_VERSION: { 1835 ipha_t *ipha = (ipha_t *)l3info->l3_start; 1836 1837 return ((ipha->ipha_type_of_service & 1838 fd->fd_dsfield_mask) == fd->fd_dsfield); 1839 } 1840 case IPV6_VERSION: { 1841 ip6_t *ip6h = (ip6_t *)l3info->l3_start; 1842 1843 return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) & 1844 fd->fd_dsfield_mask) == fd->fd_dsfield); 1845 } 1846 default: 1847 return (B_FALSE); 1848 } 1849 } 1850 1851 /* 1852 * IP v4 and v6 address matching. 1853 * The netmask only needs to be applied on the packet but not on the 1854 * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets. 1855 */ 1856 1857 /* ARGSUSED */ 1858 static boolean_t 1859 flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1860 { 1861 flow_l3info_t *l3info = &s->fs_l3info; 1862 flow_desc_t *fd = &flent->fe_flow_desc; 1863 ipha_t *ipha = (ipha_t *)l3info->l3_start; 1864 in_addr_t addr; 1865 1866 addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src); 1867 if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) { 1868 return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) == 1869 V4_PART_OF_V6(fd->fd_local_addr)); 1870 } 1871 return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) == 1872 V4_PART_OF_V6(fd->fd_remote_addr)); 1873 } 1874 1875 /* ARGSUSED */ 1876 static boolean_t 1877 flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1878 { 1879 flow_l3info_t *l3info = &s->fs_l3info; 1880 flow_desc_t *fd = &flent->fe_flow_desc; 1881 ip6_t *ip6h = (ip6_t *)l3info->l3_start; 1882 in6_addr_t *addrp; 1883 1884 addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src); 1885 if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) { 1886 return (V6_MASK_EQ(*addrp, fd->fd_local_netmask, 1887 fd->fd_local_addr)); 1888 } 1889 return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr)); 1890 } 1891 1892 /* ARGSUSED */ 1893 static boolean_t 1894 flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 1895 { 1896 flow_l3info_t *l3info = &s->fs_l3info; 1897 flow_desc_t *fd = &flent->fe_flow_desc; 1898 1899 return (l3info->l3_protocol == fd->fd_protocol); 1900 } 1901 1902 static uint32_t 1903 flow_ip_hash(flow_tab_t *ft, flow_state_t *s) 1904 { 1905 flow_l3info_t *l3info = &s->fs_l3info; 1906 flow_mask_t mask = ft->ft_mask; 1907 1908 if ((mask & FLOW_IP_LOCAL) != 0) { 1909 l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0); 1910 } else if ((mask & FLOW_IP_REMOTE) != 0) { 1911 l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0); 1912 } else if ((mask & FLOW_IP_DSFIELD) != 0) { 1913 /* 1914 * DSField flents are arranged as a single list. 1915 */ 1916 return (0); 1917 } 1918 /* 1919 * IP addr flents are hashed into two lists, v4 or v6. 1920 */ 1921 ASSERT(ft->ft_size >= 2); 1922 return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1); 1923 } 1924 1925 static uint32_t 1926 flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s) 1927 { 1928 flow_l3info_t *l3info = &s->fs_l3info; 1929 1930 return (l3info->l3_protocol % ft->ft_size); 1931 } 1932 1933 /* ARGSUSED */ 1934 static int 1935 flow_ip_accept(flow_tab_t *ft, flow_state_t *s) 1936 { 1937 flow_l2info_t *l2info = &s->fs_l2info; 1938 flow_l3info_t *l3info = &s->fs_l3info; 1939 uint16_t sap = l2info->l2_sap; 1940 uchar_t *l3_start; 1941 1942 l3_start = l2info->l2_start + l2info->l2_hdrsize; 1943 1944 /* 1945 * Adjust start pointer if we're at the end of an mblk. 1946 */ 1947 CHECK_AND_ADJUST_START_PTR(s, l3_start); 1948 1949 l3info->l3_start = l3_start; 1950 if (!OK_32PTR(l3_start)) 1951 return (EINVAL); 1952 1953 switch (sap) { 1954 case ETHERTYPE_IP: { 1955 ipha_t *ipha = (ipha_t *)l3_start; 1956 1957 if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH)) 1958 return (ENOBUFS); 1959 1960 l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha); 1961 l3info->l3_protocol = ipha->ipha_protocol; 1962 l3info->l3_version = IPV4_VERSION; 1963 l3info->l3_fragmented = 1964 IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags); 1965 break; 1966 } 1967 case ETHERTYPE_IPV6: { 1968 ip6_t *ip6h = (ip6_t *)l3_start; 1969 ip6_frag_t *frag = NULL; 1970 uint16_t ip6_hdrlen; 1971 uint8_t nexthdr; 1972 1973 if (!mac_ip_hdr_length_v6(ip6h, s->fs_mp->b_wptr, &ip6_hdrlen, 1974 &nexthdr, &frag)) { 1975 return (ENOBUFS); 1976 } 1977 l3info->l3_hdrsize = ip6_hdrlen; 1978 l3info->l3_protocol = nexthdr; 1979 l3info->l3_version = IPV6_VERSION; 1980 l3info->l3_fragmented = (frag != NULL); 1981 break; 1982 } 1983 default: 1984 return (EINVAL); 1985 } 1986 return (0); 1987 } 1988 1989 /* ARGSUSED */ 1990 static int 1991 flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent) 1992 { 1993 flow_desc_t *fd = &flent->fe_flow_desc; 1994 1995 switch (fd->fd_protocol) { 1996 case IPPROTO_TCP: 1997 case IPPROTO_UDP: 1998 case IPPROTO_SCTP: 1999 case IPPROTO_ICMP: 2000 case IPPROTO_ICMPV6: 2001 flent->fe_match = flow_ip_proto_match; 2002 return (0); 2003 default: 2004 return (EINVAL); 2005 } 2006 } 2007 2008 /* ARGSUSED */ 2009 static int 2010 flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent) 2011 { 2012 flow_desc_t *fd = &flent->fe_flow_desc; 2013 flow_mask_t mask; 2014 uint8_t version; 2015 in6_addr_t *addr, *netmask; 2016 2017 /* 2018 * DSField does not require a IP version. 2019 */ 2020 if (fd->fd_mask == FLOW_IP_DSFIELD) { 2021 if (fd->fd_dsfield_mask == 0) 2022 return (EINVAL); 2023 2024 flent->fe_match = flow_ip_dsfield_match; 2025 return (0); 2026 } 2027 2028 /* 2029 * IP addresses must come with a version to avoid ambiguity. 2030 */ 2031 if ((fd->fd_mask & FLOW_IP_VERSION) == 0) 2032 return (EINVAL); 2033 2034 version = fd->fd_ipversion; 2035 if (version != IPV4_VERSION && version != IPV6_VERSION) 2036 return (EINVAL); 2037 2038 mask = fd->fd_mask & ~FLOW_IP_VERSION; 2039 switch (mask) { 2040 case FLOW_IP_LOCAL: 2041 addr = &fd->fd_local_addr; 2042 netmask = &fd->fd_local_netmask; 2043 break; 2044 case FLOW_IP_REMOTE: 2045 addr = &fd->fd_remote_addr; 2046 netmask = &fd->fd_remote_netmask; 2047 break; 2048 default: 2049 return (EINVAL); 2050 } 2051 2052 /* 2053 * Apply netmask onto specified address. 2054 */ 2055 V6_MASK_COPY(*addr, *netmask, *addr); 2056 if (version == IPV4_VERSION) { 2057 ipaddr_t v4addr = V4_PART_OF_V6((*addr)); 2058 ipaddr_t v4mask = V4_PART_OF_V6((*netmask)); 2059 2060 if (v4addr == 0 || v4mask == 0) 2061 return (EINVAL); 2062 flent->fe_match = flow_ip_v4_match; 2063 } else { 2064 if (IN6_IS_ADDR_UNSPECIFIED(addr) || 2065 IN6_IS_ADDR_UNSPECIFIED(netmask)) 2066 return (EINVAL); 2067 flent->fe_match = flow_ip_v6_match; 2068 } 2069 return (0); 2070 } 2071 2072 static uint32_t 2073 flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent) 2074 { 2075 flow_desc_t *fd = &flent->fe_flow_desc; 2076 2077 return (fd->fd_protocol % ft->ft_size); 2078 } 2079 2080 static uint32_t 2081 flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent) 2082 { 2083 flow_desc_t *fd = &flent->fe_flow_desc; 2084 2085 /* 2086 * DSField flents are arranged as a single list. 2087 */ 2088 if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0) 2089 return (0); 2090 2091 /* 2092 * IP addr flents are hashed into two lists, v4 or v6. 2093 */ 2094 ASSERT(ft->ft_size >= 2); 2095 return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1); 2096 } 2097 2098 /* ARGSUSED */ 2099 static boolean_t 2100 flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) 2101 { 2102 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; 2103 2104 return (fd1->fd_protocol == fd2->fd_protocol); 2105 } 2106 2107 /* ARGSUSED */ 2108 static boolean_t 2109 flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) 2110 { 2111 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; 2112 in6_addr_t *a1, *m1, *a2, *m2; 2113 2114 ASSERT(fd1->fd_mask == fd2->fd_mask); 2115 if (fd1->fd_mask == FLOW_IP_DSFIELD) { 2116 return (fd1->fd_dsfield == fd2->fd_dsfield && 2117 fd1->fd_dsfield_mask == fd2->fd_dsfield_mask); 2118 } 2119 2120 /* 2121 * flow_ip_accept_fe() already validated the version. 2122 */ 2123 ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0); 2124 if (fd1->fd_ipversion != fd2->fd_ipversion) 2125 return (B_FALSE); 2126 2127 switch (fd1->fd_mask & ~FLOW_IP_VERSION) { 2128 case FLOW_IP_LOCAL: 2129 a1 = &fd1->fd_local_addr; 2130 m1 = &fd1->fd_local_netmask; 2131 a2 = &fd2->fd_local_addr; 2132 m2 = &fd2->fd_local_netmask; 2133 break; 2134 case FLOW_IP_REMOTE: 2135 a1 = &fd1->fd_remote_addr; 2136 m1 = &fd1->fd_remote_netmask; 2137 a2 = &fd2->fd_remote_addr; 2138 m2 = &fd2->fd_remote_netmask; 2139 break; 2140 default: 2141 /* 2142 * This is unreachable given the checks in 2143 * flow_ip_accept_fe(). 2144 */ 2145 return (B_FALSE); 2146 } 2147 2148 if (fd1->fd_ipversion == IPV4_VERSION) { 2149 return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) && 2150 V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2))); 2151 2152 } else { 2153 return (IN6_ARE_ADDR_EQUAL(a1, a2) && 2154 IN6_ARE_ADDR_EQUAL(m1, m2)); 2155 } 2156 } 2157 2158 static int 2159 flow_ip_mask2plen(in6_addr_t *v6mask) 2160 { 2161 int bits; 2162 int plen = IPV6_ABITS; 2163 int i; 2164 2165 for (i = 3; i >= 0; i--) { 2166 if (v6mask->s6_addr32[i] == 0) { 2167 plen -= 32; 2168 continue; 2169 } 2170 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1; 2171 if (bits == 0) 2172 break; 2173 plen -= bits; 2174 } 2175 return (plen); 2176 } 2177 2178 /* ARGSUSED */ 2179 static int 2180 flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp, 2181 flow_entry_t *flent) 2182 { 2183 flow_entry_t **p = headp; 2184 flow_desc_t *fd0, *fd; 2185 in6_addr_t *m0, *m; 2186 int plen0, plen; 2187 2188 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); 2189 2190 /* 2191 * No special ordering needed for dsfield. 2192 */ 2193 fd0 = &flent->fe_flow_desc; 2194 if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) { 2195 if (*p != NULL) { 2196 ASSERT(flent->fe_next == NULL); 2197 flent->fe_next = *p; 2198 } 2199 *p = flent; 2200 return (0); 2201 } 2202 2203 /* 2204 * IP address flows are arranged in descending prefix length order. 2205 */ 2206 m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ? 2207 &fd0->fd_local_netmask : &fd0->fd_remote_netmask; 2208 plen0 = flow_ip_mask2plen(m0); 2209 ASSERT(plen0 != 0); 2210 2211 for (; *p != NULL; p = &(*p)->fe_next) { 2212 fd = &(*p)->fe_flow_desc; 2213 2214 /* 2215 * Normally a dsfield flent shouldn't end up on the same 2216 * list as an IP address because flow tables are (for now) 2217 * disjoint. If we decide to support both IP and dsfield 2218 * in the same table in the future, this check will allow 2219 * for that. 2220 */ 2221 if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0) 2222 continue; 2223 2224 /* 2225 * We also allow for the mixing of local and remote address 2226 * flents within one list. 2227 */ 2228 m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ? 2229 &fd->fd_local_netmask : &fd->fd_remote_netmask; 2230 plen = flow_ip_mask2plen(m); 2231 2232 if (plen <= plen0) 2233 break; 2234 } 2235 if (*p != NULL) { 2236 ASSERT(flent->fe_next == NULL); 2237 flent->fe_next = *p; 2238 } 2239 *p = flent; 2240 return (0); 2241 } 2242 2243 /* 2244 * Transport layer protocol and port matching functions. 2245 */ 2246 2247 /* ARGSUSED */ 2248 static boolean_t 2249 flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 2250 { 2251 flow_l3info_t *l3info = &s->fs_l3info; 2252 flow_l4info_t *l4info = &s->fs_l4info; 2253 flow_desc_t *fd = &flent->fe_flow_desc; 2254 2255 return (fd->fd_protocol == l3info->l3_protocol && 2256 fd->fd_local_port == l4info->l4_hash_port); 2257 } 2258 2259 /* ARGSUSED */ 2260 static boolean_t 2261 flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) 2262 { 2263 flow_l3info_t *l3info = &s->fs_l3info; 2264 flow_l4info_t *l4info = &s->fs_l4info; 2265 flow_desc_t *fd = &flent->fe_flow_desc; 2266 2267 return (fd->fd_protocol == l3info->l3_protocol && 2268 fd->fd_remote_port == l4info->l4_hash_port); 2269 } 2270 2271 /* 2272 * Transport hash function. 2273 * Since we only support either local or remote port flows, 2274 * we only need to extract one of the ports to be used for 2275 * matching. 2276 */ 2277 static uint32_t 2278 flow_transport_hash(flow_tab_t *ft, flow_state_t *s) 2279 { 2280 flow_l3info_t *l3info = &s->fs_l3info; 2281 flow_l4info_t *l4info = &s->fs_l4info; 2282 uint8_t proto = l3info->l3_protocol; 2283 boolean_t dst_or_src; 2284 2285 if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) { 2286 dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0); 2287 } else { 2288 dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0); 2289 } 2290 2291 l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port : 2292 l4info->l4_src_port; 2293 2294 return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size); 2295 } 2296 2297 /* 2298 * Unlike other accept() functions above, we do not need to get the header 2299 * size because this is our highest layer so far. If we want to do support 2300 * other higher layer protocols, we would need to save the l4_hdrsize 2301 * in the code below. 2302 */ 2303 2304 /* ARGSUSED */ 2305 static int 2306 flow_transport_accept(flow_tab_t *ft, flow_state_t *s) 2307 { 2308 flow_l3info_t *l3info = &s->fs_l3info; 2309 flow_l4info_t *l4info = &s->fs_l4info; 2310 uint8_t proto = l3info->l3_protocol; 2311 uchar_t *l4_start; 2312 2313 l4_start = l3info->l3_start + l3info->l3_hdrsize; 2314 2315 /* 2316 * Adjust start pointer if we're at the end of an mblk. 2317 */ 2318 CHECK_AND_ADJUST_START_PTR(s, l4_start); 2319 2320 l4info->l4_start = l4_start; 2321 if (!OK_32PTR(l4_start)) 2322 return (EINVAL); 2323 2324 if (l3info->l3_fragmented == B_TRUE) 2325 return (EINVAL); 2326 2327 switch (proto) { 2328 case IPPROTO_TCP: { 2329 struct tcphdr *tcph = (struct tcphdr *)l4_start; 2330 2331 if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph))) 2332 return (ENOBUFS); 2333 2334 l4info->l4_src_port = tcph->th_sport; 2335 l4info->l4_dst_port = tcph->th_dport; 2336 break; 2337 } 2338 case IPPROTO_UDP: { 2339 struct udphdr *udph = (struct udphdr *)l4_start; 2340 2341 if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph))) 2342 return (ENOBUFS); 2343 2344 l4info->l4_src_port = udph->uh_sport; 2345 l4info->l4_dst_port = udph->uh_dport; 2346 break; 2347 } 2348 case IPPROTO_SCTP: { 2349 sctp_hdr_t *sctph = (sctp_hdr_t *)l4_start; 2350 2351 if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph))) 2352 return (ENOBUFS); 2353 2354 l4info->l4_src_port = sctph->sh_sport; 2355 l4info->l4_dst_port = sctph->sh_dport; 2356 break; 2357 } 2358 default: 2359 return (EINVAL); 2360 } 2361 2362 return (0); 2363 } 2364 2365 /* 2366 * Validates transport flow entry. 2367 * The protocol field must be present. 2368 */ 2369 2370 /* ARGSUSED */ 2371 static int 2372 flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent) 2373 { 2374 flow_desc_t *fd = &flent->fe_flow_desc; 2375 flow_mask_t mask = fd->fd_mask; 2376 2377 if ((mask & FLOW_IP_PROTOCOL) == 0) 2378 return (EINVAL); 2379 2380 switch (fd->fd_protocol) { 2381 case IPPROTO_TCP: 2382 case IPPROTO_UDP: 2383 case IPPROTO_SCTP: 2384 break; 2385 default: 2386 return (EINVAL); 2387 } 2388 2389 switch (mask & ~FLOW_IP_PROTOCOL) { 2390 case FLOW_ULP_PORT_LOCAL: 2391 if (fd->fd_local_port == 0) 2392 return (EINVAL); 2393 2394 flent->fe_match = flow_transport_lport_match; 2395 break; 2396 case FLOW_ULP_PORT_REMOTE: 2397 if (fd->fd_remote_port == 0) 2398 return (EINVAL); 2399 2400 flent->fe_match = flow_transport_rport_match; 2401 break; 2402 case 0: 2403 /* 2404 * transport-only flows conflicts with our table type. 2405 */ 2406 return (EOPNOTSUPP); 2407 default: 2408 return (EINVAL); 2409 } 2410 2411 return (0); 2412 } 2413 2414 static uint32_t 2415 flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent) 2416 { 2417 flow_desc_t *fd = &flent->fe_flow_desc; 2418 uint16_t port = 0; 2419 2420 port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ? 2421 fd->fd_local_port : fd->fd_remote_port; 2422 2423 return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size); 2424 } 2425 2426 /* ARGSUSED */ 2427 static boolean_t 2428 flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) 2429 { 2430 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; 2431 2432 if (fd1->fd_protocol != fd2->fd_protocol) 2433 return (B_FALSE); 2434 2435 if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) 2436 return (fd1->fd_local_port == fd2->fd_local_port); 2437 2438 if ((fd1->fd_mask & FLOW_ULP_PORT_REMOTE) != 0) 2439 return (fd1->fd_remote_port == fd2->fd_remote_port); 2440 2441 return (B_TRUE); 2442 } 2443 2444 static flow_ops_t flow_l2_ops = { 2445 flow_l2_accept_fe, 2446 flow_l2_hash_fe, 2447 flow_l2_match_fe, 2448 flow_generic_insert_fe, 2449 flow_l2_hash, 2450 {flow_l2_accept} 2451 }; 2452 2453 static flow_ops_t flow_ip_ops = { 2454 flow_ip_accept_fe, 2455 flow_ip_hash_fe, 2456 flow_ip_match_fe, 2457 flow_ip_insert_fe, 2458 flow_ip_hash, 2459 {flow_l2_accept, flow_ip_accept} 2460 }; 2461 2462 static flow_ops_t flow_ip_proto_ops = { 2463 flow_ip_proto_accept_fe, 2464 flow_ip_proto_hash_fe, 2465 flow_ip_proto_match_fe, 2466 flow_generic_insert_fe, 2467 flow_ip_proto_hash, 2468 {flow_l2_accept, flow_ip_accept} 2469 }; 2470 2471 static flow_ops_t flow_transport_ops = { 2472 flow_transport_accept_fe, 2473 flow_transport_hash_fe, 2474 flow_transport_match_fe, 2475 flow_generic_insert_fe, 2476 flow_transport_hash, 2477 {flow_l2_accept, flow_ip_accept, flow_transport_accept} 2478 }; 2479 2480 static flow_tab_info_t flow_tab_info_list[] = { 2481 {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2}, 2482 {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2}, 2483 {&flow_ip_ops, FLOW_IP_DSFIELD, 1}, 2484 {&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256}, 2485 {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024}, 2486 {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_REMOTE, 1024} 2487 }; 2488 2489 #define FLOW_MAX_TAB_INFO \ 2490 ((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t)) 2491 2492 static flow_tab_info_t * 2493 mac_flow_tab_info_get(flow_mask_t mask) 2494 { 2495 int i; 2496 2497 for (i = 0; i < FLOW_MAX_TAB_INFO; i++) { 2498 if (mask == flow_tab_info_list[i].fti_mask) 2499 return (&flow_tab_info_list[i]); 2500 } 2501 return (NULL); 2502 } 2503