/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/strsun.h>
#include <sys/sdt.h>
#include <sys/mac.h>
#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_stat.h>
#include <sys/dls.h>
#include <sys/dls_impl.h>
#include <sys/mac_soft_ring.h>
#include <sys/ethernet.h>
#include <sys/cpupart.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/vlan.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>

/*
 * Snapshot of the per-flow counters exported through the flow kstat.
 * flow_stat_update() fills one of these on the stack for every kstat read.
 */
typedef struct flow_stats_s {
	uint64_t	fs_obytes;
	uint64_t	fs_opackets;
	uint64_t	fs_oerrors;
	uint64_t	fs_ibytes;
	uint64_t	fs_ipackets;
	uint64_t	fs_ierrors;
} flow_stats_t;

/* global flow table, will be a per exclusive-zone table later */
static mod_hash_t	*flow_hash;
static krwlock_t	flow_tab_lock;

/* kmem caches backing flow_entry_t and flow_tab_t allocations */
static kmem_cache_t	*flow_cache;
static kmem_cache_t	*flow_tab_cache;
static flow_ops_t	flow_l2_ops;

/*
 * Maps a kstat name to the offset of the matching counter inside
 * flow_stats_t.
 */
typedef struct {
	const char	*fs_name;
	uint_t		fs_offset;
} flow_stats_info_t;

#define	FS_OFF(f)	(offsetof(flow_stats_t, f))
static flow_stats_info_t flow_stats_list[] = {
	{"rbytes",	FS_OFF(fs_ibytes)},
	{"ipackets",	FS_OFF(fs_ipackets)},
	{"ierrors",	FS_OFF(fs_ierrors)},
	{"obytes",	FS_OFF(fs_obytes)},
	{"opackets",	FS_OFF(fs_opackets)},
	{"oerrors",	FS_OFF(fs_oerrors)}
};
/* Number of entries in flow_stats_list[] */
#define	FS_SIZE		(sizeof (flow_stats_list) / sizeof (flow_stats_info_t))

/*
 * Forward declaration; the definition is not part of this section.
 * mac_flow_add_subflow() treats a NULL return as "mask not supported".
 */
static flow_tab_info_t	*mac_flow_tab_info_get(flow_mask_t);

/*
 * Name every kstat_named_t slot after the corresponding flow_stats_list[]
 * entry. 'knp' must point at FS_SIZE consecutive slots.
 */
static void
flow_stat_init(kstat_named_t *knp)
{
	int	i;

	for (i = 0; i < FS_SIZE; i++, knp++) {
		kstat_named_init(knp, flow_stats_list[i].fs_name,
		    KSTAT_DATA_UINT64);
	}
}

/*
 * ks_update callback for a flow kstat. Only reads are supported; a write
 * request returns EACCES. Rx counters are summed over the flow's Rx SRSs,
 * Tx counters are taken from the single Tx SRS, and the result is copied
 * into the named kstat slots in flow_stats_list[] order.
 */
static int
flow_stat_update(kstat_t *ksp, int rw)
{
	flow_entry_t		*fep = ksp->ks_private;
	kstat_named_t		*knp = ksp->ks_data;
	uint64_t		*statp;
	int			i;
	mac_rx_stats_t		*mac_rx_stat;
	mac_tx_stats_t		*mac_tx_stat;
	flow_stats_t		flow_stats;
	mac_soft_ring_set_t	*mac_srs;

	if (rw != KSTAT_READ)
		return (EACCES);

	bzero(&flow_stats, sizeof (flow_stats_t));

	for (i = 0; i < fep->fe_rx_srs_cnt; i++) {
		mac_srs = (mac_soft_ring_set_t *)fep->fe_rx_srs[i];
		if (mac_srs == NULL)		/* Multicast flow */
			break;
		mac_rx_stat = &mac_srs->srs_rx.sr_stat;

		/* Inbound totals combine the interrupt, poll and local paths */
		flow_stats.fs_ibytes += mac_rx_stat->mrs_intrbytes +
		    mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes;

		flow_stats.fs_ipackets += mac_rx_stat->mrs_intrcnt +
		    mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;

		flow_stats.fs_ierrors += mac_rx_stat->mrs_ierrors;
	}

	mac_srs = (mac_soft_ring_set_t *)fep->fe_tx_srs;
	if (mac_srs == NULL)		/* Multicast flow */
		goto done;
	mac_tx_stat = &mac_srs->srs_tx.st_stat;

	flow_stats.fs_obytes = mac_tx_stat->mts_obytes;
	flow_stats.fs_opackets = mac_tx_stat->mts_opackets;
	flow_stats.fs_oerrors = mac_tx_stat->mts_oerrors;

done:
	/* Publish the snapshot, one slot per flow_stats_list[] entry */
	for (i = 0; i < FS_SIZE; i++, knp++) {
		statp = (uint64_t *)
		    ((uchar_t *)&flow_stats + flow_stats_list[i].fs_offset);
		knp->value.ui64 = *statp;
	}
	return (0);
}

/*
 * Create and install the named kstat for a flow. On failure fe_ksp is
 * simply left untouched; the caller is not notified.
 */
static void
flow_stat_create(flow_entry_t *fep)
{
	kstat_t		*ksp;
	kstat_named_t	*knp;
	uint_t		nstats = FS_SIZE;

	/*
	 * For now, flow entries are only manipulated and visible from the
	 * global zone.
	 */
	ksp = kstat_create_zone("unix", 0, (char *)fep->fe_flow_name, "flow",
	    KSTAT_TYPE_NAMED, nstats, 0, GLOBAL_ZONEID);
	if (ksp == NULL)
		return;

	ksp->ks_update = flow_stat_update;
	ksp->ks_private = fep;
	fep->fe_ksp = ksp;

	knp = (kstat_named_t *)ksp->ks_data;
	flow_stat_init(knp);
	kstat_install(ksp);
}

/*
 * Delete the flow's kstat, if one was created, and clear fe_ksp.
 */
void
flow_stat_destroy(flow_entry_t *fep)
{
	if (fep->fe_ksp != NULL) {
		kstat_delete(fep->fe_ksp);
		fep->fe_ksp = NULL;
	}
}

/*
 * Initialize the flow table: create the kmem caches, the global
 * name-keyed flow hash and the lock protecting it.
 */
void
mac_flow_init()
{
	flow_cache = kmem_cache_create("flow_entry_cache",
	    sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	flow_tab_cache = kmem_cache_create("flow_tab_cache",
	    sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	flow_hash = mod_hash_create_extended("flow_hash",
	    100, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
	rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
}

/*
 * Cleanup and release the flow table
 */
void
mac_flow_fini()
{
	kmem_cache_destroy(flow_cache);
	kmem_cache_destroy(flow_tab_cache);
	mod_hash_destroy_hash(flow_hash);
	rw_destroy(&flow_tab_lock);
}

/*
 * mac_flow_create(): create a flow_entry_t.
*/ int mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name, void *client_cookie, uint_t type, flow_entry_t **flentp) { flow_entry_t *flent = *flentp; int err = 0; if (mrp != NULL) { err = mac_validate_props(NULL, mrp); if (err != 0) return (err); } if (flent == NULL) { flent = kmem_cache_alloc(flow_cache, KM_SLEEP); bzero(flent, sizeof (*flent)); mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL); /* Initialize the receiver function to a safe routine */ flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; flent->fe_index = -1; } (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); /* This is an initial flow, will be configured later */ if (fd == NULL) { *flentp = flent; return (0); } flent->fe_client_cookie = client_cookie; flent->fe_type = type; /* Save flow desc */ bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); if (mrp != NULL) { /* * We have already set fe_resource_props for a Link. */ if (type & FLOW_USER) { bcopy(mrp, &flent->fe_resource_props, sizeof (mac_resource_props_t)); } /* * The effective resource list should reflect the priority * that we set implicitly. */ if (!(mrp->mrp_mask & MRP_PRIORITY)) mrp->mrp_mask |= MRP_PRIORITY; if (type & FLOW_USER) mrp->mrp_priority = MPL_SUBFLOW_DEFAULT; else mrp->mrp_priority = MPL_LINK_DEFAULT; bzero(mrp->mrp_pool, MAXPATHLEN); bzero(&mrp->mrp_cpus, sizeof (mac_cpus_t)); bcopy(mrp, &flent->fe_effective_props, sizeof (mac_resource_props_t)); } flow_stat_create(flent); *flentp = flent; return (0); } /* * Validate flow entry and add it to a flow table. */ int mac_flow_add(flow_tab_t *ft, flow_entry_t *flent) { flow_entry_t **headp, **p; flow_ops_t *ops = &ft->ft_ops; flow_mask_t mask; uint32_t index; int err; ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); /* * Check for invalid bits in mask. */ mask = flent->fe_flow_desc.fd_mask; if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0) return (EOPNOTSUPP); /* * Validate flent. 
*/ if ((err = ops->fo_accept_fe(ft, flent)) != 0) { DTRACE_PROBE3(accept_failed, flow_tab_t *, ft, flow_entry_t *, flent, int, err); return (err); } /* * Flent is valid. now calculate hash and insert it * into hash table. */ index = ops->fo_hash_fe(ft, flent); /* * We do not need a lock up until now because we were * not accessing the flow table. */ rw_enter(&ft->ft_lock, RW_WRITER); headp = &ft->ft_table[index]; /* * Check for duplicate flow. */ for (p = headp; *p != NULL; p = &(*p)->fe_next) { if ((*p)->fe_flow_desc.fd_mask != flent->fe_flow_desc.fd_mask) continue; if (ft->ft_ops.fo_match_fe(ft, *p, flent)) { rw_exit(&ft->ft_lock); DTRACE_PROBE3(dup_flow, flow_tab_t *, ft, flow_entry_t *, flent, int, err); return (EALREADY); } } /* * Insert flow to hash list. */ err = ops->fo_insert_fe(ft, headp, flent); if (err != 0) { rw_exit(&ft->ft_lock); DTRACE_PROBE3(insert_failed, flow_tab_t *, ft, flow_entry_t *, flent, int, err); return (err); } /* * Save the hash index so it can be used by mac_flow_remove(). */ flent->fe_index = (int)index; /* * Save the flow tab back reference. */ flent->fe_flow_tab = ft; FLOW_MARK(flent, FE_FLOW_TAB); ft->ft_flow_count++; rw_exit(&ft->ft_lock); return (0); } /* * Remove a flow from a mac client's subflow table */ void mac_flow_rem_subflow(flow_entry_t *flent) { flow_tab_t *ft = flent->fe_flow_tab; mac_client_impl_t *mcip = ft->ft_mcip; mac_handle_t mh = (mac_handle_t)ft->ft_mip; ASSERT(MAC_PERIM_HELD(mh)); mac_flow_remove(ft, flent, B_FALSE); if (flent->fe_mcip == NULL) { /* * The interface is not yet plumbed and mac_client_flow_add * was not done. */ if (FLOW_TAB_EMPTY(ft)) { mac_flow_tab_destroy(ft); mcip->mci_subflow_tab = NULL; } } else { mac_flow_wait(flent, FLOW_DRIVER_UPCALL); mac_link_flow_clean((mac_client_handle_t)mcip, flent); } mac_fastpath_enable(mh); } /* * Add a flow to a mac client's subflow table and instantiate the flow * in the mac by creating the associated SRSs etc. 
*/ int mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent, boolean_t instantiate_flow) { mac_client_impl_t *mcip = (mac_client_impl_t *)mch; mac_handle_t mh = (mac_handle_t)mcip->mci_mip; flow_tab_info_t *ftinfo; flow_mask_t mask; flow_tab_t *ft; int err; boolean_t ft_created = B_FALSE; ASSERT(MAC_PERIM_HELD(mh)); if ((err = mac_fastpath_disable(mh)) != 0) return (err); /* * If the subflow table exists already just add the new subflow * to the existing table, else we create a new subflow table below. */ ft = mcip->mci_subflow_tab; if (ft == NULL) { mask = flent->fe_flow_desc.fd_mask; /* * Try to create a new table and then add the subflow to the * newly created subflow table */ if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) { mac_fastpath_enable(mh); return (EOPNOTSUPP); } mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size, mcip->mci_mip, &ft); ft_created = B_TRUE; } err = mac_flow_add(ft, flent); if (err != 0) { if (ft_created) mac_flow_tab_destroy(ft); mac_fastpath_enable(mh); return (err); } if (instantiate_flow) { /* Now activate the flow by creating its SRSs */ ASSERT(MCIP_DATAPATH_SETUP(mcip)); err = mac_link_flow_init((mac_client_handle_t)mcip, flent); if (err != 0) { mac_flow_remove(ft, flent, B_FALSE); if (ft_created) mac_flow_tab_destroy(ft); mac_fastpath_enable(mh); return (err); } } else { FLOW_MARK(flent, FE_UF_NO_DATAPATH); } if (ft_created) { ASSERT(mcip->mci_subflow_tab == NULL); ft->ft_mcip = mcip; mcip->mci_subflow_tab = ft; if (instantiate_flow) mac_client_update_classifier(mcip, B_TRUE); } return (0); } /* * Remove flow entry from flow table. */ void mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp) { flow_entry_t **fp; ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); if (!(flent->fe_flags & FE_FLOW_TAB)) return; rw_enter(&ft->ft_lock, RW_WRITER); /* * If this is a permanent removal from the flow table, mark it * CONDEMNED to prevent future references. 
If this is a temporary * removal from the table, say to update the flow descriptor then * we don't mark it CONDEMNED */ if (!temp) FLOW_MARK(flent, FE_CONDEMNED); /* * Locate the specified flent. */ fp = &ft->ft_table[flent->fe_index]; while (*fp != flent) fp = &(*fp)->fe_next; /* * The flent must exist. Otherwise it's a bug. */ ASSERT(fp != NULL); *fp = flent->fe_next; flent->fe_next = NULL; /* * Reset fe_index to -1 so any attempt to call mac_flow_remove() * on a flent that is supposed to be in the table (FE_FLOW_TAB) * will panic. */ flent->fe_index = -1; FLOW_UNMARK(flent, FE_FLOW_TAB); ft->ft_flow_count--; rw_exit(&ft->ft_lock); } /* * This is the flow lookup routine used by the mac sw classifier engine. */ int mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp) { flow_state_t s; flow_entry_t *flent; flow_ops_t *ops = &ft->ft_ops; boolean_t retried = B_FALSE; int i, err; s.fs_flags = flags; retry: s.fs_mp = mp; /* * Walk the list of predeclared accept functions. * Each of these would accumulate enough state to allow the next * accept routine to make progress. */ for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) { if ((err = (ops->fo_accept[i])(ft, &s)) != 0) { mblk_t *last; /* * ENOBUFS indicates that the mp could be too short * and may need a pullup. */ if (err != ENOBUFS || retried) return (err); /* * The pullup is done on the last processed mblk, not * the starting one. pullup is not done if the mblk * has references or if b_cont is NULL. */ last = s.fs_mp; if (DB_REF(last) > 1 || last->b_cont == NULL || pullupmsg(last, -1) == 0) return (EINVAL); retried = B_TRUE; DTRACE_PROBE2(need_pullup, flow_tab_t *, ft, flow_state_t *, &s); goto retry; } } /* * The packet is considered sane. We may now attempt to * find the corresponding flent. 
*/ rw_enter(&ft->ft_lock, RW_READER); flent = ft->ft_table[ops->fo_hash(ft, &s)]; for (; flent != NULL; flent = flent->fe_next) { if (flent->fe_match(ft, flent, &s)) { FLOW_TRY_REFHOLD(flent, err); if (err != 0) continue; *flentp = flent; rw_exit(&ft->ft_lock); return (0); } } rw_exit(&ft->ft_lock); return (ENOENT); } /* * Walk flow table. * The caller is assumed to have proper perimeter protection. */ int mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *), void *arg) { int err, i, cnt = 0; flow_entry_t *flent; if (ft == NULL) return (0); for (i = 0; i < ft->ft_size; i++) { for (flent = ft->ft_table[i]; flent != NULL; flent = flent->fe_next) { cnt++; err = (*fn)(flent, arg); if (err != 0) return (err); } } VERIFY(cnt == ft->ft_flow_count); return (0); } /* * Same as the above except a mutex is used for protection here. */ int mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *), void *arg) { int err; if (ft == NULL) return (0); rw_enter(&ft->ft_lock, RW_WRITER); err = mac_flow_walk_nolock(ft, fn, arg); rw_exit(&ft->ft_lock); return (err); } static boolean_t mac_flow_clean(flow_entry_t *); /* * Destroy a flow entry. Called when the last reference on a flow is released. */ void mac_flow_destroy(flow_entry_t *flent) { ASSERT(flent->fe_refcnt == 0); if ((flent->fe_type & FLOW_USER) != 0) { ASSERT(mac_flow_clean(flent)); } else { mac_flow_cleanup(flent); } mac_misc_stat_delete(flent); mutex_destroy(&flent->fe_lock); cv_destroy(&flent->fe_cv); flow_stat_destroy(flent); kmem_cache_free(flow_cache, flent); } /* * XXX eric * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and * mac_link_flow_modify() should really be moved/reworked into the * two functions below. This would consolidate all the mac property * checking in one place. I'm leaving this alone for now since it's * out of scope of the new flows work. 
*/ /* ARGSUSED */ uint32_t mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp) { uint32_t changed_mask = 0; mac_resource_props_t *fmrp = &flent->fe_effective_props; int i; if ((mrp->mrp_mask & MRP_MAXBW) != 0 && (!(fmrp->mrp_mask & MRP_MAXBW) || (fmrp->mrp_maxbw != mrp->mrp_maxbw))) { changed_mask |= MRP_MAXBW; if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) { fmrp->mrp_mask &= ~MRP_MAXBW; fmrp->mrp_maxbw = 0; } else { fmrp->mrp_mask |= MRP_MAXBW; fmrp->mrp_maxbw = mrp->mrp_maxbw; } } if ((mrp->mrp_mask & MRP_PRIORITY) != 0) { if (fmrp->mrp_priority != mrp->mrp_priority) changed_mask |= MRP_PRIORITY; if (mrp->mrp_priority == MPL_RESET) { fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT; fmrp->mrp_mask &= ~MRP_PRIORITY; } else { fmrp->mrp_priority = mrp->mrp_priority; fmrp->mrp_mask |= MRP_PRIORITY; } } /* modify fanout */ if ((mrp->mrp_mask & MRP_CPUS) != 0) { if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) && (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) { for (i = 0; i < mrp->mrp_ncpus; i++) { if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i]) break; } if (i == mrp->mrp_ncpus) { /* * The new set of cpus passed is exactly * the same as the existing set. */ return (changed_mask); } } changed_mask |= MRP_CPUS; MAC_COPY_CPUS(mrp, fmrp); } /* * Modify the rings property. 
*/ if (mrp->mrp_mask & MRP_RX_RINGS || mrp->mrp_mask & MRP_TX_RINGS) mac_set_rings_effective(flent->fe_mcip); if ((mrp->mrp_mask & MRP_POOL) != 0) { if (strcmp(fmrp->mrp_pool, mrp->mrp_pool) != 0) changed_mask |= MRP_POOL; if (strlen(mrp->mrp_pool) == 0) fmrp->mrp_mask &= ~MRP_POOL; else fmrp->mrp_mask |= MRP_POOL; (void) strncpy(fmrp->mrp_pool, mrp->mrp_pool, MAXPATHLEN); } return (changed_mask); } void mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp) { uint32_t changed_mask; mac_client_impl_t *mcip = flent->fe_mcip; mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip); mac_resource_props_t *emrp = MCIP_EFFECTIVE_PROPS(mcip); cpupart_t *cpupart = NULL; boolean_t use_default = B_FALSE; ASSERT(flent != NULL); ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); rw_enter(&ft->ft_lock, RW_WRITER); /* Update the cached values inside the subflow entry */ changed_mask = mac_flow_modify_props(flent, mrp); rw_exit(&ft->ft_lock); /* * Push the changed parameters to the scheduling code in the * SRS's, to take effect right away. */ if (changed_mask & MRP_MAXBW) { mac_srs_update_bwlimit(flent, mrp); /* * If bandwidth is changed, we may have to change * the number of soft ring to be used for fanout. * Call mac_flow_update_fanout() if MAC_BIND_CPU * is not set and there is no user supplied cpu * info. This applies only to link at this time. 
*/ if (!(flent->fe_type & FLOW_USER) && !(changed_mask & MRP_CPUS) && !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) { mac_fanout_setup(mcip, flent, mcip_mrp, mac_rx_deliver, mcip, NULL, NULL); } } if (mrp->mrp_mask & MRP_PRIORITY) mac_flow_update_priority(mcip, flent); if (changed_mask & MRP_CPUS) mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL, NULL); if (mrp->mrp_mask & MRP_POOL) { pool_lock(); cpupart = mac_pset_find(mrp, &use_default); mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL, cpupart); mac_set_pool_effective(use_default, cpupart, mrp, emrp); pool_unlock(); } } /* * This function waits for a certain condition to be met and is generally * used before a destructive or quiescing operation. */ void mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event) { mutex_enter(&flent->fe_lock); flent->fe_flags |= FE_WAITER; switch (event) { case FLOW_DRIVER_UPCALL: /* * We want to make sure the driver upcalls have finished before * we signal the Rx SRS worker to quit. */ while (flent->fe_refcnt != 1) cv_wait(&flent->fe_cv, &flent->fe_lock); break; case FLOW_USER_REF: /* * Wait for the fe_user_refcnt to drop to 0. The flow has * been removed from the global flow hash. 
*/ ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH)); while (flent->fe_user_refcnt != 0) cv_wait(&flent->fe_cv, &flent->fe_lock); break; default: ASSERT(0); } flent->fe_flags &= ~FE_WAITER; mutex_exit(&flent->fe_lock); } static boolean_t mac_flow_clean(flow_entry_t *flent) { ASSERT(flent->fe_next == NULL); ASSERT(flent->fe_tx_srs == NULL); ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL); ASSERT(flent->fe_mbg == NULL); return (B_TRUE); } void mac_flow_cleanup(flow_entry_t *flent) { if ((flent->fe_type & FLOW_USER) == 0) { ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) || (flent->fe_mbg != NULL && flent->fe_mcip == NULL)); ASSERT(flent->fe_refcnt == 0); } else { ASSERT(flent->fe_refcnt == 1); } if (flent->fe_mbg != NULL) { ASSERT(flent->fe_tx_srs == NULL); /* This is a multicast or broadcast flow entry */ mac_bcast_grp_free(flent->fe_mbg); flent->fe_mbg = NULL; } if (flent->fe_tx_srs != NULL) { ASSERT(flent->fe_mbg == NULL); mac_srs_free(flent->fe_tx_srs); flent->fe_tx_srs = NULL; } /* * In the normal case fe_rx_srs_cnt is 1. However in the error case * when mac_unicast_add fails we may not have set up any SRS * in which case fe_rx_srs_cnt will be zero. */ if (flent->fe_rx_srs_cnt != 0) { ASSERT(flent->fe_rx_srs_cnt == 1); mac_srs_free(flent->fe_rx_srs[0]); flent->fe_rx_srs[0] = NULL; flent->fe_rx_srs_cnt = 0; } ASSERT(flent->fe_rx_srs[0] == NULL); } void mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd) { /* * Grab the fe_lock to see a self-consistent fe_flow_desc. * Updates to the fe_flow_desc happen under the fe_lock * after removing the flent from the flow table */ mutex_enter(&flent->fe_lock); bcopy(&flent->fe_flow_desc, fd, sizeof (*fd)); mutex_exit(&flent->fe_lock); } /* * Update a field of a flow entry. The mac perimeter ensures that * this is the only thread doing a modify operation on this mac end point. * So the flow table can't change or disappear. 
The ft_lock protects access * to the flow entry, and holding the lock ensures that there isn't any thread * accessing the flow entry or attempting a flow table lookup. However * data threads that are using the flow entry based on the old descriptor * will continue to use the flow entry. If strong coherence is required * then the flow will have to be quiesced before the descriptor can be * changed. */ void mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd) { flow_tab_t *ft = flent->fe_flow_tab; flow_desc_t old_desc; int err; if (ft == NULL) { /* * The flow hasn't yet been inserted into the table, * so only the caller knows about this flow, however for * uniformity we grab the fe_lock here. */ mutex_enter(&flent->fe_lock); bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); mutex_exit(&flent->fe_lock); } ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); /* * Need to remove the flow entry from the table and reinsert it, * into a potentially diference hash line. The hash depends on * the new descriptor fields. However access to fe_desc itself * is always under the fe_lock. This helps log and stat functions * see a self-consistent fe_flow_desc. */ mac_flow_remove(ft, flent, B_TRUE); old_desc = flent->fe_flow_desc; mutex_enter(&flent->fe_lock); bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); mutex_exit(&flent->fe_lock); if (mac_flow_add(ft, flent) != 0) { /* * The add failed say due to an invalid flow descriptor. 
* Undo the update */ flent->fe_flow_desc = old_desc; err = mac_flow_add(ft, flent); ASSERT(err == 0); } } void mac_flow_set_name(flow_entry_t *flent, const char *name) { flow_tab_t *ft = flent->fe_flow_tab; if (ft == NULL) { /* * The flow hasn't yet been inserted into the table, * so only the caller knows about this flow */ (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); } else { ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); } mutex_enter(&flent->fe_lock); (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); mutex_exit(&flent->fe_lock); } /* * Return the client-private cookie that was associated with * the flow when it was created. */ void * mac_flow_get_client_cookie(flow_entry_t *flent) { return (flent->fe_client_cookie); } /* * Forward declarations. */ static uint32_t flow_l2_hash(flow_tab_t *, flow_state_t *); static uint32_t flow_l2_hash_fe(flow_tab_t *, flow_entry_t *); static int flow_l2_accept(flow_tab_t *, flow_state_t *); static uint32_t flow_ether_hash(flow_tab_t *, flow_state_t *); static uint32_t flow_ether_hash_fe(flow_tab_t *, flow_entry_t *); static int flow_ether_accept(flow_tab_t *, flow_state_t *); /* * Create flow table. */ void mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size, mac_impl_t *mip, flow_tab_t **ftp) { flow_tab_t *ft; flow_ops_t *new_ops; ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP); bzero(ft, sizeof (*ft)); ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP); /* * We make a copy of the ops vector instead of just pointing to it * because we might want to customize the ops vector on a per table * basis (e.g. for optimization). */ new_ops = &ft->ft_ops; bcopy(ops, new_ops, sizeof (*ops)); ft->ft_mask = mask; ft->ft_size = size; ft->ft_mip = mip; /* * Optimizations for DL_ETHER media. 
*/ if (mip->mi_info.mi_nativemedia == DL_ETHER) { if (new_ops->fo_hash == flow_l2_hash) new_ops->fo_hash = flow_ether_hash; if (new_ops->fo_hash_fe == flow_l2_hash_fe) new_ops->fo_hash_fe = flow_ether_hash_fe; if (new_ops->fo_accept[0] == flow_l2_accept) new_ops->fo_accept[0] = flow_ether_accept; } *ftp = ft; } void mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp) { mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID, 1024, mip, ftp); } /* * Destroy flow table. */ void mac_flow_tab_destroy(flow_tab_t *ft) { if (ft == NULL) return; ASSERT(ft->ft_flow_count == 0); kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *)); bzero(ft, sizeof (*ft)); kmem_cache_free(flow_tab_cache, ft); } /* * Add a new flow entry to the global flow hash table */ int mac_flow_hash_add(flow_entry_t *flent) { int err; rw_enter(&flow_tab_lock, RW_WRITER); err = mod_hash_insert(flow_hash, (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent); if (err != 0) { rw_exit(&flow_tab_lock); return (EEXIST); } /* Mark as inserted into the global flow hash table */ FLOW_MARK(flent, FE_G_FLOW_HASH); rw_exit(&flow_tab_lock); return (err); } /* * Remove a flow entry from the global flow hash table */ void mac_flow_hash_remove(flow_entry_t *flent) { mod_hash_val_t val; rw_enter(&flow_tab_lock, RW_WRITER); VERIFY(mod_hash_remove(flow_hash, (mod_hash_key_t)flent->fe_flow_name, &val) == 0); /* Clear the mark that says inserted into the global flow hash table */ FLOW_UNMARK(flent, FE_G_FLOW_HASH); rw_exit(&flow_tab_lock); } /* * Retrieve a flow entry from the global flow hash table. 
*/
/*
 * On success returns 0 with a user reference held on *flentp; the caller
 * drops it with FLOW_USER_REFRELE(). Returns ENOENT if no flow has that
 * name.
 */
int
mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
{
	int		err;
	flow_entry_t	*flent;

	rw_enter(&flow_tab_lock, RW_READER);
	err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
	    (mod_hash_val_t *)&flent);
	if (err != 0) {
		rw_exit(&flow_tab_lock);
		return (ENOENT);
	}
	ASSERT(flent != NULL);
	/* Take the reference before the hash lock is dropped */
	FLOW_USER_REFHOLD(flent);
	rw_exit(&flow_tab_lock);

	*flentp = flent;
	return (0);
}

/*
 * Initialize or release mac client flows by walking the subflow table.
 * These are typically invoked during plumb/unplumb of links.
 */

/*
 * Walker callback: instantiate one subflow on the mac client passed in
 * 'arg'. A failure is logged and the walk continues (always returns 0).
 */
static int
mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
{
	mac_client_impl_t	*mcip = arg;

	if (mac_link_flow_init(arg, flent) != 0) {
		cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
		    flent->fe_flow_name, mcip->mci_name);
	} else {
		FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
	}
	return (0);
}

/*
 * Instantiate every subflow configured on the mac client.
 */
void
mac_link_init_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_init_flows_cb, mcip);
	/*
	 * If mac client had subflow(s) configured before plumb, change
	 * function to mac_rx_srs_subflow_process and in case of hardware
	 * classification, disable polling.
	 */
	mac_client_update_classifier(mcip, B_TRUE);

}

/*
 * Return B_TRUE if the mac client's subflow table is not empty.
 */
boolean_t
mac_link_has_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * Walker callback: deactivate one subflow. The flow is marked as having
 * no datapath, driver upcalls are drained, and its datapath is torn down.
 */
static int
mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
{
	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
	mac_link_flow_clean(arg, flent);
	return (0);
}

/*
 * Deactivate every subflow configured on the mac client.
 */
void
mac_link_release_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	/*
	 * Change the mci_flent callback back to mac_rx_srs_process()
	 * because flows are about to be deactivated.
	 */
	mac_client_update_classifier(mcip, B_FALSE);
	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_release_flows_cb, mcip);
}

/*
 * Rename a flow. If the flow has a kstat it is recreated so that it is
 * published under the new name.
 */
void
mac_rename_flow(flow_entry_t *fep, const char *new_name)
{
	mac_flow_set_name(fep, new_name);
	if (fep->fe_ksp != NULL) {
		flow_stat_destroy(fep);
		flow_stat_create(fep);
	}
}

/*
 * mac_link_flow_init()
 * Internal flow interface used for allocating SRSs and related
 * data structures. Not meant to be used by mac clients.
 *
 * On success the subflow is bound to the mac client (fe_mcip is set).
 * Returns 0 or the error from mac_datapath_setup().
 */
int
mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
{
	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
	mac_impl_t		*mip = mcip->mci_mip;
	int			err;

	ASSERT(mch != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
		return (err);

	sub_flow->fe_mcip = mcip;

	return (0);
}

/*
 * mac_link_flow_add()
 * Used by flowadm(1m) or kernel mac clients for creating flows.
 *
 * Returns 0 on success, EEXIST if a flow with this name already exists,
 * ENOTSUP for an anchor VNIC, or the error from whichever step failed.
 */
int
mac_link_flow_add(datalink_id_t linkid, char *flow_name,
    flow_desc_t *flow_desc, mac_resource_props_t *mrp)
{
	flow_entry_t		*flent = NULL;
	int			err;
	dls_dl_handle_t		dlh;
	dls_link_t		*dlp;
	boolean_t		link_held = B_FALSE;
	boolean_t		hash_added = B_FALSE;
	mac_perim_handle_t	mph;

	/* Flow names are unique */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err == 0) {
		FLOW_USER_REFRELE(flent);
		return (EEXIST);
	}

	/*
	 * First create a flow entry given the description provided
	 * by the caller.
	 */
	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
	    FLOW_USER | FLOW_OTHER, &flent);

	if (err != 0)
		return (err);

	/*
	 * We've got a local variable referencing this flow now, so we need
	 * to hold it. We'll release this flow before returning.
	 * All failures until we return will undo any action that may have
	 * internally held the flow, so the last REFRELE will assure a clean
	 * freeing of resources.
	 */
	FLOW_REFHOLD(flent);

	flent->fe_link_id = linkid;
	FLOW_MARK(flent, FE_INCIPIENT);

	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0) {
		FLOW_FINAL_REFRELE(flent);
		return (err);
	}

	/*
	 * dls will eventually be merged with mac so it's ok
	 * to call dls' internal functions.
	 */
	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
	if (err != 0)
		goto bail;

	link_held = B_TRUE;

	/*
	 * Add the flow to the global flow table, this table will be per
	 * exclusive zone so each zone can have its own flow namespace.
	 * RFE 6625651 will fix this.
	 */
	if ((err = mac_flow_hash_add(flent)) != 0)
		goto bail;

	hash_added = B_TRUE;

	/*
	 * do not allow flows to be configured on an anchor VNIC
	 */
	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
		err = ENOTSUP;
		goto bail;
	}

	/*
	 * Add the subflow to the subflow table. Also instantiate the flow
	 * in the mac if there is an active user (we check if the MAC client's
	 * datapath has been setup).
	 */
	err = mac_flow_add_subflow(dlp->dl_mch, flent,
	    MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
	if (err != 0)
		goto bail;

	FLOW_UNMARK(flent, FE_INCIPIENT);
	dls_devnet_rele_link(dlh, dlp);
	mac_perim_exit(mph);
	return (0);

bail:
	/* Undo, in reverse order, whatever was done before the failure */
	if (hash_added)
		mac_flow_hash_remove(flent);

	if (link_held)
		dls_devnet_rele_link(dlh, dlp);

	/*
	 * Wait for any transient global flow hash refs to clear
	 * and then release the creation reference on the flow
	 */
	mac_flow_wait(flent, FLOW_USER_REF);
	FLOW_FINAL_REFRELE(flent);
	mac_perim_exit(mph);
	return (err);
}

/*
 * mac_link_flow_clean()
 * Internal flow interface used for freeing SRSs and related
 * data structures. Not meant to be used by mac clients.
 */
void
mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
{
	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
	mac_impl_t		*mip = mcip->mci_mip;
	boolean_t		last_subflow;

	ASSERT(mch != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/*
	 * This sub flow entry may fail to be fully initialized by
	 * mac_link_flow_init(). If so, simply return.
	 */
	if (sub_flow->fe_mcip == NULL)
		return;

	/* Sampled up front, before any teardown is done */
	last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
	/*
	 * Tear down the data path
	 */
	mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
	sub_flow->fe_mcip = NULL;

	/*
	 * Delete the SRSs associated with this subflow. If this is being
	 * driven by flowadm(1M) then the subflow will be deleted by
	 * dls_rem_flow. However if this is a result of the interface being
	 * unplumbed then the subflow itself won't be deleted.
	 */
	mac_flow_cleanup(sub_flow);

	/*
	 * If all the subflows are gone, re-enable some of the stuff
	 * we disabled when adding a subflow, polling etc.
	 */
	if (last_subflow) {
		/*
		 * The subflow table itself is not protected by any locks or
		 * refcnts. Hence quiesce the client upfront before clearing
		 * mci_subflow_tab.
		 */
		mac_client_quiesce(mcip);
		mac_client_update_classifier(mcip, B_FALSE);
		mac_flow_tab_destroy(mcip->mci_subflow_tab);
		mcip->mci_subflow_tab = NULL;
		mac_client_restart(mcip);
	}
}

/*
 * mac_link_flow_remove()
 * Used by flowadm(1m) or kernel mac clients for removing flows.
 *
 * Returns 0, ENOENT if the flow does not exist (or disappeared while the
 * perimeter was being entered), or the error from
 * mac_perim_enter_by_linkid().
 */
int
mac_link_flow_remove(char *flow_name)
{
	flow_entry_t		*flent;
	mac_perim_handle_t	mph;
	int			err;
	datalink_id_t		linkid;

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	/* The first lookup only serves to find the owning link */
	linkid = flent->fe_link_id;
	FLOW_USER_REFRELE(flent);

	/*
	 * The perim must be acquired before acquiring any other references
	 * to maintain the lock and perimeter hierarchy. Please note the
	 * FLOW_USER_REFRELE above.
	 */
	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	/*
	 * Note the second lookup of the flow, because a concurrent thread
	 * may have removed it already while we were waiting to enter the
	 * link's perimeter.
	 */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}
	FLOW_USER_REFRELE(flent);

	/*
	 * Remove the flow from the subflow table and deactivate the flow
	 * by quiescing and removing its SRSs
	 */
	mac_flow_rem_subflow(flent);

	/*
	 * Finally, remove the flow from the global table.
	 */
	mac_flow_hash_remove(flent);

	/*
	 * Wait for any transient global flow hash refs to clear
	 * and then release the creation reference on the flow
	 */
	mac_flow_wait(flent, FLOW_USER_REF);
	FLOW_FINAL_REFRELE(flent);

	mac_perim_exit(mph);

	return (0);
}

/*
 * mac_link_flow_modify()
 * Modifies the properties of a flow identified by its name.
 *
 * Returns 0, the error from mac_validate_props(), mac_flow_lookup_byname()
 * or mac_perim_enter_by_linkid(), or ENOENT if the flow is attached to a
 * mac client that has no subflow table.
 */
int
mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
{
	flow_entry_t		*flent;
	mac_client_impl_t 	*mcip;
	int			err = 0;
	mac_perim_handle_t	mph;
	datalink_id_t		linkid;
	flow_tab_t		*flow_tab;

	err = mac_validate_props(NULL, mrp);
	if (err != 0)
		return (err);

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	/* The first lookup only serves to find the owning link */
	linkid = flent->fe_link_id;
	FLOW_USER_REFRELE(flent);

	/*
	 * The perim must be acquired before acquiring any other references
	 * to maintain the lock and perimeter hierarchy. Please note the
	 * FLOW_USER_REFRELE above.
	 */
	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	/*
	 * Note the second lookup of the flow, because a concurrent thread
	 * may have removed it already while we were waiting to enter the
	 * link's perimeter.
	 */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}
	FLOW_USER_REFRELE(flent);

	/*
	 * If this flow is attached to a MAC client, then pass the request
	 * along to the client.
	 * Otherwise, just update the cached values.
	 */
	mcip = flent->fe_mcip;
	mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
	if (mcip != NULL) {
		if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
			err = ENOENT;
		} else {
			mac_flow_modify(flow_tab, flent, mrp);
		}
	} else {
		(void) mac_flow_modify_props(flent, mrp);
	}

	/* NOTE(review): no goto in this function targets this label */
done:
	mac_perim_exit(mph);
	return (err);
}


/*
 * State structure and misc functions used by mac_link_flow_walk().
*/ typedef struct { int (*ws_func)(mac_flowinfo_t *, void *); void *ws_arg; } flow_walk_state_t; static void mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent) { (void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name, MAXFLOWNAMELEN); finfop->fi_link_id = flent->fe_link_id; finfop->fi_flow_desc = flent->fe_flow_desc; finfop->fi_resource_props = flent->fe_resource_props; } static int mac_link_flow_walk_cb(flow_entry_t *flent, void *arg) { flow_walk_state_t *statep = arg; mac_flowinfo_t *finfo; int err; finfo = kmem_zalloc(sizeof (*finfo), KM_SLEEP); mac_link_flowinfo_copy(finfo, flent); err = statep->ws_func(finfo, statep->ws_arg); kmem_free(finfo, sizeof (*finfo)); return (err); } /* * mac_link_flow_walk() * Invokes callback 'func' for all flows belonging to the specified link. */ int mac_link_flow_walk(datalink_id_t linkid, int (*func)(mac_flowinfo_t *, void *), void *arg) { mac_client_impl_t *mcip; mac_perim_handle_t mph; flow_walk_state_t state; dls_dl_handle_t dlh; dls_link_t *dlp; int err; err = mac_perim_enter_by_linkid(linkid, &mph); if (err != 0) return (err); err = dls_devnet_hold_link(linkid, &dlh, &dlp); if (err != 0) { mac_perim_exit(mph); return (err); } mcip = (mac_client_impl_t *)dlp->dl_mch; state.ws_func = func; state.ws_arg = arg; err = mac_flow_walk_nolock(mcip->mci_subflow_tab, mac_link_flow_walk_cb, &state); dls_devnet_rele_link(dlh, dlp); mac_perim_exit(mph); return (err); } /* * mac_link_flow_info() * Retrieves information about a specific flow. */ int mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo) { flow_entry_t *flent; int err; err = mac_flow_lookup_byname(flow_name, &flent); if (err != 0) return (err); mac_link_flowinfo_copy(finfo, flent); FLOW_USER_REFRELE(flent); return (0); } /* * Hash function macro that takes an Ethernet address and VLAN id as input. 
*/ #define HASH_ETHER_VID(a, v, s) \ ((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s)) /* * Generic layer-2 address hashing function that takes an address and address * length as input. This is the DJB hash function. */ static uint32_t flow_l2_addrhash(uint8_t *addr, size_t addrlen, size_t htsize) { uint32_t hash = 5381; size_t i; for (i = 0; i < addrlen; i++) hash = ((hash << 5) + hash) + addr[i]; return (hash % htsize); } #define PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end)) #define CHECK_AND_ADJUST_START_PTR(s, start) { \ if ((s)->fs_mp->b_wptr == (start)) { \ mblk_t *next = (s)->fs_mp->b_cont; \ if (next == NULL) \ return (EINVAL); \ \ (s)->fs_mp = next; \ (start) = next->b_rptr; \ } \ } /* ARGSUSED */ static boolean_t flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) { flow_l2info_t *l2 = &s->fs_l2info; flow_desc_t *fd = &flent->fe_flow_desc; return (l2->l2_vid == fd->fd_vid && bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0); } /* * Layer 2 hash function. * Must be paired with flow_l2_accept() within a set of flow_ops * because it assumes the dest address is already extracted. */ static uint32_t flow_l2_hash(flow_tab_t *ft, flow_state_t *s) { return (flow_l2_addrhash(s->fs_l2info.l2_daddr, ft->ft_mip->mi_type->mt_addr_length, ft->ft_size)); } /* * This is the generic layer 2 accept function. * It makes use of mac_header_info() to extract the header length, * sap, vlan ID and destination address. 
*/
static int
flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t			*l2info = &s->fs_l2info;
	struct ether_vlan_header	*vlanhdr;
	mac_header_info_t		hdrinfo;
	boolean_t			ether_media;
	int				rv;

	ether_media = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);

	/*
	 * An EINVAL from mac_header_info() is reported to our caller
	 * as ENOBUFS; any other error is passed through unchanged.
	 */
	rv = mac_header_info((mac_handle_t)ft->ft_mip, s->fs_mp, &hdrinfo);
	if (rv != 0)
		return ((rv == EINVAL) ? ENOBUFS : rv);

	l2info->l2_start = s->fs_mp->b_rptr;
	l2info->l2_daddr = (uint8_t *)hdrinfo.mhi_daddr;

	/*
	 * Anything that is not a VLAN-tagged ethernet frame, or any frame
	 * for which the caller asked us to ignore the tag, takes its sap
	 * and header size straight from the mac header info.
	 */
	if (!ether_media || hdrinfo.mhi_bindsap != ETHERTYPE_VLAN ||
	    (s->fs_flags & FLOW_IGNORE_VLAN) != 0) {
		l2info->l2_sap = hdrinfo.mhi_bindsap;
		l2info->l2_vid = 0;
		l2info->l2_hdrsize = (uint32_t)hdrinfo.mhi_hdrsize;
		return (0);
	}

	/*
	 * Tagged frame: the whole VLAN header must be present in this
	 * mblk before the encapsulated type and the tag are read.
	 */
	vlanhdr = (struct ether_vlan_header *)l2info->l2_start;
	if (PKT_TOO_SMALL(s, l2info->l2_start + sizeof (*vlanhdr)))
		return (ENOBUFS);

	l2info->l2_sap = ntohs(vlanhdr->ether_type);
	l2info->l2_vid = VLAN_ID(ntohs(vlanhdr->ether_tci));
	l2info->l2_hdrsize = sizeof (*vlanhdr);
	return (0);
}

/*
 * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
 * accept(). The notable difference is that dest address is now extracted
 * by hash() rather than by accept(). This saves a few memory references
 * for flow tables that do not care about mac addresses.
*/ static uint32_t flow_ether_hash(flow_tab_t *ft, flow_state_t *s) { flow_l2info_t *l2 = &s->fs_l2info; struct ether_vlan_header *evhp; evhp = (struct ether_vlan_header *)l2->l2_start; l2->l2_daddr = evhp->ether_dhost.ether_addr_octet; return (HASH_ETHER_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size)); } static uint32_t flow_ether_hash_fe(flow_tab_t *ft, flow_entry_t *flent) { flow_desc_t *fd = &flent->fe_flow_desc; ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0); return (HASH_ETHER_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size)); } /* ARGSUSED */ static int flow_ether_accept(flow_tab_t *ft, flow_state_t *s) { flow_l2info_t *l2 = &s->fs_l2info; struct ether_vlan_header *evhp; uint16_t sap; evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr; l2->l2_start = (uchar_t *)evhp; if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header))) return (ENOBUFS); if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN && ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) { if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp))) return (ENOBUFS); l2->l2_sap = ntohs(evhp->ether_type); l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci)); l2->l2_hdrsize = sizeof (struct ether_vlan_header); } else { l2->l2_sap = sap; l2->l2_vid = 0; l2->l2_hdrsize = sizeof (struct ether_header); } return (0); } /* * Validates a layer 2 flow entry. */ static int flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent) { flow_desc_t *fd = &flent->fe_flow_desc; /* * Dest address is mandatory, and 0 length addresses are not yet * supported. */ if ((fd->fd_mask & FLOW_LINK_DST) == 0 || fd->fd_mac_len == 0) return (EINVAL); if ((fd->fd_mask & FLOW_LINK_VID) != 0) { /* * VLAN flows are only supported over ethernet macs. */ if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER) return (EINVAL); if (fd->fd_vid == 0) return (EINVAL); } flent->fe_match = flow_l2_match; return (0); } /* * Calculates hash index of flow entry. 
*/ static uint32_t flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent) { flow_desc_t *fd = &flent->fe_flow_desc; ASSERT((fd->fd_mask & FLOW_LINK_VID) == 0 && fd->fd_vid == 0); return (flow_l2_addrhash(fd->fd_dst_mac, ft->ft_mip->mi_type->mt_addr_length, ft->ft_size)); } /* * This is used for duplicate flow checking. */ /* ARGSUSED */ static boolean_t flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) { flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0); return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac, fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid); } /* * Generic flow entry insertion function. * Used by flow tables that do not have ordering requirements. */ /* ARGSUSED */ static int flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp, flow_entry_t *flent) { ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); if (*headp != NULL) { ASSERT(flent->fe_next == NULL); flent->fe_next = *headp; } *headp = flent; return (0); } /* * IP version independent DSField matching function. */ /* ARGSUSED */ static boolean_t flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) { flow_l3info_t *l3info = &s->fs_l3info; flow_desc_t *fd = &flent->fe_flow_desc; switch (l3info->l3_version) { case IPV4_VERSION: { ipha_t *ipha = (ipha_t *)l3info->l3_start; return ((ipha->ipha_type_of_service & fd->fd_dsfield_mask) == fd->fd_dsfield); } case IPV6_VERSION: { ip6_t *ip6h = (ip6_t *)l3info->l3_start; return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) & fd->fd_dsfield_mask) == fd->fd_dsfield); } default: return (B_FALSE); } } /* * IP v4 and v6 address matching. * The netmask only needs to be applied on the packet but not on the * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets. 
*/ /* ARGSUSED */ static boolean_t flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) { flow_l3info_t *l3info = &s->fs_l3info; flow_desc_t *fd = &flent->fe_flow_desc; ipha_t *ipha = (ipha_t *)l3info->l3_start; in_addr_t addr; addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src); if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) { return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) == V4_PART_OF_V6(fd->fd_local_addr)); } return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) == V4_PART_OF_V6(fd->fd_remote_addr)); } /* ARGSUSED */ static boolean_t flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) { flow_l3info_t *l3info = &s->fs_l3info; flow_desc_t *fd = &flent->fe_flow_desc; ip6_t *ip6h = (ip6_t *)l3info->l3_start; in6_addr_t *addrp; addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src); if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) { return (V6_MASK_EQ(*addrp, fd->fd_local_netmask, fd->fd_local_addr)); } return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr)); } /* ARGSUSED */ static boolean_t flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) { flow_l3info_t *l3info = &s->fs_l3info; flow_desc_t *fd = &flent->fe_flow_desc; return (l3info->l3_protocol == fd->fd_protocol); } static uint32_t flow_ip_hash(flow_tab_t *ft, flow_state_t *s) { flow_l3info_t *l3info = &s->fs_l3info; flow_mask_t mask = ft->ft_mask; if ((mask & FLOW_IP_LOCAL) != 0) { l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0); } else if ((mask & FLOW_IP_REMOTE) != 0) { l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0); } else if ((mask & FLOW_IP_DSFIELD) != 0) { /* * DSField flents are arranged as a single list. */ return (0); } /* * IP addr flents are hashed into two lists, v4 or v6. */ ASSERT(ft->ft_size >= 2); return ((l3info->l3_version == IPV4_VERSION) ? 
0 : 1); } static uint32_t flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s) { flow_l3info_t *l3info = &s->fs_l3info; return (l3info->l3_protocol % ft->ft_size); } /* ARGSUSED */ static int flow_ip_accept(flow_tab_t *ft, flow_state_t *s) { flow_l2info_t *l2info = &s->fs_l2info; flow_l3info_t *l3info = &s->fs_l3info; uint16_t sap = l2info->l2_sap; uchar_t *l3_start; l3_start = l2info->l2_start + l2info->l2_hdrsize; /* * Adjust start pointer if we're at the end of an mblk. */ CHECK_AND_ADJUST_START_PTR(s, l3_start); l3info->l3_start = l3_start; if (!OK_32PTR(l3_start)) return (EINVAL); switch (sap) { case ETHERTYPE_IP: { ipha_t *ipha = (ipha_t *)l3_start; if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH)) return (ENOBUFS); l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha); l3info->l3_protocol = ipha->ipha_protocol; l3info->l3_version = IPV4_VERSION; l3info->l3_fragmented = IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags); break; } case ETHERTYPE_IPV6: { ip6_t *ip6h = (ip6_t *)l3_start; ip6_frag_t *frag = NULL; uint16_t ip6_hdrlen; uint8_t nexthdr; if (!mac_ip_hdr_length_v6(ip6h, s->fs_mp->b_wptr, &ip6_hdrlen, &nexthdr, &frag)) { return (ENOBUFS); } l3info->l3_hdrsize = ip6_hdrlen; l3info->l3_protocol = nexthdr; l3info->l3_version = IPV6_VERSION; l3info->l3_fragmented = (frag != NULL); break; } default: return (EINVAL); } return (0); } /* ARGSUSED */ static int flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent) { flow_desc_t *fd = &flent->fe_flow_desc; switch (fd->fd_protocol) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_SCTP: case IPPROTO_ICMP: case IPPROTO_ICMPV6: flent->fe_match = flow_ip_proto_match; return (0); default: return (EINVAL); } } /* ARGSUSED */ static int flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent) { flow_desc_t *fd = &flent->fe_flow_desc; flow_mask_t mask; uint8_t version; in6_addr_t *addr, *netmask; /* * DSField does not require a IP version. 
*/ if (fd->fd_mask == FLOW_IP_DSFIELD) { if (fd->fd_dsfield_mask == 0) return (EINVAL); flent->fe_match = flow_ip_dsfield_match; return (0); } /* * IP addresses must come with a version to avoid ambiguity. */ if ((fd->fd_mask & FLOW_IP_VERSION) == 0) return (EINVAL); version = fd->fd_ipversion; if (version != IPV4_VERSION && version != IPV6_VERSION) return (EINVAL); mask = fd->fd_mask & ~FLOW_IP_VERSION; switch (mask) { case FLOW_IP_LOCAL: addr = &fd->fd_local_addr; netmask = &fd->fd_local_netmask; break; case FLOW_IP_REMOTE: addr = &fd->fd_remote_addr; netmask = &fd->fd_remote_netmask; break; default: return (EINVAL); } /* * Apply netmask onto specified address. */ V6_MASK_COPY(*addr, *netmask, *addr); if (version == IPV4_VERSION) { ipaddr_t v4addr = V4_PART_OF_V6((*addr)); ipaddr_t v4mask = V4_PART_OF_V6((*netmask)); if (v4addr == 0 || v4mask == 0) return (EINVAL); flent->fe_match = flow_ip_v4_match; } else { if (IN6_IS_ADDR_UNSPECIFIED(addr) || IN6_IS_ADDR_UNSPECIFIED(netmask)) return (EINVAL); flent->fe_match = flow_ip_v6_match; } return (0); } static uint32_t flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent) { flow_desc_t *fd = &flent->fe_flow_desc; return (fd->fd_protocol % ft->ft_size); } static uint32_t flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent) { flow_desc_t *fd = &flent->fe_flow_desc; /* * DSField flents are arranged as a single list. */ if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0) return (0); /* * IP addr flents are hashed into two lists, v4 or v6. */ ASSERT(ft->ft_size >= 2); return ((fd->fd_ipversion == IPV4_VERSION) ? 
0 : 1); } /* ARGSUSED */ static boolean_t flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) { flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; return (fd1->fd_protocol == fd2->fd_protocol); } /* ARGSUSED */ static boolean_t flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) { flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; in6_addr_t *a1, *m1, *a2, *m2; ASSERT(fd1->fd_mask == fd2->fd_mask); if (fd1->fd_mask == FLOW_IP_DSFIELD) { return (fd1->fd_dsfield == fd2->fd_dsfield && fd1->fd_dsfield_mask == fd2->fd_dsfield_mask); } /* * flow_ip_accept_fe() already validated the version. */ ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0); if (fd1->fd_ipversion != fd2->fd_ipversion) return (B_FALSE); switch (fd1->fd_mask & ~FLOW_IP_VERSION) { case FLOW_IP_LOCAL: a1 = &fd1->fd_local_addr; m1 = &fd1->fd_local_netmask; a2 = &fd2->fd_local_addr; m2 = &fd2->fd_local_netmask; break; case FLOW_IP_REMOTE: a1 = &fd1->fd_remote_addr; m1 = &fd1->fd_remote_netmask; a2 = &fd2->fd_remote_addr; m2 = &fd2->fd_remote_netmask; break; default: /* * This is unreachable given the checks in * flow_ip_accept_fe(). */ return (B_FALSE); } if (fd1->fd_ipversion == IPV4_VERSION) { return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) && V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2))); } else { return (IN6_ARE_ADDR_EQUAL(a1, a2) && IN6_ARE_ADDR_EQUAL(m1, m2)); } } static int flow_ip_mask2plen(in6_addr_t *v6mask) { int bits; int plen = IPV6_ABITS; int i; for (i = 3; i >= 0; i--) { if (v6mask->s6_addr32[i] == 0) { plen -= 32; continue; } bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1; if (bits == 0) break; plen -= bits; } return (plen); } /* ARGSUSED */ static int flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp, flow_entry_t *flent) { flow_entry_t **p = headp; flow_desc_t *fd0, *fd; in6_addr_t *m0, *m; int plen0, plen; ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); /* * No special ordering needed for dsfield. 
*/ fd0 = &flent->fe_flow_desc; if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) { if (*p != NULL) { ASSERT(flent->fe_next == NULL); flent->fe_next = *p; } *p = flent; return (0); } /* * IP address flows are arranged in descending prefix length order. */ m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ? &fd0->fd_local_netmask : &fd0->fd_remote_netmask; plen0 = flow_ip_mask2plen(m0); ASSERT(plen0 != 0); for (; *p != NULL; p = &(*p)->fe_next) { fd = &(*p)->fe_flow_desc; /* * Normally a dsfield flent shouldn't end up on the same * list as an IP address because flow tables are (for now) * disjoint. If we decide to support both IP and dsfield * in the same table in the future, this check will allow * for that. */ if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0) continue; /* * We also allow for the mixing of local and remote address * flents within one list. */ m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ? &fd->fd_local_netmask : &fd->fd_remote_netmask; plen = flow_ip_mask2plen(m); if (plen <= plen0) break; } if (*p != NULL) { ASSERT(flent->fe_next == NULL); flent->fe_next = *p; } *p = flent; return (0); } /* * Transport layer protocol and port matching functions. */ /* ARGSUSED */ static boolean_t flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) { flow_l3info_t *l3info = &s->fs_l3info; flow_l4info_t *l4info = &s->fs_l4info; flow_desc_t *fd = &flent->fe_flow_desc; return (fd->fd_protocol == l3info->l3_protocol && fd->fd_local_port == l4info->l4_hash_port); } /* ARGSUSED */ static boolean_t flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) { flow_l3info_t *l3info = &s->fs_l3info; flow_l4info_t *l4info = &s->fs_l4info; flow_desc_t *fd = &flent->fe_flow_desc; return (fd->fd_protocol == l3info->l3_protocol && fd->fd_remote_port == l4info->l4_hash_port); } /* * Transport hash function. * Since we only support either local or remote port flows, * we only need to extract one of the ports to be used for * matching. 
*/ static uint32_t flow_transport_hash(flow_tab_t *ft, flow_state_t *s) { flow_l3info_t *l3info = &s->fs_l3info; flow_l4info_t *l4info = &s->fs_l4info; uint8_t proto = l3info->l3_protocol; boolean_t dst_or_src; if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) { dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0); } else { dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0); } l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port : l4info->l4_src_port; return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size); } /* * Unlike other accept() functions above, we do not need to get the header * size because this is our highest layer so far. If we want to do support * other higher layer protocols, we would need to save the l4_hdrsize * in the code below. */ /* ARGSUSED */ static int flow_transport_accept(flow_tab_t *ft, flow_state_t *s) { flow_l3info_t *l3info = &s->fs_l3info; flow_l4info_t *l4info = &s->fs_l4info; uint8_t proto = l3info->l3_protocol; uchar_t *l4_start; l4_start = l3info->l3_start + l3info->l3_hdrsize; /* * Adjust start pointer if we're at the end of an mblk. 
*/ CHECK_AND_ADJUST_START_PTR(s, l4_start); l4info->l4_start = l4_start; if (!OK_32PTR(l4_start)) return (EINVAL); if (l3info->l3_fragmented == B_TRUE) return (EINVAL); switch (proto) { case IPPROTO_TCP: { struct tcphdr *tcph = (struct tcphdr *)l4_start; if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph))) return (ENOBUFS); l4info->l4_src_port = tcph->th_sport; l4info->l4_dst_port = tcph->th_dport; break; } case IPPROTO_UDP: { struct udphdr *udph = (struct udphdr *)l4_start; if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph))) return (ENOBUFS); l4info->l4_src_port = udph->uh_sport; l4info->l4_dst_port = udph->uh_dport; break; } case IPPROTO_SCTP: { sctp_hdr_t *sctph = (sctp_hdr_t *)l4_start; if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph))) return (ENOBUFS); l4info->l4_src_port = sctph->sh_sport; l4info->l4_dst_port = sctph->sh_dport; break; } default: return (EINVAL); } return (0); } /* * Validates transport flow entry. * The protocol field must be present. */ /* ARGSUSED */ static int flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent) { flow_desc_t *fd = &flent->fe_flow_desc; flow_mask_t mask = fd->fd_mask; if ((mask & FLOW_IP_PROTOCOL) == 0) return (EINVAL); switch (fd->fd_protocol) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_SCTP: break; default: return (EINVAL); } switch (mask & ~FLOW_IP_PROTOCOL) { case FLOW_ULP_PORT_LOCAL: if (fd->fd_local_port == 0) return (EINVAL); flent->fe_match = flow_transport_lport_match; break; case FLOW_ULP_PORT_REMOTE: if (fd->fd_remote_port == 0) return (EINVAL); flent->fe_match = flow_transport_rport_match; break; case 0: /* * transport-only flows conflicts with our table type. */ return (EOPNOTSUPP); default: return (EINVAL); } return (0); } static uint32_t flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent) { flow_desc_t *fd = &flent->fe_flow_desc; uint16_t port = 0; port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ? 
fd->fd_local_port : fd->fd_remote_port; return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size); } /* ARGSUSED */ static boolean_t flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) { flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; if (fd1->fd_protocol != fd2->fd_protocol) return (B_FALSE); if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) return (fd1->fd_local_port == fd2->fd_local_port); if ((fd1->fd_mask & FLOW_ULP_PORT_REMOTE) != 0) return (fd1->fd_remote_port == fd2->fd_remote_port); return (B_TRUE); } static flow_ops_t flow_l2_ops = { flow_l2_accept_fe, flow_l2_hash_fe, flow_l2_match_fe, flow_generic_insert_fe, flow_l2_hash, {flow_l2_accept} }; static flow_ops_t flow_ip_ops = { flow_ip_accept_fe, flow_ip_hash_fe, flow_ip_match_fe, flow_ip_insert_fe, flow_ip_hash, {flow_l2_accept, flow_ip_accept} }; static flow_ops_t flow_ip_proto_ops = { flow_ip_proto_accept_fe, flow_ip_proto_hash_fe, flow_ip_proto_match_fe, flow_generic_insert_fe, flow_ip_proto_hash, {flow_l2_accept, flow_ip_accept} }; static flow_ops_t flow_transport_ops = { flow_transport_accept_fe, flow_transport_hash_fe, flow_transport_match_fe, flow_generic_insert_fe, flow_transport_hash, {flow_l2_accept, flow_ip_accept, flow_transport_accept} }; static flow_tab_info_t flow_tab_info_list[] = { {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2}, {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2}, {&flow_ip_ops, FLOW_IP_DSFIELD, 1}, {&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256}, {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024}, {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_REMOTE, 1024} }; #define FLOW_MAX_TAB_INFO \ ((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t)) static flow_tab_info_t * mac_flow_tab_info_get(flow_mask_t mask) { int i; for (i = 0; i < FLOW_MAX_TAB_INFO; i++) { if (mask == flow_tab_info_list[i].fti_mask) return (&flow_tab_info_list[i]); } return (NULL); }