io/mac/mac_flow.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2018 Joyent, Inc.
 */

#include <sys/strsun.h>
#include <sys/sdt.h>
#include <sys/mac.h>
#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_stat.h>
#include <sys/dls.h>
#include <sys/dls_impl.h>
#include <sys/mac_soft_ring.h>
#include <sys/ethernet.h>
#include <sys/cpupart.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/vlan.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>

typedef struct flow_stats_s {
	uint64_t	fs_obytes;
	uint64_t	fs_opackets;
	uint64_t	fs_oerrors;
	uint64_t	fs_ibytes;
	uint64_t	fs_ipackets;
	uint64_t	fs_ierrors;
} flow_stats_t;


/* global flow table, will be a per exclusive-zone table later */
static mod_hash_t	*flow_hash;
static krwlock_t	flow_tab_lock;

static kmem_cache_t	*flow_cache;
static kmem_cache_t	*flow_tab_cache;
static flow_ops_t	flow_l2_ops;

typedef struct {
	const char	*fs_name;
	uint_t		fs_offset;
} flow_stats_info_t;

#define	FS_OFF(f)	(offsetof(flow_stats_t, f))
static flow_stats_info_t flow_stats_list[] = {
	{"rbytes",	FS_OFF(fs_ibytes)},
	{"ipackets",	FS_OFF(fs_ipackets)},
	{"ierrors",	FS_OFF(fs_ierrors)},
	{"obytes",	FS_OFF(fs_obytes)},
	{"opackets",	FS_OFF(fs_opackets)},
	{"oerrors",	FS_OFF(fs_oerrors)}
};
#define	FS_SIZE		(sizeof (flow_stats_list) / sizeof (flow_stats_info_t))

/*
 * Checks whether a flow mask is legal.
 */
static flow_tab_info_t	*mac_flow_tab_info_get(flow_mask_t);

static void
flow_stat_init(kstat_named_t *knp)
{
	int	i;

	for (i = 0; i < FS_SIZE; i++, knp++) {
		kstat_named_init(knp, flow_stats_list[i].fs_name,
		    KSTAT_DATA_UINT64);
	}
}

static int
flow_stat_update(kstat_t *ksp, int rw)
{
	flow_entry_t		*fep = ksp->ks_private;
	kstat_named_t		*knp = ksp->ks_data;
	uint64_t		*statp;
	int			i;
	mac_rx_stats_t		*mac_rx_stat;
	mac_tx_stats_t		*mac_tx_stat;
	flow_stats_t		flow_stats;
	mac_soft_ring_set_t	*mac_srs;

	if (rw != KSTAT_READ)
		return (EACCES);

	bzero(&flow_stats, sizeof (flow_stats_t));

	for (i = 0; i < fep->fe_rx_srs_cnt; i++) {
		mac_srs = (mac_soft_ring_set_t *)fep->fe_rx_srs[i];
		if (mac_srs == NULL) 		/* Multicast flow */
			break;
		mac_rx_stat = &mac_srs->srs_rx.sr_stat;

		flow_stats.fs_ibytes += mac_rx_stat->mrs_intrbytes +
		    mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes;

		flow_stats.fs_ipackets += mac_rx_stat->mrs_intrcnt +
		    mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;

		flow_stats.fs_ierrors += mac_rx_stat->mrs_ierrors;
	}

	mac_srs = (mac_soft_ring_set_t *)fep->fe_tx_srs;
	if (mac_srs == NULL) 		/* Multicast flow */
		goto done;
	mac_tx_stat = &mac_srs->srs_tx.st_stat;

	flow_stats.fs_obytes = mac_tx_stat->mts_obytes;
	flow_stats.fs_opackets = mac_tx_stat->mts_opackets;
	flow_stats.fs_oerrors = mac_tx_stat->mts_oerrors;

done:
	for (i = 0; i < FS_SIZE; i++, knp++) {
		statp = (uint64_t *)
		    ((uchar_t *)&flow_stats + flow_stats_list[i].fs_offset);
		knp->value.ui64 = *statp;
	}
	return (0);
}

static void
flow_stat_create(flow_entry_t *fep)
{
	kstat_t		*ksp;
	kstat_named_t	*knp;
	uint_t		nstats = FS_SIZE;

	/*
	 * Fow now, flow entries are only manipulated and visible from the
	 * global zone.
	 */
	ksp = kstat_create_zone("unix", 0, (char *)fep->fe_flow_name, "flow",
	    KSTAT_TYPE_NAMED, nstats, 0, GLOBAL_ZONEID);
	if (ksp == NULL)
		return;

	ksp->ks_update = flow_stat_update;
	ksp->ks_private = fep;
	fep->fe_ksp = ksp;

	knp = (kstat_named_t *)ksp->ks_data;
	flow_stat_init(knp);
	kstat_install(ksp);
}

void
flow_stat_destroy(flow_entry_t *fep)
{
	if (fep->fe_ksp != NULL) {
		kstat_delete(fep->fe_ksp);
		fep->fe_ksp = NULL;
	}
}

/*
 * Initialize the flow table
 */
void
mac_flow_init()
{
	flow_cache = kmem_cache_create("flow_entry_cache",
	    sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	flow_tab_cache = kmem_cache_create("flow_tab_cache",
	    sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	flow_hash = mod_hash_create_extended("flow_hash",
	    100, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
	rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
}

/*
 * Cleanup and release the flow table
 */
void
mac_flow_fini()
{
	kmem_cache_destroy(flow_cache);
	kmem_cache_destroy(flow_tab_cache);
	mod_hash_destroy_hash(flow_hash);
	rw_destroy(&flow_tab_lock);
}

/*
 * mac_create_flow(): create a flow_entry_t.
 */
int
mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
    void *client_cookie, uint_t type, flow_entry_t **flentp)
{
	flow_entry_t		*flent = *flentp;
	int			err = 0;

	if (mrp != NULL) {
		err = mac_validate_props(NULL, mrp);
		if (err != 0)
			return (err);
	}

	if (flent == NULL) {
		flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
		bzero(flent, sizeof (*flent));
		mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);

		/* Initialize the receiver function to a safe routine */
		flent->fe_cb_fn = (flow_fn_t)mac_rx_def;
		flent->fe_index = -1;
	}
	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);

	/* This is an initial flow, will be configured later */
	if (fd == NULL) {
		*flentp = flent;
		return (0);
	}

	flent->fe_client_cookie = client_cookie;
	flent->fe_type = type;

	/* Save flow desc */
	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));

	if (mrp != NULL) {
		/*
		 * We have already set fe_resource_props for a Link.
		 */
		if (type & FLOW_USER) {
			bcopy(mrp, &flent->fe_resource_props,
			    sizeof (mac_resource_props_t));
		}
		/*
		 * The effective resource list should reflect the priority
		 * that we set implicitly.
		 */
		if (!(mrp->mrp_mask & MRP_PRIORITY))
			mrp->mrp_mask |= MRP_PRIORITY;
		if (type & FLOW_USER)
			mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
		else
			mrp->mrp_priority = MPL_LINK_DEFAULT;
		bzero(mrp->mrp_pool, MAXPATHLEN);
		bzero(&mrp->mrp_cpus, sizeof (mac_cpus_t));
		bcopy(mrp, &flent->fe_effective_props,
		    sizeof (mac_resource_props_t));
	}
	flow_stat_create(flent);

	*flentp = flent;
	return (0);
}

/*
 * Validate flow entry and add it to a flow table.
 */
int
mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_entry_t	**headp, **p;
	flow_ops_t	*ops = &ft->ft_ops;
	flow_mask_t	mask;
	uint32_t	index;
	int		err;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * Check for invalid bits in mask.
	 */
	mask = flent->fe_flow_desc.fd_mask;
	if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
		return (EOPNOTSUPP);

	/*
	 * Validate flent.
	 */
	if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
		DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
		    flow_entry_t *, flent, int, err);
		return (err);
	}

	/*
	 * Flent is valid. now calculate hash and insert it
	 * into hash table.
	 */
	index = ops->fo_hash_fe(ft, flent);

	/*
	 * We do not need a lock up until now because we were
	 * not accessing the flow table.
	 */
	rw_enter(&ft->ft_lock, RW_WRITER);
	headp = &ft->ft_table[index];

	/*
	 * Check for duplicate flow.
	 */
	for (p = headp; *p != NULL; p = &(*p)->fe_next) {
		if ((*p)->fe_flow_desc.fd_mask !=
		    flent->fe_flow_desc.fd_mask)
			continue;

		if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
			rw_exit(&ft->ft_lock);
			DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
			    flow_entry_t *, flent, int, err);
			return (EALREADY);
		}
	}

	/*
	 * Insert flow to hash list.
	 */
	err = ops->fo_insert_fe(ft, headp, flent);
	if (err != 0) {
		rw_exit(&ft->ft_lock);
		DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
		    flow_entry_t *, flent, int, err);
		return (err);
	}

	/*
	 * Save the hash index so it can be used by mac_flow_remove().
	 */
	flent->fe_index = (int)index;

	/*
	 * Save the flow tab back reference.
	 */
	flent->fe_flow_tab = ft;
	FLOW_MARK(flent, FE_FLOW_TAB);
	ft->ft_flow_count++;
	rw_exit(&ft->ft_lock);
	return (0);
}

/*
 * Remove a flow from a mac client's subflow table
 */
void
mac_flow_rem_subflow(flow_entry_t *flent)
{
	flow_tab_t		*ft = flent->fe_flow_tab;
	mac_client_impl_t	*mcip = ft->ft_mcip;
	mac_handle_t		mh = (mac_handle_t)ft->ft_mip;

	ASSERT(MAC_PERIM_HELD(mh));

	mac_flow_remove(ft, flent, B_FALSE);
	if (flent->fe_mcip == NULL) {
		/*
		 * The interface is not yet plumbed and mac_client_flow_add
		 * was not done.
		 */
		if (FLOW_TAB_EMPTY(ft)) {
			mac_flow_tab_destroy(ft);
			mcip->mci_subflow_tab = NULL;
		}
	} else {
		mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
		mac_link_flow_clean((mac_client_handle_t)mcip, flent);
	}
	mac_fastpath_enable(mh);
}

/*
 * Add a flow to a mac client's subflow table and instantiate the flow
 * in the mac by creating the associated SRSs etc.
 */
int
mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
    boolean_t instantiate_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_handle_t		mh = (mac_handle_t)mcip->mci_mip;
	flow_tab_info_t		*ftinfo;
	flow_mask_t		mask;
	flow_tab_t		*ft;
	int			err;
	boolean_t		ft_created = B_FALSE;

	ASSERT(MAC_PERIM_HELD(mh));

	if ((err = mac_fastpath_disable(mh)) != 0)
		return (err);

	/*
	 * If the subflow table exists already just add the new subflow
	 * to the existing table, else we create a new subflow table below.
	 */
	ft = mcip->mci_subflow_tab;
	if (ft == NULL) {
		mask = flent->fe_flow_desc.fd_mask;
		/*
		 * Try to create a new table and then add the subflow to the
		 * newly created subflow table
		 */
		if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) {
			mac_fastpath_enable(mh);
			return (EOPNOTSUPP);
		}

		mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
		    mcip->mci_mip, &ft);
		ft_created = B_TRUE;
	}

	err = mac_flow_add(ft, flent);
	if (err != 0) {
		if (ft_created)
			mac_flow_tab_destroy(ft);
		mac_fastpath_enable(mh);
		return (err);
	}

	if (instantiate_flow) {
		/* Now activate the flow by creating its SRSs */
		ASSERT(MCIP_DATAPATH_SETUP(mcip));
		err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
		if (err != 0) {
			mac_flow_remove(ft, flent, B_FALSE);
			if (ft_created)
				mac_flow_tab_destroy(ft);
			mac_fastpath_enable(mh);
			return (err);
		}
	} else {
		FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	}
	if (ft_created) {
		ASSERT(mcip->mci_subflow_tab == NULL);
		ft->ft_mcip = mcip;
		mcip->mci_subflow_tab = ft;
		if (instantiate_flow)
			mac_client_update_classifier(mcip, B_TRUE);
	}
	return (0);
}

/*
 * Remove flow entry from flow table.
 */
void
mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
{
	flow_entry_t	**fp;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
	if (!(flent->fe_flags & FE_FLOW_TAB))
		return;

	rw_enter(&ft->ft_lock, RW_WRITER);
	/*
	 * If this is a permanent removal from the flow table, mark it
	 * CONDEMNED to prevent future references. If this is a temporary
	 * removal from the table, say to update the flow descriptor then
	 * we don't mark it CONDEMNED
	 */
	if (!temp)
		FLOW_MARK(flent, FE_CONDEMNED);
	/*
	 * Locate the specified flent.
	 */
	fp = &ft->ft_table[flent->fe_index];
	while (*fp != flent)
		fp = &(*fp)->fe_next;

	/*
	 * The flent must exist. Otherwise it's a bug.
	 */
	ASSERT(fp != NULL);
	*fp = flent->fe_next;
	flent->fe_next = NULL;

	/*
	 * Reset fe_index to -1 so any attempt to call mac_flow_remove()
	 * on a flent that is supposed to be in the table (FE_FLOW_TAB)
	 * will panic.
	 */
	flent->fe_index = -1;
	FLOW_UNMARK(flent, FE_FLOW_TAB);
	ft->ft_flow_count--;
	rw_exit(&ft->ft_lock);
}

/*
 * This is the flow lookup routine used by the mac sw classifier engine.
 */
int
mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
{
	flow_state_t	s;
	flow_entry_t	*flent;
	flow_ops_t	*ops = &ft->ft_ops;
	boolean_t	retried = B_FALSE;
	int		i, err;

	s.fs_flags = flags;
retry:
	s.fs_mp = mp;

	/*
	 * Walk the list of predeclared accept functions.
	 * Each of these would accumulate enough state to allow the next
	 * accept routine to make progress.
	 */
	for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
		if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
			mblk_t	*last;

			/*
			 * ENOBUFS indicates that the mp could be too short
			 * and may need a pullup.
			 */
			if (err != ENOBUFS || retried)
				return (err);

			/*
			 * The pullup is done on the last processed mblk, not
			 * the starting one. pullup is not done if the mblk
			 * has references or if b_cont is NULL.
			 */
			last = s.fs_mp;
			if (DB_REF(last) > 1 || last->b_cont == NULL ||
			    pullupmsg(last, -1) == 0)
				return (EINVAL);

			retried = B_TRUE;
			DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
			    flow_state_t *, &s);
			goto retry;
		}
	}

	/*
	 * The packet is considered sane. We may now attempt to
	 * find the corresponding flent.
	 */
	rw_enter(&ft->ft_lock, RW_READER);
	flent = ft->ft_table[ops->fo_hash(ft, &s)];
	for (; flent != NULL; flent = flent->fe_next) {
		if (flent->fe_match(ft, flent, &s)) {
			FLOW_TRY_REFHOLD(flent, err);
			if (err != 0)
				continue;
			*flentp = flent;
			rw_exit(&ft->ft_lock);
			return (0);
		}
	}
	rw_exit(&ft->ft_lock);
	return (ENOENT);
}

/*
 * Walk flow table.
 * The caller is assumed to have proper perimeter protection.
 */
int
mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
    void *arg)
{
	int		err, i, cnt = 0;
	flow_entry_t	*flent;

	if (ft == NULL)
		return (0);

	for (i = 0; i < ft->ft_size; i++) {
		for (flent = ft->ft_table[i]; flent != NULL;
		    flent = flent->fe_next) {
			cnt++;
			err = (*fn)(flent, arg);
			if (err != 0)
				return (err);
		}
	}
	VERIFY(cnt == ft->ft_flow_count);
	return (0);
}

/*
 * Same as the above except a mutex is used for protection here.
 */
int
mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
    void *arg)
{
	int		err;

	if (ft == NULL)
		return (0);

	rw_enter(&ft->ft_lock, RW_WRITER);
	err = mac_flow_walk_nolock(ft, fn, arg);
	rw_exit(&ft->ft_lock);
	return (err);
}

static boolean_t	mac_flow_clean(flow_entry_t *);

/*
 * Destroy a flow entry. Called when the last reference on a flow is released.
 */
void
mac_flow_destroy(flow_entry_t *flent)
{
	ASSERT(flent->fe_refcnt == 0);

	if ((flent->fe_type & FLOW_USER) != 0) {
		ASSERT(mac_flow_clean(flent));
	} else {
		mac_flow_cleanup(flent);
	}
	mac_misc_stat_delete(flent);
	mutex_destroy(&flent->fe_lock);
	cv_destroy(&flent->fe_cv);
	flow_stat_destroy(flent);
	kmem_cache_free(flow_cache, flent);
}

/*
 * XXX eric
 * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
 * mac_link_flow_modify() should really be moved/reworked into the
 * two functions below. This would consolidate all the mac property
 * checking in one place. I'm leaving this alone for now since it's
 * out of scope of the new flows work.
 */
/* ARGSUSED */
uint32_t
mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
{
	uint32_t		changed_mask = 0;
	mac_resource_props_t	*fmrp = &flent->fe_effective_props;
	int			i;

	if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
	    (!(fmrp->mrp_mask & MRP_MAXBW) ||
	    (fmrp->mrp_maxbw != mrp->mrp_maxbw))) {
		changed_mask |= MRP_MAXBW;
		if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
			fmrp->mrp_mask &= ~MRP_MAXBW;
			fmrp->mrp_maxbw = 0;
		} else {
			fmrp->mrp_mask |= MRP_MAXBW;
			fmrp->mrp_maxbw = mrp->mrp_maxbw;
		}
	}

	if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
		if (fmrp->mrp_priority != mrp->mrp_priority)
			changed_mask |= MRP_PRIORITY;
		if (mrp->mrp_priority == MPL_RESET) {
			fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
			fmrp->mrp_mask &= ~MRP_PRIORITY;
		} else {
			fmrp->mrp_priority = mrp->mrp_priority;
			fmrp->mrp_mask |= MRP_PRIORITY;
		}
	}

	/* modify fanout */
	if ((mrp->mrp_mask & MRP_CPUS) != 0) {
		if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
		    (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
			for (i = 0; i < mrp->mrp_ncpus; i++) {
				if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
					break;
			}
			if (i == mrp->mrp_ncpus) {
				/*
				 * The new set of cpus passed is exactly
				 * the same as the existing set.
				 */
				return (changed_mask);
			}
		}
		changed_mask |= MRP_CPUS;
		MAC_COPY_CPUS(mrp, fmrp);
	}

	/*
	 * Modify the rings property.
	 */
	if (mrp->mrp_mask & MRP_RX_RINGS || mrp->mrp_mask & MRP_TX_RINGS)
		mac_set_rings_effective(flent->fe_mcip);

	if ((mrp->mrp_mask & MRP_POOL) != 0) {
		if (strcmp(fmrp->mrp_pool, mrp->mrp_pool) != 0)
			changed_mask |= MRP_POOL;
		if (strlen(mrp->mrp_pool) == 0)
			fmrp->mrp_mask &= ~MRP_POOL;
		else
			fmrp->mrp_mask |= MRP_POOL;
		(void) strncpy(fmrp->mrp_pool, mrp->mrp_pool, MAXPATHLEN);
	}
	return (changed_mask);
}

void
mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
{
	uint32_t changed_mask;
	mac_client_impl_t *mcip = flent->fe_mcip;
	mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
	mac_resource_props_t *emrp = MCIP_EFFECTIVE_PROPS(mcip);
	cpupart_t *cpupart = NULL;
	boolean_t use_default = B_FALSE;

	ASSERT(flent != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	rw_enter(&ft->ft_lock, RW_WRITER);

	/* Update the cached values inside the subflow entry */
	changed_mask = mac_flow_modify_props(flent, mrp);
	rw_exit(&ft->ft_lock);
	/*
	 * Push the changed parameters to the scheduling code in the
	 * SRS's, to take effect right away.
	 */
	if (changed_mask & MRP_MAXBW) {
		mac_srs_update_bwlimit(flent, mrp);
		/*
		 * If bandwidth is changed, we may have to change
		 * the number of soft ring to be used for fanout.
		 * Call mac_flow_update_fanout() if MAC_BIND_CPU
		 * is not set and there is no user supplied cpu
		 * info. This applies only to link at this time.
		 */
		if (!(flent->fe_type & FLOW_USER) &&
		    !(changed_mask & MRP_CPUS) &&
		    !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
			mac_fanout_setup(mcip, flent, mcip_mrp,
			    mac_rx_deliver, mcip, NULL, NULL);
		}
	}
	if (mrp->mrp_mask & MRP_PRIORITY)
		mac_flow_update_priority(mcip, flent);

	if (changed_mask & MRP_CPUS)
		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
		    NULL);

	if (mrp->mrp_mask & MRP_POOL) {
		pool_lock();
		cpupart = mac_pset_find(mrp, &use_default);
		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
		    cpupart);
		mac_set_pool_effective(use_default, cpupart, mrp, emrp);
		pool_unlock();
	}
}

/*
 * This function waits for a certain condition to be met and is generally
 * used before a destructive or quiescing operation.
 */
void
mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
{
	mutex_enter(&flent->fe_lock);
	flent->fe_flags |= FE_WAITER;

	switch (event) {
	case FLOW_DRIVER_UPCALL:
		/*
		 * We want to make sure the driver upcalls have finished before
		 * we signal the Rx SRS worker to quit.
		 */
		while (flent->fe_refcnt != 1)
			cv_wait(&flent->fe_cv, &flent->fe_lock);
		break;

	case FLOW_USER_REF:
		/*
		 * Wait for the fe_user_refcnt to drop to 0. The flow has
		 * been removed from the global flow hash.
		 */
		ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH));
		while (flent->fe_user_refcnt != 0)
			cv_wait(&flent->fe_cv, &flent->fe_lock);
		break;

	default:
		ASSERT(0);
	}

	flent->fe_flags &= ~FE_WAITER;
	mutex_exit(&flent->fe_lock);
}

static boolean_t
mac_flow_clean(flow_entry_t *flent)
{
	ASSERT(flent->fe_next == NULL);
	ASSERT(flent->fe_tx_srs == NULL);
	ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL);
	ASSERT(flent->fe_mbg == NULL);

	return (B_TRUE);
}

void
mac_flow_cleanup(flow_entry_t *flent)
{
	if ((flent->fe_type & FLOW_USER) == 0) {
		ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) ||
		    (flent->fe_mbg != NULL && flent->fe_mcip == NULL));
		ASSERT(flent->fe_refcnt == 0);
	} else {
		ASSERT(flent->fe_refcnt == 1);
	}

	if (flent->fe_mbg != NULL) {
		ASSERT(flent->fe_tx_srs == NULL);
		/* This is a multicast or broadcast flow entry */
		mac_bcast_grp_free(flent->fe_mbg);
		flent->fe_mbg = NULL;
	}

	if (flent->fe_tx_srs != NULL) {
		ASSERT(flent->fe_mbg == NULL);
		mac_srs_free(flent->fe_tx_srs);
		flent->fe_tx_srs = NULL;
	}

	/*
	 * In the normal case fe_rx_srs_cnt is 1. However in the error case
	 * when mac_unicast_add fails we may not have set up any SRS
	 * in which case fe_rx_srs_cnt will be zero.
	 */
	if (flent->fe_rx_srs_cnt != 0) {
		ASSERT(flent->fe_rx_srs_cnt == 1);
		mac_srs_free(flent->fe_rx_srs[0]);
		flent->fe_rx_srs[0] = NULL;
		flent->fe_rx_srs_cnt = 0;
	}
	ASSERT(flent->fe_rx_srs[0] == NULL);
}

void
mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
{
	/*
	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
	 * Updates to the fe_flow_desc happen under the fe_lock
	 * after removing the flent from the flow table
	 */
	mutex_enter(&flent->fe_lock);
	bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
	mutex_exit(&flent->fe_lock);
}

/*
 * Update a field of a flow entry. The mac perimeter ensures that
 * this is the only thread doing a modify operation on this mac end point.
 * So the flow table can't change or disappear. The ft_lock protects access
 * to the flow entry, and holding the lock ensures that there isn't any thread
 * accessing the flow entry or attempting a flow table lookup. However
 * data threads that are using the flow entry based on the old descriptor
 * will continue to use the flow entry. If strong coherence is required
 * then the flow will have to be quiesced before the descriptor can be
 * changed.
 */
void
mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
{
	flow_tab_t	*ft = flent->fe_flow_tab;
	flow_desc_t	old_desc;
	int		err;

	if (ft == NULL) {
		/*
		 * The flow hasn't yet been inserted into the table,
		 * so only the caller knows about this flow, however for
		 * uniformity we grab the fe_lock here.
		 */
		mutex_enter(&flent->fe_lock);
		bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
		mutex_exit(&flent->fe_lock);
	}

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * Need to remove the flow entry from the table and reinsert it,
	 * into a potentially diference hash line. The hash depends on
	 * the new descriptor fields. However access to fe_desc itself
	 * is always under the fe_lock. This helps log and stat functions
	 * see a self-consistent fe_flow_desc.
	 */
	mac_flow_remove(ft, flent, B_TRUE);
	old_desc = flent->fe_flow_desc;

	mutex_enter(&flent->fe_lock);
	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
	mutex_exit(&flent->fe_lock);

	if (mac_flow_add(ft, flent) != 0) {
		/*
		 * The add failed say due to an invalid flow descriptor.
		 * Undo the update
		 */
		flent->fe_flow_desc = old_desc;
		err = mac_flow_add(ft, flent);
		ASSERT(err == 0);
	}
}

void
mac_flow_set_name(flow_entry_t *flent, const char *name)
{
	flow_tab_t	*ft = flent->fe_flow_tab;

	if (ft == NULL) {
		/*
		 *  The flow hasn't yet been inserted into the table,
		 * so only the caller knows about this flow
		 */
		(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
	} else {
		ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
	}

	mutex_enter(&flent->fe_lock);
	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
	mutex_exit(&flent->fe_lock);
}

/*
 * Return the client-private cookie that was associated with
 * the flow when it was created.
 */
void *
mac_flow_get_client_cookie(flow_entry_t *flent)
{
	return (flent->fe_client_cookie);
}

/*
 * Forward declarations.
 */
static uint32_t	flow_l2_hash(flow_tab_t *, flow_state_t *);
static uint32_t	flow_l2_hash_fe(flow_tab_t *, flow_entry_t *);
static int	flow_l2_accept(flow_tab_t *, flow_state_t *);
static uint32_t	flow_ether_hash(flow_tab_t *, flow_state_t *);
static uint32_t	flow_ether_hash_fe(flow_tab_t *, flow_entry_t *);
static int	flow_ether_accept(flow_tab_t *, flow_state_t *);

/*
 * Create flow table.
 */
void
mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
    mac_impl_t *mip, flow_tab_t **ftp)
{
	flow_tab_t	*ft;
	flow_ops_t	*new_ops;

	ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
	bzero(ft, sizeof (*ft));

	ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);

	/*
	 * We make a copy of the ops vector instead of just pointing to it
	 * because we might want to customize the ops vector on a per table
	 * basis (e.g. for optimization).
	 */
	new_ops = &ft->ft_ops;
	bcopy(ops, new_ops, sizeof (*ops));
	ft->ft_mask = mask;
	ft->ft_size = size;
	ft->ft_mip = mip;

	/*
	 * Optimizations for DL_ETHER media.
	 */
	if (mip->mi_info.mi_nativemedia == DL_ETHER) {
		if (new_ops->fo_hash == flow_l2_hash)
			new_ops->fo_hash = flow_ether_hash;
		if (new_ops->fo_hash_fe == flow_l2_hash_fe)
			new_ops->fo_hash_fe = flow_ether_hash_fe;
		if (new_ops->fo_accept[0] == flow_l2_accept)
			new_ops->fo_accept[0] = flow_ether_accept;
	}
	*ftp = ft;
}

void
mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
{
	mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
	    1024, mip, ftp);
}

/*
 * Destroy flow table.
 */
void
mac_flow_tab_destroy(flow_tab_t *ft)
{
	if (ft == NULL)
		return;

	ASSERT(ft->ft_flow_count == 0);
	kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
	bzero(ft, sizeof (*ft));
	kmem_cache_free(flow_tab_cache, ft);
}

/*
 * Add a new flow entry to the global flow hash table
 */
int
mac_flow_hash_add(flow_entry_t *flent)
{
	int	err;

	rw_enter(&flow_tab_lock, RW_WRITER);
	err = mod_hash_insert(flow_hash,
	    (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
	if (err != 0) {
		rw_exit(&flow_tab_lock);
		return (EEXIST);
	}
	/* Mark as inserted into the global flow hash table */
	FLOW_MARK(flent, FE_G_FLOW_HASH);
	rw_exit(&flow_tab_lock);
	return (err);
}

/*
 * Remove a flow entry from the global flow hash table
 */
void
mac_flow_hash_remove(flow_entry_t *flent)
{
	mod_hash_val_t	val;

	rw_enter(&flow_tab_lock, RW_WRITER);
	VERIFY(mod_hash_remove(flow_hash,
	    (mod_hash_key_t)flent->fe_flow_name, &val) == 0);

	/* Clear the mark that says inserted into the global flow hash table */
	FLOW_UNMARK(flent, FE_G_FLOW_HASH);
	rw_exit(&flow_tab_lock);
}

/*
 * Retrieve a flow entry from the global flow hash table.
 */
int
mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
{
	int		err;
	flow_entry_t	*flent;

	rw_enter(&flow_tab_lock, RW_READER);
	err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
	    (mod_hash_val_t *)&flent);
	if (err != 0) {
		rw_exit(&flow_tab_lock);
		return (ENOENT);
	}
	ASSERT(flent != NULL);
	FLOW_USER_REFHOLD(flent);
	rw_exit(&flow_tab_lock);

	*flentp = flent;
	return (0);
}

/*
 * Initialize or release mac client flows by walking the subflow table.
 * These are typically invoked during plumb/unplumb of links.
 */

static int
mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
{
	mac_client_impl_t	*mcip = arg;

	if (mac_link_flow_init(arg, flent) != 0) {
		cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
		    flent->fe_flow_name, mcip->mci_name);
	} else {
		FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
	}
	return (0);
}

void
mac_link_init_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_init_flows_cb, mcip);
	/*
	 * If mac client had subflow(s) configured before plumb, change
	 * function to mac_rx_srs_subflow_process and in case of hardware
	 * classification, disable polling.
	 */
	mac_client_update_classifier(mcip, B_TRUE);

}

boolean_t
mac_link_has_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
		return (B_TRUE);

	return (B_FALSE);
}

static int
mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
{
	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
	mac_link_flow_clean(arg, flent);
	return (0);
}

void
mac_link_release_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	/*
	 * Change the mci_flent callback back to mac_rx_srs_process()
	 * because flows are about to be deactivated.
	 */
	mac_client_update_classifier(mcip, B_FALSE);
	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_release_flows_cb, mcip);
}

void
mac_rename_flow(flow_entry_t *fep, const char *new_name)
{
	mac_flow_set_name(fep, new_name);
	if (fep->fe_ksp != NULL) {
		flow_stat_destroy(fep);
		flow_stat_create(fep);
	}
}

/*
 * mac_link_flow_init()
 * Internal flow interface used for allocating SRSs and related
 * data structures. Not meant to be used by mac clients.
 */
int
mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
{
	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
	mac_impl_t		*mip = mcip->mci_mip;
	int			err;

	ASSERT(mch != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
		return (err);

	sub_flow->fe_mcip = mcip;

	return (0);
}

/*
 * mac_link_flow_add()
 * Used by flowadm(1m) or kernel mac clients for creating flows.
 */
int
mac_link_flow_add(datalink_id_t linkid, char *flow_name,
    flow_desc_t *flow_desc, mac_resource_props_t *mrp)
{
	flow_entry_t		*flent = NULL;
	int			err;
	dls_dl_handle_t		dlh;
	dls_link_t		*dlp;
	boolean_t		link_held = B_FALSE;
	boolean_t		hash_added = B_FALSE;
	mac_perim_handle_t	mph;

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err == 0) {
		FLOW_USER_REFRELE(flent);
		return (EEXIST);
	}

	/*
	 * First create a flow entry given the description provided
	 * by the caller.
	 */
	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
	    FLOW_USER | FLOW_OTHER, &flent);

	if (err != 0)
		return (err);

	/*
	 * We've got a local variable referencing this flow now, so we need
	 * to hold it. We'll release this flow before returning.
	 * All failures until we return will undo any action that may internally
	 * held the flow, so the last REFRELE will assure a clean freeing
	 * of resources.
	 */
	FLOW_REFHOLD(flent);

	flent->fe_link_id = linkid;
	FLOW_MARK(flent, FE_INCIPIENT);

	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0) {
		FLOW_FINAL_REFRELE(flent);
		return (err);
	}

	/*
	 * dls will eventually be merged with mac so it's ok
	 * to call dls' internal functions.
	 */
	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
	if (err != 0)
		goto bail;

	link_held = B_TRUE;

	/*
	 * Add the flow to the global flow table, this table will be per
	 * exclusive zone so each zone can have its own flow namespace.
	 * RFE 6625651 will fix this.
	 *
	 */
	if ((err = mac_flow_hash_add(flent)) != 0)
		goto bail;

	hash_added = B_TRUE;

	/*
	 * do not allow flows to be configured on an anchor VNIC
	 */
	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
		err = ENOTSUP;
		goto bail;
	}

	/*
	 * Add the subflow to the subflow table. Also instantiate the flow
	 * in the mac if there is an active user (we check if the MAC client's
	 * datapath has been setup).
	 */
	err = mac_flow_add_subflow(dlp->dl_mch, flent,
	    MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
	if (err != 0)
		goto bail;

	FLOW_UNMARK(flent, FE_INCIPIENT);
	dls_devnet_rele_link(dlh, dlp);
	mac_perim_exit(mph);
	return (0);

bail:
	if (hash_added)
		mac_flow_hash_remove(flent);

	if (link_held)
		dls_devnet_rele_link(dlh, dlp);

	/*
	 * Wait for any transient global flow hash refs to clear
	 * and then release the creation reference on the flow
	 */
	mac_flow_wait(flent, FLOW_USER_REF);
	FLOW_FINAL_REFRELE(flent);
	mac_perim_exit(mph);
	return (err);
}

/*
 * mac_link_flow_clean()
 * Internal flow interface used for freeing SRSs and related
 * data structures. Not meant to be used by mac clients.
 */
void
mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
{
	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
	mac_impl_t		*mip = mcip->mci_mip;
	boolean_t		last_subflow;

	ASSERT(mch != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/*
	 * This sub flow entry may fail to be fully initialized by
	 * mac_link_flow_init(). If so, simply return.
	 */
	if (sub_flow->fe_mcip == NULL)
		return;

	last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
	/*
	 * Tear down the data path
	 */
	mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
	sub_flow->fe_mcip = NULL;

	/*
	 * Delete the SRSs associated with this subflow. If this is being
	 * driven by flowadm(1M) then the subflow will be deleted by
	 * dls_rem_flow. However if this is a result of the interface being
	 * unplumbed then the subflow itself won't be deleted.
	 */
	mac_flow_cleanup(sub_flow);

	/*
	 * If all the subflows are gone, renable some of the stuff
	 * we disabled when adding a subflow, polling etc.
	 */
	if (last_subflow) {
		/*
		 * The subflow table itself is not protected by any locks or
		 * refcnts. Hence quiesce the client upfront before clearing
		 * mci_subflow_tab.
		 */
		mac_client_quiesce(mcip);
		mac_client_update_classifier(mcip, B_FALSE);
		mac_flow_tab_destroy(mcip->mci_subflow_tab);
		mcip->mci_subflow_tab = NULL;
		mac_client_restart(mcip);
	}
}

/*
 * mac_link_flow_remove()
 * Used by flowadm(1m) or kernel mac clients for removing flows.
 */
int
mac_link_flow_remove(char *flow_name)
{
	flow_entry_t		*flent;
	mac_perim_handle_t	mph;
	int			err;
	datalink_id_t		linkid;

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	linkid = flent->fe_link_id;
	FLOW_USER_REFRELE(flent);

	/*
	 * The perim must be acquired before acquiring any other references
	 * to maintain the lock and perimeter hierarchy. Please note the
	 * FLOW_REFRELE above.
	 */
	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	/*
	 * Note the second lookup of the flow, because a concurrent thread
	 * may have removed it already while we were waiting to enter the
	 * link's perimeter.
	 */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}
	FLOW_USER_REFRELE(flent);

	/*
	 * Remove the flow from the subflow table and deactivate the flow
	 * by quiescing and removings its SRSs
	 */
	mac_flow_rem_subflow(flent);

	/*
	 * Finally, remove the flow from the global table.
	 */
	mac_flow_hash_remove(flent);

	/*
	 * Wait for any transient global flow hash refs to clear
	 * and then release the creation reference on the flow
	 */
	mac_flow_wait(flent, FLOW_USER_REF);
	FLOW_FINAL_REFRELE(flent);

	mac_perim_exit(mph);

	return (0);
}

/*
 * mac_link_flow_modify()
 * Modifies the properties of a flow identified by its name.
 */
int
mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
{
	flow_entry_t		*flent;
	mac_client_impl_t 	*mcip;
	int			err = 0;
	mac_perim_handle_t	mph;
	datalink_id_t		linkid;
	flow_tab_t		*flow_tab;

	err = mac_validate_props(NULL, mrp);
	if (err != 0)
		return (err);

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	linkid = flent->fe_link_id;
	FLOW_USER_REFRELE(flent);

	/*
	 * The perim must be acquired before acquiring any other references
	 * to maintain the lock and perimeter hierarchy. Please note the
	 * FLOW_REFRELE above.
	 */
	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	/*
	 * Note the second lookup of the flow, because a concurrent thread
	 * may have removed it already while we were waiting to enter the
	 * link's perimeter.
	 */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}
	FLOW_USER_REFRELE(flent);

	/*
	 * If this flow is attached to a MAC client, then pass the request
	 * along to the client.
	 * Otherwise, just update the cached values.
	 */
	mcip = flent->fe_mcip;
	mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
	if (mcip != NULL) {
		if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
			err = ENOENT;
		} else {
			mac_flow_modify(flow_tab, flent, mrp);
		}
	} else {
		(void) mac_flow_modify_props(flent, mrp);
	}

done:
	mac_perim_exit(mph);
	return (err);
}


/*
 * State structure and misc functions used by mac_link_flow_walk().
 */
typedef struct {
	int	(*ws_func)(mac_flowinfo_t *, void *);
	void	*ws_arg;
} flow_walk_state_t;

static void
mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent)
{
	(void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name,
	    MAXFLOWNAMELEN);
	finfop->fi_link_id = flent->fe_link_id;
	finfop->fi_flow_desc = flent->fe_flow_desc;
	finfop->fi_resource_props = flent->fe_resource_props;
}

static int
mac_link_flow_walk_cb(flow_entry_t *flent, void *arg)
{
	flow_walk_state_t	*statep = arg;
	mac_flowinfo_t		*finfo;
	int			err;

	finfo = kmem_zalloc(sizeof (*finfo), KM_SLEEP);
	mac_link_flowinfo_copy(finfo, flent);
	err = statep->ws_func(finfo, statep->ws_arg);
	kmem_free(finfo, sizeof (*finfo));
	return (err);
}

/*
 * mac_link_flow_walk()
 * Invokes callback 'func' for all flows belonging to the specified link.
 */
int
mac_link_flow_walk(datalink_id_t linkid,
    int (*func)(mac_flowinfo_t *, void *), void *arg)
{
	mac_client_impl_t	*mcip;
	mac_perim_handle_t	mph;
	flow_walk_state_t	state;
	dls_dl_handle_t		dlh;
	dls_link_t		*dlp;
	int			err;

	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}

	mcip = (mac_client_impl_t *)dlp->dl_mch;
	state.ws_func = func;
	state.ws_arg = arg;

	err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_flow_walk_cb, &state);

	dls_devnet_rele_link(dlh, dlp);
	mac_perim_exit(mph);
	return (err);
}

/*
 * mac_link_flow_info()
 * Retrieves information about a specific flow.
 */
int
mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
{
	flow_entry_t	*flent;
	int		err;

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	mac_link_flowinfo_copy(finfo, flent);
	FLOW_USER_REFRELE(flent);
	return (0);
}

/*
 * Hash function macro that takes an Ethernet address and VLAN id as input.
 */
#define	HASH_ETHER_VID(a, v, s)	\
	((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s))

/*
 * Generic layer-2 address hashing function that takes an address and address
 * length as input.  This is the DJB hash function.
 */
static uint32_t
flow_l2_addrhash(uint8_t *addr, size_t addrlen, size_t htsize)
{
	uint32_t	hash = 5381;
	size_t		i;

	for (i = 0; i < addrlen; i++)
		hash = ((hash << 5) + hash) + addr[i];
	return (hash % htsize);
}

#define	PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end))

#define	CHECK_AND_ADJUST_START_PTR(s, start) {		\
	if ((s)->fs_mp->b_wptr == (start)) {		\
		mblk_t	*next = (s)->fs_mp->b_cont;	\
		if (next == NULL)			\
			return (EINVAL);		\
							\
		(s)->fs_mp = next;			\
		(start) = next->b_rptr;			\
	}						\
}

/* ARGSUSED */
static boolean_t
flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l2info_t		*l2 = &s->fs_l2info;
	flow_desc_t		*fd = &flent->fe_flow_desc;

	return (l2->l2_vid == fd->fd_vid &&
	    bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
}

/*
 * Layer 2 hash function.
 * Must be paired with flow_l2_accept() within a set of flow_ops
 * because it assumes the dest address is already extracted.
 */
static uint32_t
flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
{
	return (flow_l2_addrhash(s->fs_l2info.l2_daddr,
	    ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
}

/*
 * This is the generic layer 2 accept function.
 * It makes use of mac_header_info() to extract the header length,
 * sap, vlan ID and destination address.
 */
static int
flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
{
	boolean_t		is_ether;
	flow_l2info_t		*l2 = &s->fs_l2info;
	mac_header_info_t	mhi;
	int			err;

	is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
	if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
	    s->fs_mp, &mhi)) != 0) {
		if (err == EINVAL)
			err = ENOBUFS;

		return (err);
	}

	l2->l2_start = s->fs_mp->b_rptr;
	l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;

	if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
		struct ether_vlan_header	*evhp =
		    (struct ether_vlan_header *)l2->l2_start;

		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
			return (ENOBUFS);

		l2->l2_sap = ntohs(evhp->ether_type);
		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
		l2->l2_hdrsize = sizeof (*evhp);
	} else {
		l2->l2_sap = mhi.mhi_bindsap;
		l2->l2_vid = 0;
		l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
	}
	return (0);
}

/*
 * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
 * accept(). The notable difference is that dest address is now extracted
 * by hash() rather than by accept(). This saves a few memory references
 * for flow tables that do not care about mac addresses.
 */
static uint32_t
flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t			*l2 = &s->fs_l2info;
	struct ether_vlan_header	*evhp;

	evhp = (struct ether_vlan_header *)l2->l2_start;
	l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
	return (HASH_ETHER_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
}

static uint32_t
flow_ether_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
	return (HASH_ETHER_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
}

/* ARGSUSED */
static int
flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t			*l2 = &s->fs_l2info;
	struct ether_vlan_header	*evhp;
	uint16_t			sap;

	evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
	l2->l2_start = (uchar_t *)evhp;

	if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
		return (ENOBUFS);

	if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
			return (ENOBUFS);

		l2->l2_sap = ntohs(evhp->ether_type);
		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
		l2->l2_hdrsize = sizeof (struct ether_vlan_header);
	} else {
		l2->l2_sap = sap;
		l2->l2_vid = 0;
		l2->l2_hdrsize = sizeof (struct ether_header);
	}
	return (0);
}

/*
 * Validates a layer 2 flow entry.
 */
static int
flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	/*
	 * Dest address is mandatory, and 0 length addresses are not yet
	 * supported.
	 */
	if ((fd->fd_mask & FLOW_LINK_DST) == 0 || fd->fd_mac_len == 0)
		return (EINVAL);

	if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
		/*
		 * VLAN flows are only supported over ethernet macs.
		 */
		if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
			return (EINVAL);

		if (fd->fd_vid == 0)
			return (EINVAL);

	}
	flent->fe_match = flow_l2_match;
	return (0);
}

/*
 * Calculates hash index of flow entry.
 */
static uint32_t
flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	ASSERT((fd->fd_mask & FLOW_LINK_VID) == 0 && fd->fd_vid == 0);
	return (flow_l2_addrhash(fd->fd_dst_mac,
	    ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
}

/*
 * This is used for duplicate flow checking.
 */
/* ARGSUSED */
static boolean_t
flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
	return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
	    fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
}

/*
 * Generic flow entry insertion function.
 * Used by flow tables that do not have ordering requirements.
 */
/* ARGSUSED */
static int
flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
    flow_entry_t *flent)
{
	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	if (*headp != NULL) {
		ASSERT(flent->fe_next == NULL);
		flent->fe_next = *headp;
	}
	*headp = flent;
	return (0);
}

/*
 * IP version independent DSField matching function.
 */
/* ARGSUSED */
static boolean_t
flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	switch (l3info->l3_version) {
	case IPV4_VERSION: {
		ipha_t		*ipha = (ipha_t *)l3info->l3_start;

		return ((ipha->ipha_type_of_service &
		    fd->fd_dsfield_mask) == fd->fd_dsfield);
	}
	case IPV6_VERSION: {
		ip6_t		*ip6h = (ip6_t *)l3info->l3_start;

		return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
		    fd->fd_dsfield_mask) == fd->fd_dsfield);
	}
	default:
		return (B_FALSE);
	}
}

/*
 * IP v4 and v6 address matching.
 * The netmask only needs to be applied on the packet but not on the
 * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets.
 */

/* ARGSUSED */
static boolean_t
flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;
	ipha_t		*ipha = (ipha_t *)l3info->l3_start;
	in_addr_t	addr;

	addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
		return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
		    V4_PART_OF_V6(fd->fd_local_addr));
	}
	return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
	    V4_PART_OF_V6(fd->fd_remote_addr));
}

/* ARGSUSED */
static boolean_t
flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;
	ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
	in6_addr_t	*addrp;

	addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
		return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
		    fd->fd_local_addr));
	}
	return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
}

/* ARGSUSED */
static boolean_t
flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (l3info->l3_protocol == fd->fd_protocol);
}

static uint32_t
flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_mask_t	mask = ft->ft_mask;

	if ((mask & FLOW_IP_LOCAL) != 0) {
		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
	} else if ((mask & FLOW_IP_REMOTE) != 0) {
		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
	} else if ((mask & FLOW_IP_DSFIELD) != 0) {
		/*
		 * DSField flents are arranged as a single list.
		 */
		return (0);
	}
	/*
	 * IP addr flents are hashed into two lists, v4 or v6.
	 */
	ASSERT(ft->ft_size >= 2);
	return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
}

static uint32_t
flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;

	return (l3info->l3_protocol % ft->ft_size);
}

/* ARGSUSED */
static int
flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t	*l2info = &s->fs_l2info;
	flow_l3info_t	*l3info = &s->fs_l3info;
	uint16_t	sap = l2info->l2_sap;
	uchar_t		*l3_start;

	l3_start = l2info->l2_start + l2info->l2_hdrsize;

	/*
	 * Adjust start pointer if we're at the end of an mblk.
	 */
	CHECK_AND_ADJUST_START_PTR(s, l3_start);

	l3info->l3_start = l3_start;
	if (!OK_32PTR(l3_start))
		return (EINVAL);

	switch (sap) {
	case ETHERTYPE_IP: {
		ipha_t	*ipha = (ipha_t *)l3_start;

		if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
			return (ENOBUFS);

		l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
		l3info->l3_protocol = ipha->ipha_protocol;
		l3info->l3_version = IPV4_VERSION;
		l3info->l3_fragmented =
		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
		break;
	}
	case ETHERTYPE_IPV6: {
		ip6_t		*ip6h = (ip6_t *)l3_start;
		ip6_frag_t	*frag = NULL;
		uint16_t	ip6_hdrlen;
		uint8_t		nexthdr;

		if (!mac_ip_hdr_length_v6(ip6h, s->fs_mp->b_wptr, &ip6_hdrlen,
		    &nexthdr, &frag)) {
			return (ENOBUFS);
		}
		l3info->l3_hdrsize = ip6_hdrlen;
		l3info->l3_protocol = nexthdr;
		l3info->l3_version = IPV6_VERSION;
		l3info->l3_fragmented = (frag != NULL);
		break;
	}
	default:
		return (EINVAL);
	}
	return (0);
}

/* ARGSUSED */
static int
flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	switch (fd->fd_protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		flent->fe_match = flow_ip_proto_match;
		return (0);
	default:
		return (EINVAL);
	}
}

/* ARGSUSED */
static int
flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;
	flow_mask_t	mask;
	uint8_t		version;
	in6_addr_t	*addr, *netmask;

	/*
	 * DSField does not require a IP version.
	 */
	if (fd->fd_mask == FLOW_IP_DSFIELD) {
		if (fd->fd_dsfield_mask == 0)
			return (EINVAL);

		flent->fe_match = flow_ip_dsfield_match;
		return (0);
	}

	/*
	 * IP addresses must come with a version to avoid ambiguity.
	 */
	if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
		return (EINVAL);

	version = fd->fd_ipversion;
	if (version != IPV4_VERSION && version != IPV6_VERSION)
		return (EINVAL);

	mask = fd->fd_mask & ~FLOW_IP_VERSION;
	switch (mask) {
	case FLOW_IP_LOCAL:
		addr = &fd->fd_local_addr;
		netmask = &fd->fd_local_netmask;
		break;
	case FLOW_IP_REMOTE:
		addr = &fd->fd_remote_addr;
		netmask = &fd->fd_remote_netmask;
		break;
	default:
		return (EINVAL);
	}

	/*
	 * Apply netmask onto specified address.
	 */
	V6_MASK_COPY(*addr, *netmask, *addr);
	if (version == IPV4_VERSION) {
		ipaddr_t	v4addr = V4_PART_OF_V6((*addr));
		ipaddr_t	v4mask = V4_PART_OF_V6((*netmask));

		if (v4addr == 0 || v4mask == 0)
			return (EINVAL);
		flent->fe_match = flow_ip_v4_match;
	} else {
		if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
		    IN6_IS_ADDR_UNSPECIFIED(netmask))
			return (EINVAL);
		flent->fe_match = flow_ip_v6_match;
	}
	return (0);
}

static uint32_t
flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (fd->fd_protocol % ft->ft_size);
}

static uint32_t
flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	/*
	 * DSField flents are arranged as a single list.
	 */
	if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
		return (0);

	/*
	 * IP addr flents are hashed into two lists, v4 or v6.
	 */
	ASSERT(ft->ft_size >= 2);
	return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
}

/* ARGSUSED */
static boolean_t
flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	return (fd1->fd_protocol == fd2->fd_protocol);
}

/* ARGSUSED */
static boolean_t
flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
	in6_addr_t	*a1, *m1, *a2, *m2;

	ASSERT(fd1->fd_mask == fd2->fd_mask);
	if (fd1->fd_mask == FLOW_IP_DSFIELD) {
		return (fd1->fd_dsfield == fd2->fd_dsfield &&
		    fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
	}

	/*
	 * flow_ip_accept_fe() already validated the version.
	 */
	ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
	if (fd1->fd_ipversion != fd2->fd_ipversion)
		return (B_FALSE);

	switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
	case FLOW_IP_LOCAL:
		a1 = &fd1->fd_local_addr;
		m1 = &fd1->fd_local_netmask;
		a2 = &fd2->fd_local_addr;
		m2 = &fd2->fd_local_netmask;
		break;
	case FLOW_IP_REMOTE:
		a1 = &fd1->fd_remote_addr;
		m1 = &fd1->fd_remote_netmask;
		a2 = &fd2->fd_remote_addr;
		m2 = &fd2->fd_remote_netmask;
		break;
	default:
		/*
		 * This is unreachable given the checks in
		 * flow_ip_accept_fe().
		 */
		return (B_FALSE);
	}

	if (fd1->fd_ipversion == IPV4_VERSION) {
		return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
		    V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));

	} else {
		return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
		    IN6_ARE_ADDR_EQUAL(m1, m2));
	}
}

static int
flow_ip_mask2plen(in6_addr_t *v6mask)
{
	int		bits;
	int		plen = IPV6_ABITS;
	int		i;

	for (i = 3; i >= 0; i--) {
		if (v6mask->s6_addr32[i] == 0) {
			plen -= 32;
			continue;
		}
		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
		if (bits == 0)
			break;
		plen -= bits;
	}
	return (plen);
}

/* ARGSUSED */
static int
flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
    flow_entry_t *flent)
{
	flow_entry_t	**p = headp;
	flow_desc_t	*fd0, *fd;
	in6_addr_t	*m0, *m;
	int		plen0, plen;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * No special ordering needed for dsfield.
	 */
	fd0 = &flent->fe_flow_desc;
	if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
		if (*p != NULL) {
			ASSERT(flent->fe_next == NULL);
			flent->fe_next = *p;
		}
		*p = flent;
		return (0);
	}

	/*
	 * IP address flows are arranged in descending prefix length order.
	 */
	m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
	    &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
	plen0 = flow_ip_mask2plen(m0);
	ASSERT(plen0 != 0);

	for (; *p != NULL; p = &(*p)->fe_next) {
		fd = &(*p)->fe_flow_desc;

		/*
		 * Normally a dsfield flent shouldn't end up on the same
		 * list as an IP address because flow tables are (for now)
		 * disjoint. If we decide to support both IP and dsfield
		 * in the same table in the future, this check will allow
		 * for that.
		 */
		if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
			continue;

		/*
		 * We also allow for the mixing of local and remote address
		 * flents within one list.
		 */
		m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
		    &fd->fd_local_netmask : &fd->fd_remote_netmask;
		plen = flow_ip_mask2plen(m);

		if (plen <= plen0)
			break;
	}
	if (*p != NULL) {
		ASSERT(flent->fe_next == NULL);
		flent->fe_next = *p;
	}
	*p = flent;
	return (0);
}

/*
 * Transport layer protocol and port matching functions.
 */

/* ARGSUSED */
static boolean_t
flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (fd->fd_protocol == l3info->l3_protocol &&
	    fd->fd_local_port == l4info->l4_hash_port);
}

/* ARGSUSED */
static boolean_t
flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (fd->fd_protocol == l3info->l3_protocol &&
	    fd->fd_remote_port == l4info->l4_hash_port);
}

/*
 * Transport hash function.
 * Since we only support either local or remote port flows,
 * we only need to extract one of the ports to be used for
 * matching.
 */
static uint32_t
flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	uint8_t		proto = l3info->l3_protocol;
	boolean_t	dst_or_src;

	if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
		dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
	} else {
		dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
	}

	l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
	    l4info->l4_src_port;

	return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
}

/*
 * Unlike other accept() functions above, we do not need to get the header
 * size because this is our highest layer so far. If we want to do support
 * other higher layer protocols, we would need to save the l4_hdrsize
 * in the code below.
 */

/* ARGSUSED */
static int
flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_l4info_t	*l4info = &s->fs_l4info;
	uint8_t		proto = l3info->l3_protocol;
	uchar_t		*l4_start;

	l4_start = l3info->l3_start + l3info->l3_hdrsize;

	/*
	 * Adjust start pointer if we're at the end of an mblk.
	 */
	CHECK_AND_ADJUST_START_PTR(s, l4_start);

	l4info->l4_start = l4_start;
	if (!OK_32PTR(l4_start))
		return (EINVAL);

	if (l3info->l3_fragmented == B_TRUE)
		return (EINVAL);

	switch (proto) {
	case IPPROTO_TCP: {
		struct tcphdr	*tcph = (struct tcphdr *)l4_start;

		if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
			return (ENOBUFS);

		l4info->l4_src_port = tcph->th_sport;
		l4info->l4_dst_port = tcph->th_dport;
		break;
	}
	case IPPROTO_UDP: {
		struct udphdr	*udph = (struct udphdr *)l4_start;

		if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
			return (ENOBUFS);

		l4info->l4_src_port = udph->uh_sport;
		l4info->l4_dst_port = udph->uh_dport;
		break;
	}
	case IPPROTO_SCTP: {
		sctp_hdr_t	*sctph = (sctp_hdr_t *)l4_start;

		if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
			return (ENOBUFS);

		l4info->l4_src_port = sctph->sh_sport;
		l4info->l4_dst_port = sctph->sh_dport;
		break;
	}
	default:
		return (EINVAL);
	}

	return (0);
}

/*
 * Validates transport flow entry.
 * The protocol field must be present.
 */

/* ARGSUSED */
static int
flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;
	flow_mask_t	mask = fd->fd_mask;

	if ((mask & FLOW_IP_PROTOCOL) == 0)
		return (EINVAL);

	switch (fd->fd_protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
		break;
	default:
		return (EINVAL);
	}

	switch (mask & ~FLOW_IP_PROTOCOL) {
	case FLOW_ULP_PORT_LOCAL:
		if (fd->fd_local_port == 0)
			return (EINVAL);

		flent->fe_match = flow_transport_lport_match;
		break;
	case FLOW_ULP_PORT_REMOTE:
		if (fd->fd_remote_port == 0)
			return (EINVAL);

		flent->fe_match = flow_transport_rport_match;
		break;
	case 0:
		/*
		 * transport-only flows conflicts with our table type.
		 */
		return (EOPNOTSUPP);
	default:
		return (EINVAL);
	}

	return (0);
}

static uint32_t
flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;
	uint16_t	port = 0;

	port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
	    fd->fd_local_port : fd->fd_remote_port;

	return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
}

/* ARGSUSED */
static boolean_t
flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	if (fd1->fd_protocol != fd2->fd_protocol)
		return (B_FALSE);

	if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
		return (fd1->fd_local_port == fd2->fd_local_port);

	if ((fd1->fd_mask & FLOW_ULP_PORT_REMOTE) != 0)
		return (fd1->fd_remote_port == fd2->fd_remote_port);

	return (B_TRUE);
}

static flow_ops_t flow_l2_ops = {
	flow_l2_accept_fe,
	flow_l2_hash_fe,
	flow_l2_match_fe,
	flow_generic_insert_fe,
	flow_l2_hash,
	{flow_l2_accept}
};

static flow_ops_t flow_ip_ops = {
	flow_ip_accept_fe,
	flow_ip_hash_fe,
	flow_ip_match_fe,
	flow_ip_insert_fe,
	flow_ip_hash,
	{flow_l2_accept, flow_ip_accept}
};

static flow_ops_t flow_ip_proto_ops = {
	flow_ip_proto_accept_fe,
	flow_ip_proto_hash_fe,
	flow_ip_proto_match_fe,
	flow_generic_insert_fe,
	flow_ip_proto_hash,
	{flow_l2_accept, flow_ip_accept}
};

static flow_ops_t flow_transport_ops = {
	flow_transport_accept_fe,
	flow_transport_hash_fe,
	flow_transport_match_fe,
	flow_generic_insert_fe,
	flow_transport_hash,
	{flow_l2_accept, flow_ip_accept, flow_transport_accept}
};

static flow_tab_info_t flow_tab_info_list[] = {
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
	{&flow_ip_ops, FLOW_IP_DSFIELD, 1},
	{&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024},
	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_REMOTE, 1024}
};

#define	FLOW_MAX_TAB_INFO \
	((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t))

static flow_tab_info_t *
mac_flow_tab_info_get(flow_mask_t mask)
{
	int	i;

	for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
		if (mask == flow_tab_info_list[i].fti_mask)
			return (&flow_tab_info_list[i]);
	}
	return (NULL);
}