xref: /illumos-gate/usr/src/uts/common/io/mac/mac_flow.c (revision b8052df9f609edb713f6828c9eecc3d7be19dfb3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2018 Joyent, Inc.
26  */
27 
28 #include <sys/strsun.h>
29 #include <sys/sdt.h>
30 #include <sys/mac.h>
31 #include <sys/mac_impl.h>
32 #include <sys/mac_client_impl.h>
33 #include <sys/mac_stat.h>
34 #include <sys/dls.h>
35 #include <sys/dls_impl.h>
36 #include <sys/mac_soft_ring.h>
37 #include <sys/ethernet.h>
38 #include <sys/cpupart.h>
39 #include <sys/pool.h>
40 #include <sys/pool_pset.h>
41 #include <sys/vlan.h>
42 #include <inet/ip.h>
43 #include <inet/ip6.h>
44 #include <netinet/tcp.h>
45 #include <netinet/udp.h>
46 #include <netinet/sctp.h>
47 
/*
 * Per-flow counters as exported through the flow kstat. The structure is
 * filled in by flow_stat_update() and its fields are located through the
 * offsets recorded in flow_stats_list[].
 */
typedef struct flow_stats_s {
	uint64_t	fs_obytes;	/* bytes transmitted */
	uint64_t	fs_opackets;	/* packets transmitted */
	uint64_t	fs_oerrors;	/* transmit errors */
	uint64_t	fs_ibytes;	/* bytes received */
	uint64_t	fs_ipackets;	/* packets received */
	uint64_t	fs_ierrors;	/* receive errors */
} flow_stats_t;
56 
57 
58 /* global flow table, will be a per exclusive-zone table later */
59 static mod_hash_t	*flow_hash;
60 static krwlock_t	flow_tab_lock;
61 
62 static kmem_cache_t	*flow_cache;
63 static kmem_cache_t	*flow_tab_cache;
64 static flow_ops_t	flow_l2_ops;
65 
66 typedef struct {
67 	const char	*fs_name;
68 	uint_t		fs_offset;
69 } flow_stats_info_t;
70 
71 #define	FS_OFF(f)	(offsetof(flow_stats_t, f))
72 static flow_stats_info_t flow_stats_list[] = {
73 	{"rbytes",	FS_OFF(fs_ibytes)},
74 	{"ipackets",	FS_OFF(fs_ipackets)},
75 	{"ierrors",	FS_OFF(fs_ierrors)},
76 	{"obytes",	FS_OFF(fs_obytes)},
77 	{"opackets",	FS_OFF(fs_opackets)},
78 	{"oerrors",	FS_OFF(fs_oerrors)}
79 };
80 #define	FS_SIZE		(sizeof (flow_stats_list) / sizeof (flow_stats_info_t))
81 
82 /*
83  * Checks whether a flow mask is legal.
84  */
85 static flow_tab_info_t	*mac_flow_tab_info_get(flow_mask_t);
86 
87 static void
88 flow_stat_init(kstat_named_t *knp)
89 {
90 	int	i;
91 
92 	for (i = 0; i < FS_SIZE; i++, knp++) {
93 		kstat_named_init(knp, flow_stats_list[i].fs_name,
94 		    KSTAT_DATA_UINT64);
95 	}
96 }
97 
98 static int
99 flow_stat_update(kstat_t *ksp, int rw)
100 {
101 	flow_entry_t		*fep = ksp->ks_private;
102 	kstat_named_t		*knp = ksp->ks_data;
103 	uint64_t		*statp;
104 	int			i;
105 	mac_rx_stats_t		*mac_rx_stat;
106 	mac_tx_stats_t		*mac_tx_stat;
107 	flow_stats_t		flow_stats;
108 	mac_soft_ring_set_t	*mac_srs;
109 
110 	if (rw != KSTAT_READ)
111 		return (EACCES);
112 
113 	bzero(&flow_stats, sizeof (flow_stats_t));
114 
115 	for (i = 0; i < fep->fe_rx_srs_cnt; i++) {
116 		mac_srs = (mac_soft_ring_set_t *)fep->fe_rx_srs[i];
117 		if (mac_srs == NULL) 		/* Multicast flow */
118 			break;
119 		mac_rx_stat = &mac_srs->srs_rx.sr_stat;
120 
121 		flow_stats.fs_ibytes += mac_rx_stat->mrs_intrbytes +
122 		    mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes;
123 
124 		flow_stats.fs_ipackets += mac_rx_stat->mrs_intrcnt +
125 		    mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
126 
127 		flow_stats.fs_ierrors += mac_rx_stat->mrs_ierrors;
128 	}
129 
130 	mac_srs = (mac_soft_ring_set_t *)fep->fe_tx_srs;
131 	if (mac_srs == NULL) 		/* Multicast flow */
132 		goto done;
133 	mac_tx_stat = &mac_srs->srs_tx.st_stat;
134 
135 	flow_stats.fs_obytes = mac_tx_stat->mts_obytes;
136 	flow_stats.fs_opackets = mac_tx_stat->mts_opackets;
137 	flow_stats.fs_oerrors = mac_tx_stat->mts_oerrors;
138 
139 done:
140 	for (i = 0; i < FS_SIZE; i++, knp++) {
141 		statp = (uint64_t *)
142 		    ((uchar_t *)&flow_stats + flow_stats_list[i].fs_offset);
143 		knp->value.ui64 = *statp;
144 	}
145 	return (0);
146 }
147 
148 static void
149 flow_stat_create(flow_entry_t *fep)
150 {
151 	kstat_t		*ksp;
152 	kstat_named_t	*knp;
153 	uint_t		nstats = FS_SIZE;
154 
155 	/*
156 	 * Fow now, flow entries are only manipulated and visible from the
157 	 * global zone.
158 	 */
159 	ksp = kstat_create_zone("unix", 0, (char *)fep->fe_flow_name, "flow",
160 	    KSTAT_TYPE_NAMED, nstats, 0, GLOBAL_ZONEID);
161 	if (ksp == NULL)
162 		return;
163 
164 	ksp->ks_update = flow_stat_update;
165 	ksp->ks_private = fep;
166 	fep->fe_ksp = ksp;
167 
168 	knp = (kstat_named_t *)ksp->ks_data;
169 	flow_stat_init(knp);
170 	kstat_install(ksp);
171 }
172 
173 void
174 flow_stat_destroy(flow_entry_t *fep)
175 {
176 	if (fep->fe_ksp != NULL) {
177 		kstat_delete(fep->fe_ksp);
178 		fep->fe_ksp = NULL;
179 	}
180 }
181 
182 /*
183  * Initialize the flow table
184  */
185 void
186 mac_flow_init()
187 {
188 	flow_cache = kmem_cache_create("flow_entry_cache",
189 	    sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
190 	flow_tab_cache = kmem_cache_create("flow_tab_cache",
191 	    sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
192 	flow_hash = mod_hash_create_extended("flow_hash",
193 	    100, mod_hash_null_keydtor, mod_hash_null_valdtor,
194 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
195 	rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
196 }
197 
198 /*
199  * Cleanup and release the flow table
200  */
201 void
202 mac_flow_fini()
203 {
204 	kmem_cache_destroy(flow_cache);
205 	kmem_cache_destroy(flow_tab_cache);
206 	mod_hash_destroy_hash(flow_hash);
207 	rw_destroy(&flow_tab_lock);
208 }
209 
210 /*
211  * mac_create_flow(): create a flow_entry_t.
212  */
213 int
214 mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
215     void *client_cookie, uint_t type, flow_entry_t **flentp)
216 {
217 	flow_entry_t		*flent = *flentp;
218 	int			err = 0;
219 
220 	if (mrp != NULL) {
221 		err = mac_validate_props(NULL, mrp);
222 		if (err != 0)
223 			return (err);
224 	}
225 
226 	if (flent == NULL) {
227 		flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
228 		bzero(flent, sizeof (*flent));
229 		mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
230 		cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);
231 
232 		/* Initialize the receiver function to a safe routine */
233 		flent->fe_cb_fn = (flow_fn_t)mac_rx_def;
234 		flent->fe_index = -1;
235 	}
236 	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
237 
238 	/* This is an initial flow, will be configured later */
239 	if (fd == NULL) {
240 		*flentp = flent;
241 		return (0);
242 	}
243 
244 	flent->fe_client_cookie = client_cookie;
245 	flent->fe_type = type;
246 
247 	/* Save flow desc */
248 	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
249 
250 	if (mrp != NULL) {
251 		/*
252 		 * We have already set fe_resource_props for a Link.
253 		 */
254 		if (type & FLOW_USER) {
255 			bcopy(mrp, &flent->fe_resource_props,
256 			    sizeof (mac_resource_props_t));
257 		}
258 		/*
259 		 * The effective resource list should reflect the priority
260 		 * that we set implicitly.
261 		 */
262 		if (!(mrp->mrp_mask & MRP_PRIORITY))
263 			mrp->mrp_mask |= MRP_PRIORITY;
264 		if (type & FLOW_USER)
265 			mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
266 		else
267 			mrp->mrp_priority = MPL_LINK_DEFAULT;
268 		bzero(mrp->mrp_pool, MAXPATHLEN);
269 		bzero(&mrp->mrp_cpus, sizeof (mac_cpus_t));
270 		bcopy(mrp, &flent->fe_effective_props,
271 		    sizeof (mac_resource_props_t));
272 	}
273 	flow_stat_create(flent);
274 
275 	*flentp = flent;
276 	return (0);
277 }
278 
279 /*
280  * Validate flow entry and add it to a flow table.
281  */
282 int
283 mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
284 {
285 	flow_entry_t	**headp, **p;
286 	flow_ops_t	*ops = &ft->ft_ops;
287 	flow_mask_t	mask;
288 	uint32_t	index;
289 	int		err;
290 
291 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
292 
293 	/*
294 	 * Check for invalid bits in mask.
295 	 */
296 	mask = flent->fe_flow_desc.fd_mask;
297 	if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
298 		return (EOPNOTSUPP);
299 
300 	/*
301 	 * Validate flent.
302 	 */
303 	if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
304 		DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
305 		    flow_entry_t *, flent, int, err);
306 		return (err);
307 	}
308 
309 	/*
310 	 * Flent is valid. now calculate hash and insert it
311 	 * into hash table.
312 	 */
313 	index = ops->fo_hash_fe(ft, flent);
314 
315 	/*
316 	 * We do not need a lock up until now because we were
317 	 * not accessing the flow table.
318 	 */
319 	rw_enter(&ft->ft_lock, RW_WRITER);
320 	headp = &ft->ft_table[index];
321 
322 	/*
323 	 * Check for duplicate flow.
324 	 */
325 	for (p = headp; *p != NULL; p = &(*p)->fe_next) {
326 		if ((*p)->fe_flow_desc.fd_mask !=
327 		    flent->fe_flow_desc.fd_mask)
328 			continue;
329 
330 		if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
331 			rw_exit(&ft->ft_lock);
332 			DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
333 			    flow_entry_t *, flent, int, err);
334 			return (EALREADY);
335 		}
336 	}
337 
338 	/*
339 	 * Insert flow to hash list.
340 	 */
341 	err = ops->fo_insert_fe(ft, headp, flent);
342 	if (err != 0) {
343 		rw_exit(&ft->ft_lock);
344 		DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
345 		    flow_entry_t *, flent, int, err);
346 		return (err);
347 	}
348 
349 	/*
350 	 * Save the hash index so it can be used by mac_flow_remove().
351 	 */
352 	flent->fe_index = (int)index;
353 
354 	/*
355 	 * Save the flow tab back reference.
356 	 */
357 	flent->fe_flow_tab = ft;
358 	FLOW_MARK(flent, FE_FLOW_TAB);
359 	ft->ft_flow_count++;
360 	rw_exit(&ft->ft_lock);
361 	return (0);
362 }
363 
364 /*
365  * Remove a flow from a mac client's subflow table
366  */
367 void
368 mac_flow_rem_subflow(flow_entry_t *flent)
369 {
370 	flow_tab_t		*ft = flent->fe_flow_tab;
371 	mac_client_impl_t	*mcip = ft->ft_mcip;
372 	mac_handle_t		mh = (mac_handle_t)ft->ft_mip;
373 
374 	ASSERT(MAC_PERIM_HELD(mh));
375 
376 	mac_flow_remove(ft, flent, B_FALSE);
377 	if (flent->fe_mcip == NULL) {
378 		/*
379 		 * The interface is not yet plumbed and mac_client_flow_add
380 		 * was not done.
381 		 */
382 		if (FLOW_TAB_EMPTY(ft)) {
383 			mac_flow_tab_destroy(ft);
384 			mcip->mci_subflow_tab = NULL;
385 		}
386 	} else {
387 		mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
388 		mac_link_flow_clean((mac_client_handle_t)mcip, flent);
389 	}
390 	mac_fastpath_enable(mh);
391 }
392 
393 /*
394  * Add a flow to a mac client's subflow table and instantiate the flow
395  * in the mac by creating the associated SRSs etc.
396  */
397 int
398 mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
399     boolean_t instantiate_flow)
400 {
401 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
402 	mac_handle_t		mh = (mac_handle_t)mcip->mci_mip;
403 	flow_tab_info_t		*ftinfo;
404 	flow_mask_t		mask;
405 	flow_tab_t		*ft;
406 	int			err;
407 	boolean_t		ft_created = B_FALSE;
408 
409 	ASSERT(MAC_PERIM_HELD(mh));
410 
411 	if ((err = mac_fastpath_disable(mh)) != 0)
412 		return (err);
413 
414 	/*
415 	 * If the subflow table exists already just add the new subflow
416 	 * to the existing table, else we create a new subflow table below.
417 	 */
418 	ft = mcip->mci_subflow_tab;
419 	if (ft == NULL) {
420 		mask = flent->fe_flow_desc.fd_mask;
421 		/*
422 		 * Try to create a new table and then add the subflow to the
423 		 * newly created subflow table
424 		 */
425 		if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) {
426 			mac_fastpath_enable(mh);
427 			return (EOPNOTSUPP);
428 		}
429 
430 		mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
431 		    mcip->mci_mip, &ft);
432 		ft_created = B_TRUE;
433 	}
434 
435 	err = mac_flow_add(ft, flent);
436 	if (err != 0) {
437 		if (ft_created)
438 			mac_flow_tab_destroy(ft);
439 		mac_fastpath_enable(mh);
440 		return (err);
441 	}
442 
443 	if (instantiate_flow) {
444 		/* Now activate the flow by creating its SRSs */
445 		ASSERT(MCIP_DATAPATH_SETUP(mcip));
446 		err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
447 		if (err != 0) {
448 			mac_flow_remove(ft, flent, B_FALSE);
449 			if (ft_created)
450 				mac_flow_tab_destroy(ft);
451 			mac_fastpath_enable(mh);
452 			return (err);
453 		}
454 	} else {
455 		FLOW_MARK(flent, FE_UF_NO_DATAPATH);
456 	}
457 	if (ft_created) {
458 		ASSERT(mcip->mci_subflow_tab == NULL);
459 		ft->ft_mcip = mcip;
460 		mcip->mci_subflow_tab = ft;
461 		if (instantiate_flow)
462 			mac_client_update_classifier(mcip, B_TRUE);
463 	}
464 	return (0);
465 }
466 
467 /*
468  * Remove flow entry from flow table.
469  */
470 void
471 mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
472 {
473 	flow_entry_t	**fp;
474 
475 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
476 	if (!(flent->fe_flags & FE_FLOW_TAB))
477 		return;
478 
479 	rw_enter(&ft->ft_lock, RW_WRITER);
480 	/*
481 	 * If this is a permanent removal from the flow table, mark it
482 	 * CONDEMNED to prevent future references. If this is a temporary
483 	 * removal from the table, say to update the flow descriptor then
484 	 * we don't mark it CONDEMNED
485 	 */
486 	if (!temp)
487 		FLOW_MARK(flent, FE_CONDEMNED);
488 	/*
489 	 * Locate the specified flent.
490 	 */
491 	fp = &ft->ft_table[flent->fe_index];
492 	while (*fp != flent)
493 		fp = &(*fp)->fe_next;
494 
495 	/*
496 	 * The flent must exist. Otherwise it's a bug.
497 	 */
498 	ASSERT(fp != NULL);
499 	*fp = flent->fe_next;
500 	flent->fe_next = NULL;
501 
502 	/*
503 	 * Reset fe_index to -1 so any attempt to call mac_flow_remove()
504 	 * on a flent that is supposed to be in the table (FE_FLOW_TAB)
505 	 * will panic.
506 	 */
507 	flent->fe_index = -1;
508 	FLOW_UNMARK(flent, FE_FLOW_TAB);
509 	ft->ft_flow_count--;
510 	rw_exit(&ft->ft_lock);
511 }
512 
513 /*
514  * This is the flow lookup routine used by the mac sw classifier engine.
515  */
516 int
517 mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
518 {
519 	flow_state_t	s;
520 	flow_entry_t	*flent;
521 	flow_ops_t	*ops = &ft->ft_ops;
522 	boolean_t	retried = B_FALSE;
523 	int		i, err;
524 
525 	s.fs_flags = flags;
526 retry:
527 	s.fs_mp = mp;
528 
529 	/*
530 	 * Walk the list of predeclared accept functions.
531 	 * Each of these would accumulate enough state to allow the next
532 	 * accept routine to make progress.
533 	 */
534 	for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
535 		if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
536 			mblk_t	*last;
537 
538 			/*
539 			 * ENOBUFS indicates that the mp could be too short
540 			 * and may need a pullup.
541 			 */
542 			if (err != ENOBUFS || retried)
543 				return (err);
544 
545 			/*
546 			 * The pullup is done on the last processed mblk, not
547 			 * the starting one. pullup is not done if the mblk
548 			 * has references or if b_cont is NULL.
549 			 */
550 			last = s.fs_mp;
551 			if (DB_REF(last) > 1 || last->b_cont == NULL ||
552 			    pullupmsg(last, -1) == 0)
553 				return (EINVAL);
554 
555 			retried = B_TRUE;
556 			DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
557 			    flow_state_t *, &s);
558 			goto retry;
559 		}
560 	}
561 
562 	/*
563 	 * The packet is considered sane. We may now attempt to
564 	 * find the corresponding flent.
565 	 */
566 	rw_enter(&ft->ft_lock, RW_READER);
567 	flent = ft->ft_table[ops->fo_hash(ft, &s)];
568 	for (; flent != NULL; flent = flent->fe_next) {
569 		if (flent->fe_match(ft, flent, &s)) {
570 			FLOW_TRY_REFHOLD(flent, err);
571 			if (err != 0)
572 				continue;
573 			*flentp = flent;
574 			rw_exit(&ft->ft_lock);
575 			return (0);
576 		}
577 	}
578 	rw_exit(&ft->ft_lock);
579 	return (ENOENT);
580 }
581 
582 /*
583  * Walk flow table.
584  * The caller is assumed to have proper perimeter protection.
585  */
586 int
587 mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
588     void *arg)
589 {
590 	int		err, i, cnt = 0;
591 	flow_entry_t	*flent;
592 
593 	if (ft == NULL)
594 		return (0);
595 
596 	for (i = 0; i < ft->ft_size; i++) {
597 		for (flent = ft->ft_table[i]; flent != NULL;
598 		    flent = flent->fe_next) {
599 			cnt++;
600 			err = (*fn)(flent, arg);
601 			if (err != 0)
602 				return (err);
603 		}
604 	}
605 	VERIFY(cnt == ft->ft_flow_count);
606 	return (0);
607 }
608 
609 /*
610  * Same as the above except a mutex is used for protection here.
611  */
612 int
613 mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
614     void *arg)
615 {
616 	int		err;
617 
618 	if (ft == NULL)
619 		return (0);
620 
621 	rw_enter(&ft->ft_lock, RW_WRITER);
622 	err = mac_flow_walk_nolock(ft, fn, arg);
623 	rw_exit(&ft->ft_lock);
624 	return (err);
625 }
626 
627 static boolean_t	mac_flow_clean(flow_entry_t *);
628 
629 /*
630  * Destroy a flow entry. Called when the last reference on a flow is released.
631  */
632 void
633 mac_flow_destroy(flow_entry_t *flent)
634 {
635 	ASSERT(flent->fe_refcnt == 0);
636 
637 	if ((flent->fe_type & FLOW_USER) != 0) {
638 		ASSERT(mac_flow_clean(flent));
639 	} else {
640 		mac_flow_cleanup(flent);
641 	}
642 	mac_misc_stat_delete(flent);
643 	mutex_destroy(&flent->fe_lock);
644 	cv_destroy(&flent->fe_cv);
645 	flow_stat_destroy(flent);
646 	kmem_cache_free(flow_cache, flent);
647 }
648 
649 /*
650  * XXX eric
651  * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
652  * mac_link_flow_modify() should really be moved/reworked into the
653  * two functions below. This would consolidate all the mac property
654  * checking in one place. I'm leaving this alone for now since it's
655  * out of scope of the new flows work.
656  */
657 /* ARGSUSED */
658 uint32_t
659 mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
660 {
661 	uint32_t		changed_mask = 0;
662 	mac_resource_props_t	*fmrp = &flent->fe_effective_props;
663 	int			i;
664 
665 	if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
666 	    (!(fmrp->mrp_mask & MRP_MAXBW) ||
667 	    (fmrp->mrp_maxbw != mrp->mrp_maxbw))) {
668 		changed_mask |= MRP_MAXBW;
669 		if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
670 			fmrp->mrp_mask &= ~MRP_MAXBW;
671 			fmrp->mrp_maxbw = 0;
672 		} else {
673 			fmrp->mrp_mask |= MRP_MAXBW;
674 			fmrp->mrp_maxbw = mrp->mrp_maxbw;
675 		}
676 	}
677 
678 	if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
679 		if (fmrp->mrp_priority != mrp->mrp_priority)
680 			changed_mask |= MRP_PRIORITY;
681 		if (mrp->mrp_priority == MPL_RESET) {
682 			fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
683 			fmrp->mrp_mask &= ~MRP_PRIORITY;
684 		} else {
685 			fmrp->mrp_priority = mrp->mrp_priority;
686 			fmrp->mrp_mask |= MRP_PRIORITY;
687 		}
688 	}
689 
690 	/* modify fanout */
691 	if ((mrp->mrp_mask & MRP_CPUS) != 0) {
692 		if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
693 		    (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
694 			for (i = 0; i < mrp->mrp_ncpus; i++) {
695 				if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
696 					break;
697 			}
698 			if (i == mrp->mrp_ncpus) {
699 				/*
700 				 * The new set of cpus passed is exactly
701 				 * the same as the existing set.
702 				 */
703 				return (changed_mask);
704 			}
705 		}
706 		changed_mask |= MRP_CPUS;
707 		MAC_COPY_CPUS(mrp, fmrp);
708 	}
709 
710 	/*
711 	 * Modify the rings property.
712 	 */
713 	if (mrp->mrp_mask & MRP_RX_RINGS || mrp->mrp_mask & MRP_TX_RINGS)
714 		mac_set_rings_effective(flent->fe_mcip);
715 
716 	if ((mrp->mrp_mask & MRP_POOL) != 0) {
717 		if (strcmp(fmrp->mrp_pool, mrp->mrp_pool) != 0)
718 			changed_mask |= MRP_POOL;
719 		if (strlen(mrp->mrp_pool) == 0)
720 			fmrp->mrp_mask &= ~MRP_POOL;
721 		else
722 			fmrp->mrp_mask |= MRP_POOL;
723 		(void) strncpy(fmrp->mrp_pool, mrp->mrp_pool, MAXPATHLEN);
724 	}
725 	return (changed_mask);
726 }
727 
728 void
729 mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
730 {
731 	uint32_t changed_mask;
732 	mac_client_impl_t *mcip = flent->fe_mcip;
733 	mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
734 	mac_resource_props_t *emrp = MCIP_EFFECTIVE_PROPS(mcip);
735 	cpupart_t *cpupart = NULL;
736 	boolean_t use_default = B_FALSE;
737 
738 	ASSERT(flent != NULL);
739 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
740 
741 	rw_enter(&ft->ft_lock, RW_WRITER);
742 
743 	/* Update the cached values inside the subflow entry */
744 	changed_mask = mac_flow_modify_props(flent, mrp);
745 	rw_exit(&ft->ft_lock);
746 	/*
747 	 * Push the changed parameters to the scheduling code in the
748 	 * SRS's, to take effect right away.
749 	 */
750 	if (changed_mask & MRP_MAXBW) {
751 		mac_srs_update_bwlimit(flent, mrp);
752 		/*
753 		 * If bandwidth is changed, we may have to change
754 		 * the number of soft ring to be used for fanout.
755 		 * Call mac_flow_update_fanout() if MAC_BIND_CPU
756 		 * is not set and there is no user supplied cpu
757 		 * info. This applies only to link at this time.
758 		 */
759 		if (!(flent->fe_type & FLOW_USER) &&
760 		    !(changed_mask & MRP_CPUS) &&
761 		    !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
762 			mac_fanout_setup(mcip, flent, mcip_mrp,
763 			    mac_rx_deliver, mcip, NULL, NULL);
764 		}
765 	}
766 	if (mrp->mrp_mask & MRP_PRIORITY)
767 		mac_flow_update_priority(mcip, flent);
768 
769 	if (changed_mask & MRP_CPUS)
770 		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
771 		    NULL);
772 
773 	if (mrp->mrp_mask & MRP_POOL) {
774 		pool_lock();
775 		cpupart = mac_pset_find(mrp, &use_default);
776 		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
777 		    cpupart);
778 		mac_set_pool_effective(use_default, cpupart, mrp, emrp);
779 		pool_unlock();
780 	}
781 }
782 
783 /*
784  * This function waits for a certain condition to be met and is generally
785  * used before a destructive or quiescing operation.
786  */
787 void
788 mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
789 {
790 	mutex_enter(&flent->fe_lock);
791 	flent->fe_flags |= FE_WAITER;
792 
793 	switch (event) {
794 	case FLOW_DRIVER_UPCALL:
795 		/*
796 		 * We want to make sure the driver upcalls have finished before
797 		 * we signal the Rx SRS worker to quit.
798 		 */
799 		while (flent->fe_refcnt != 1)
800 			cv_wait(&flent->fe_cv, &flent->fe_lock);
801 		break;
802 
803 	case FLOW_USER_REF:
804 		/*
805 		 * Wait for the fe_user_refcnt to drop to 0. The flow has
806 		 * been removed from the global flow hash.
807 		 */
808 		ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH));
809 		while (flent->fe_user_refcnt != 0)
810 			cv_wait(&flent->fe_cv, &flent->fe_lock);
811 		break;
812 
813 	default:
814 		ASSERT(0);
815 	}
816 
817 	flent->fe_flags &= ~FE_WAITER;
818 	mutex_exit(&flent->fe_lock);
819 }
820 
821 static boolean_t
822 mac_flow_clean(flow_entry_t *flent)
823 {
824 	ASSERT(flent->fe_next == NULL);
825 	ASSERT(flent->fe_tx_srs == NULL);
826 	ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL);
827 	ASSERT(flent->fe_mbg == NULL);
828 
829 	return (B_TRUE);
830 }
831 
832 void
833 mac_flow_cleanup(flow_entry_t *flent)
834 {
835 	if ((flent->fe_type & FLOW_USER) == 0) {
836 		ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) ||
837 		    (flent->fe_mbg != NULL && flent->fe_mcip == NULL));
838 		ASSERT(flent->fe_refcnt == 0);
839 	} else {
840 		ASSERT(flent->fe_refcnt == 1);
841 	}
842 
843 	if (flent->fe_mbg != NULL) {
844 		ASSERT(flent->fe_tx_srs == NULL);
845 		/* This is a multicast or broadcast flow entry */
846 		mac_bcast_grp_free(flent->fe_mbg);
847 		flent->fe_mbg = NULL;
848 	}
849 
850 	if (flent->fe_tx_srs != NULL) {
851 		ASSERT(flent->fe_mbg == NULL);
852 		mac_srs_free(flent->fe_tx_srs);
853 		flent->fe_tx_srs = NULL;
854 	}
855 
856 	/*
857 	 * In the normal case fe_rx_srs_cnt is 1. However in the error case
858 	 * when mac_unicast_add fails we may not have set up any SRS
859 	 * in which case fe_rx_srs_cnt will be zero.
860 	 */
861 	if (flent->fe_rx_srs_cnt != 0) {
862 		ASSERT(flent->fe_rx_srs_cnt == 1);
863 		mac_srs_free(flent->fe_rx_srs[0]);
864 		flent->fe_rx_srs[0] = NULL;
865 		flent->fe_rx_srs_cnt = 0;
866 	}
867 	ASSERT(flent->fe_rx_srs[0] == NULL);
868 }
869 
870 void
871 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
872 {
873 	/*
874 	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
875 	 * Updates to the fe_flow_desc happen under the fe_lock
876 	 * after removing the flent from the flow table
877 	 */
878 	mutex_enter(&flent->fe_lock);
879 	bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
880 	mutex_exit(&flent->fe_lock);
881 }
882 
883 /*
884  * Update a field of a flow entry. The mac perimeter ensures that
885  * this is the only thread doing a modify operation on this mac end point.
886  * So the flow table can't change or disappear. The ft_lock protects access
887  * to the flow entry, and holding the lock ensures that there isn't any thread
888  * accessing the flow entry or attempting a flow table lookup. However
889  * data threads that are using the flow entry based on the old descriptor
890  * will continue to use the flow entry. If strong coherence is required
891  * then the flow will have to be quiesced before the descriptor can be
892  * changed.
893  */
894 void
895 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
896 {
897 	flow_tab_t	*ft = flent->fe_flow_tab;
898 	flow_desc_t	old_desc;
899 	int		err;
900 
901 	if (ft == NULL) {
902 		/*
903 		 * The flow hasn't yet been inserted into the table,
904 		 * so only the caller knows about this flow, however for
905 		 * uniformity we grab the fe_lock here.
906 		 */
907 		mutex_enter(&flent->fe_lock);
908 		bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
909 		mutex_exit(&flent->fe_lock);
910 	}
911 
912 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
913 
914 	/*
915 	 * Need to remove the flow entry from the table and reinsert it,
916 	 * into a potentially diference hash line. The hash depends on
917 	 * the new descriptor fields. However access to fe_desc itself
918 	 * is always under the fe_lock. This helps log and stat functions
919 	 * see a self-consistent fe_flow_desc.
920 	 */
921 	mac_flow_remove(ft, flent, B_TRUE);
922 	old_desc = flent->fe_flow_desc;
923 
924 	mutex_enter(&flent->fe_lock);
925 	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
926 	mutex_exit(&flent->fe_lock);
927 
928 	if (mac_flow_add(ft, flent) != 0) {
929 		/*
930 		 * The add failed say due to an invalid flow descriptor.
931 		 * Undo the update
932 		 */
933 		flent->fe_flow_desc = old_desc;
934 		err = mac_flow_add(ft, flent);
935 		ASSERT(err == 0);
936 	}
937 }
938 
939 void
940 mac_flow_set_name(flow_entry_t *flent, const char *name)
941 {
942 	flow_tab_t	*ft = flent->fe_flow_tab;
943 
944 	if (ft == NULL) {
945 		/*
946 		 *  The flow hasn't yet been inserted into the table,
947 		 * so only the caller knows about this flow
948 		 */
949 		(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
950 	} else {
951 		ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
952 	}
953 
954 	mutex_enter(&flent->fe_lock);
955 	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
956 	mutex_exit(&flent->fe_lock);
957 }
958 
959 /*
960  * Return the client-private cookie that was associated with
961  * the flow when it was created.
962  */
963 void *
964 mac_flow_get_client_cookie(flow_entry_t *flent)
965 {
966 	return (flent->fe_client_cookie);
967 }
968 
969 /*
970  * Forward declarations.
971  */
972 static uint32_t	flow_l2_hash(flow_tab_t *, flow_state_t *);
973 static uint32_t	flow_l2_hash_fe(flow_tab_t *, flow_entry_t *);
974 static int	flow_l2_accept(flow_tab_t *, flow_state_t *);
975 static uint32_t	flow_ether_hash(flow_tab_t *, flow_state_t *);
976 static uint32_t	flow_ether_hash_fe(flow_tab_t *, flow_entry_t *);
977 static int	flow_ether_accept(flow_tab_t *, flow_state_t *);
978 
979 /*
980  * Create flow table.
981  */
982 void
983 mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
984     mac_impl_t *mip, flow_tab_t **ftp)
985 {
986 	flow_tab_t	*ft;
987 	flow_ops_t	*new_ops;
988 
989 	ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
990 	bzero(ft, sizeof (*ft));
991 
992 	ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);
993 
994 	/*
995 	 * We make a copy of the ops vector instead of just pointing to it
996 	 * because we might want to customize the ops vector on a per table
997 	 * basis (e.g. for optimization).
998 	 */
999 	new_ops = &ft->ft_ops;
1000 	bcopy(ops, new_ops, sizeof (*ops));
1001 	ft->ft_mask = mask;
1002 	ft->ft_size = size;
1003 	ft->ft_mip = mip;
1004 
1005 	/*
1006 	 * Optimizations for DL_ETHER media.
1007 	 */
1008 	if (mip->mi_info.mi_nativemedia == DL_ETHER) {
1009 		if (new_ops->fo_hash == flow_l2_hash)
1010 			new_ops->fo_hash = flow_ether_hash;
1011 		if (new_ops->fo_hash_fe == flow_l2_hash_fe)
1012 			new_ops->fo_hash_fe = flow_ether_hash_fe;
1013 		if (new_ops->fo_accept[0] == flow_l2_accept)
1014 			new_ops->fo_accept[0] = flow_ether_accept;
1015 	}
1016 	*ftp = ft;
1017 }
1018 
1019 void
1020 mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
1021 {
1022 	mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
1023 	    1024, mip, ftp);
1024 }
1025 
1026 /*
1027  * Destroy flow table.
1028  */
1029 void
1030 mac_flow_tab_destroy(flow_tab_t *ft)
1031 {
1032 	if (ft == NULL)
1033 		return;
1034 
1035 	ASSERT(ft->ft_flow_count == 0);
1036 	kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
1037 	bzero(ft, sizeof (*ft));
1038 	kmem_cache_free(flow_tab_cache, ft);
1039 }
1040 
1041 /*
1042  * Add a new flow entry to the global flow hash table
1043  */
1044 int
1045 mac_flow_hash_add(flow_entry_t *flent)
1046 {
1047 	int	err;
1048 
1049 	rw_enter(&flow_tab_lock, RW_WRITER);
1050 	err = mod_hash_insert(flow_hash,
1051 	    (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
1052 	if (err != 0) {
1053 		rw_exit(&flow_tab_lock);
1054 		return (EEXIST);
1055 	}
1056 	/* Mark as inserted into the global flow hash table */
1057 	FLOW_MARK(flent, FE_G_FLOW_HASH);
1058 	rw_exit(&flow_tab_lock);
1059 	return (err);
1060 }
1061 
1062 /*
1063  * Remove a flow entry from the global flow hash table
1064  */
1065 void
1066 mac_flow_hash_remove(flow_entry_t *flent)
1067 {
1068 	mod_hash_val_t	val;
1069 
1070 	rw_enter(&flow_tab_lock, RW_WRITER);
1071 	VERIFY(mod_hash_remove(flow_hash,
1072 	    (mod_hash_key_t)flent->fe_flow_name, &val) == 0);
1073 
1074 	/* Clear the mark that says inserted into the global flow hash table */
1075 	FLOW_UNMARK(flent, FE_G_FLOW_HASH);
1076 	rw_exit(&flow_tab_lock);
1077 }
1078 
1079 /*
1080  * Retrieve a flow entry from the global flow hash table.
1081  */
1082 int
1083 mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
1084 {
1085 	int		err;
1086 	flow_entry_t	*flent;
1087 
1088 	rw_enter(&flow_tab_lock, RW_READER);
1089 	err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
1090 	    (mod_hash_val_t *)&flent);
1091 	if (err != 0) {
1092 		rw_exit(&flow_tab_lock);
1093 		return (ENOENT);
1094 	}
1095 	ASSERT(flent != NULL);
1096 	FLOW_USER_REFHOLD(flent);
1097 	rw_exit(&flow_tab_lock);
1098 
1099 	*flentp = flent;
1100 	return (0);
1101 }
1102 
1103 /*
1104  * Initialize or release mac client flows by walking the subflow table.
1105  * These are typically invoked during plumb/unplumb of links.
1106  */
1107 
1108 static int
1109 mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
1110 {
1111 	mac_client_impl_t	*mcip = arg;
1112 
1113 	if (mac_link_flow_init(arg, flent) != 0) {
1114 		cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
1115 		    flent->fe_flow_name, mcip->mci_name);
1116 	} else {
1117 		FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
1118 	}
1119 	return (0);
1120 }
1121 
1122 void
1123 mac_link_init_flows(mac_client_handle_t mch)
1124 {
1125 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1126 
1127 	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1128 	    mac_link_init_flows_cb, mcip);
1129 	/*
1130 	 * If mac client had subflow(s) configured before plumb, change
1131 	 * function to mac_rx_srs_subflow_process and in case of hardware
1132 	 * classification, disable polling.
1133 	 */
1134 	mac_client_update_classifier(mcip, B_TRUE);
1135 
1136 }
1137 
1138 boolean_t
1139 mac_link_has_flows(mac_client_handle_t mch)
1140 {
1141 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1142 
1143 	if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
1144 		return (B_TRUE);
1145 
1146 	return (B_FALSE);
1147 }
1148 
1149 static int
1150 mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
1151 {
1152 	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
1153 	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1154 	mac_link_flow_clean(arg, flent);
1155 	return (0);
1156 }
1157 
1158 void
1159 mac_link_release_flows(mac_client_handle_t mch)
1160 {
1161 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1162 
1163 	/*
1164 	 * Change the mci_flent callback back to mac_rx_srs_process()
1165 	 * because flows are about to be deactivated.
1166 	 */
1167 	mac_client_update_classifier(mcip, B_FALSE);
1168 	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1169 	    mac_link_release_flows_cb, mcip);
1170 }
1171 
1172 void
1173 mac_rename_flow(flow_entry_t *fep, const char *new_name)
1174 {
1175 	mac_flow_set_name(fep, new_name);
1176 	if (fep->fe_ksp != NULL) {
1177 		flow_stat_destroy(fep);
1178 		flow_stat_create(fep);
1179 	}
1180 }
1181 
1182 /*
1183  * mac_link_flow_init()
1184  * Internal flow interface used for allocating SRSs and related
1185  * data structures. Not meant to be used by mac clients.
1186  */
1187 int
1188 mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
1189 {
1190 	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
1191 	mac_impl_t		*mip = mcip->mci_mip;
1192 	int			err;
1193 
1194 	ASSERT(mch != NULL);
1195 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1196 
1197 	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
1198 		return (err);
1199 
1200 	sub_flow->fe_mcip = mcip;
1201 
1202 	return (0);
1203 }
1204 
1205 /*
1206  * mac_link_flow_add()
1207  * Used by flowadm(8) or kernel mac clients for creating flows.
1208  */
1209 int
1210 mac_link_flow_add(datalink_id_t linkid, char *flow_name,
1211     flow_desc_t *flow_desc, mac_resource_props_t *mrp)
1212 {
1213 	flow_entry_t		*flent = NULL;
1214 	int			err;
1215 	dls_dl_handle_t		dlh;
1216 	dls_link_t		*dlp;
1217 	boolean_t		link_held = B_FALSE;
1218 	boolean_t		hash_added = B_FALSE;
1219 	mac_perim_handle_t	mph;
1220 
1221 	err = mac_flow_lookup_byname(flow_name, &flent);
1222 	if (err == 0) {
1223 		FLOW_USER_REFRELE(flent);
1224 		return (EEXIST);
1225 	}
1226 
1227 	/*
1228 	 * First create a flow entry given the description provided
1229 	 * by the caller.
1230 	 */
1231 	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
1232 	    FLOW_USER | FLOW_OTHER, &flent);
1233 
1234 	if (err != 0)
1235 		return (err);
1236 
1237 	/*
1238 	 * We've got a local variable referencing this flow now, so we need
1239 	 * to hold it. We'll release this flow before returning.
1240 	 * All failures until we return will undo any action that may internally
1241 	 * held the flow, so the last REFRELE will assure a clean freeing
1242 	 * of resources.
1243 	 */
1244 	FLOW_REFHOLD(flent);
1245 
1246 	flent->fe_link_id = linkid;
1247 	FLOW_MARK(flent, FE_INCIPIENT);
1248 
1249 	err = mac_perim_enter_by_linkid(linkid, &mph);
1250 	if (err != 0) {
1251 		FLOW_FINAL_REFRELE(flent);
1252 		return (err);
1253 	}
1254 
1255 	/*
1256 	 * dls will eventually be merged with mac so it's ok
1257 	 * to call dls' internal functions.
1258 	 */
1259 	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1260 	if (err != 0)
1261 		goto bail;
1262 
1263 	link_held = B_TRUE;
1264 
1265 	/*
1266 	 * Add the flow to the global flow table, this table will be per
1267 	 * exclusive zone so each zone can have its own flow namespace.
1268 	 * RFE 6625651 will fix this.
1269 	 *
1270 	 */
1271 	if ((err = mac_flow_hash_add(flent)) != 0)
1272 		goto bail;
1273 
1274 	hash_added = B_TRUE;
1275 
1276 	/*
1277 	 * do not allow flows to be configured on an anchor VNIC
1278 	 */
1279 	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
1280 		err = ENOTSUP;
1281 		goto bail;
1282 	}
1283 
1284 	/*
1285 	 * Add the subflow to the subflow table. Also instantiate the flow
1286 	 * in the mac if there is an active user (we check if the MAC client's
1287 	 * datapath has been setup).
1288 	 */
1289 	err = mac_flow_add_subflow(dlp->dl_mch, flent,
1290 	    MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
1291 	if (err != 0)
1292 		goto bail;
1293 
1294 	FLOW_UNMARK(flent, FE_INCIPIENT);
1295 	dls_devnet_rele_link(dlh, dlp);
1296 	mac_perim_exit(mph);
1297 	return (0);
1298 
1299 bail:
1300 	if (hash_added)
1301 		mac_flow_hash_remove(flent);
1302 
1303 	if (link_held)
1304 		dls_devnet_rele_link(dlh, dlp);
1305 
1306 	/*
1307 	 * Wait for any transient global flow hash refs to clear
1308 	 * and then release the creation reference on the flow
1309 	 */
1310 	mac_flow_wait(flent, FLOW_USER_REF);
1311 	FLOW_FINAL_REFRELE(flent);
1312 	mac_perim_exit(mph);
1313 	return (err);
1314 }
1315 
1316 /*
1317  * mac_link_flow_clean()
1318  * Internal flow interface used for freeing SRSs and related
1319  * data structures. Not meant to be used by mac clients.
1320  */
1321 void
1322 mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
1323 {
1324 	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
1325 	mac_impl_t		*mip = mcip->mci_mip;
1326 	boolean_t		last_subflow;
1327 
1328 	ASSERT(mch != NULL);
1329 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1330 
1331 	/*
1332 	 * This sub flow entry may fail to be fully initialized by
1333 	 * mac_link_flow_init(). If so, simply return.
1334 	 */
1335 	if (sub_flow->fe_mcip == NULL)
1336 		return;
1337 
1338 	last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
1339 	/*
1340 	 * Tear down the data path
1341 	 */
1342 	mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
1343 	sub_flow->fe_mcip = NULL;
1344 
1345 	/*
1346 	 * Delete the SRSs associated with this subflow. If this is being
1347 	 * driven by flowadm(8) then the subflow will be deleted by
1348 	 * dls_rem_flow. However if this is a result of the interface being
1349 	 * unplumbed then the subflow itself won't be deleted.
1350 	 */
1351 	mac_flow_cleanup(sub_flow);
1352 
1353 	/*
1354 	 * If all the subflows are gone, renable some of the stuff
1355 	 * we disabled when adding a subflow, polling etc.
1356 	 */
1357 	if (last_subflow) {
1358 		/*
1359 		 * The subflow table itself is not protected by any locks or
1360 		 * refcnts. Hence quiesce the client upfront before clearing
1361 		 * mci_subflow_tab.
1362 		 */
1363 		mac_client_quiesce(mcip);
1364 		mac_client_update_classifier(mcip, B_FALSE);
1365 		mac_flow_tab_destroy(mcip->mci_subflow_tab);
1366 		mcip->mci_subflow_tab = NULL;
1367 		mac_client_restart(mcip);
1368 	}
1369 }
1370 
1371 /*
1372  * mac_link_flow_remove()
1373  * Used by flowadm(8) or kernel mac clients for removing flows.
1374  */
1375 int
1376 mac_link_flow_remove(char *flow_name)
1377 {
1378 	flow_entry_t		*flent;
1379 	mac_perim_handle_t	mph;
1380 	int			err;
1381 	datalink_id_t		linkid;
1382 
1383 	err = mac_flow_lookup_byname(flow_name, &flent);
1384 	if (err != 0)
1385 		return (err);
1386 
1387 	linkid = flent->fe_link_id;
1388 	FLOW_USER_REFRELE(flent);
1389 
1390 	/*
1391 	 * The perim must be acquired before acquiring any other references
1392 	 * to maintain the lock and perimeter hierarchy. Please note the
1393 	 * FLOW_REFRELE above.
1394 	 */
1395 	err = mac_perim_enter_by_linkid(linkid, &mph);
1396 	if (err != 0)
1397 		return (err);
1398 
1399 	/*
1400 	 * Note the second lookup of the flow, because a concurrent thread
1401 	 * may have removed it already while we were waiting to enter the
1402 	 * link's perimeter.
1403 	 */
1404 	err = mac_flow_lookup_byname(flow_name, &flent);
1405 	if (err != 0) {
1406 		mac_perim_exit(mph);
1407 		return (err);
1408 	}
1409 	FLOW_USER_REFRELE(flent);
1410 
1411 	/*
1412 	 * Remove the flow from the subflow table and deactivate the flow
1413 	 * by quiescing and removings its SRSs
1414 	 */
1415 	mac_flow_rem_subflow(flent);
1416 
1417 	/*
1418 	 * Finally, remove the flow from the global table.
1419 	 */
1420 	mac_flow_hash_remove(flent);
1421 
1422 	/*
1423 	 * Wait for any transient global flow hash refs to clear
1424 	 * and then release the creation reference on the flow
1425 	 */
1426 	mac_flow_wait(flent, FLOW_USER_REF);
1427 	FLOW_FINAL_REFRELE(flent);
1428 
1429 	mac_perim_exit(mph);
1430 
1431 	return (0);
1432 }
1433 
1434 /*
1435  * mac_link_flow_modify()
1436  * Modifies the properties of a flow identified by its name.
1437  */
1438 int
1439 mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
1440 {
1441 	flow_entry_t		*flent;
1442 	mac_client_impl_t 	*mcip;
1443 	int			err = 0;
1444 	mac_perim_handle_t	mph;
1445 	datalink_id_t		linkid;
1446 	flow_tab_t		*flow_tab;
1447 
1448 	err = mac_validate_props(NULL, mrp);
1449 	if (err != 0)
1450 		return (err);
1451 
1452 	err = mac_flow_lookup_byname(flow_name, &flent);
1453 	if (err != 0)
1454 		return (err);
1455 
1456 	linkid = flent->fe_link_id;
1457 	FLOW_USER_REFRELE(flent);
1458 
1459 	/*
1460 	 * The perim must be acquired before acquiring any other references
1461 	 * to maintain the lock and perimeter hierarchy. Please note the
1462 	 * FLOW_REFRELE above.
1463 	 */
1464 	err = mac_perim_enter_by_linkid(linkid, &mph);
1465 	if (err != 0)
1466 		return (err);
1467 
1468 	/*
1469 	 * Note the second lookup of the flow, because a concurrent thread
1470 	 * may have removed it already while we were waiting to enter the
1471 	 * link's perimeter.
1472 	 */
1473 	err = mac_flow_lookup_byname(flow_name, &flent);
1474 	if (err != 0) {
1475 		mac_perim_exit(mph);
1476 		return (err);
1477 	}
1478 	FLOW_USER_REFRELE(flent);
1479 
1480 	/*
1481 	 * If this flow is attached to a MAC client, then pass the request
1482 	 * along to the client.
1483 	 * Otherwise, just update the cached values.
1484 	 */
1485 	mcip = flent->fe_mcip;
1486 	mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
1487 	if (mcip != NULL) {
1488 		if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
1489 			err = ENOENT;
1490 		} else {
1491 			mac_flow_modify(flow_tab, flent, mrp);
1492 		}
1493 	} else {
1494 		(void) mac_flow_modify_props(flent, mrp);
1495 	}
1496 
1497 done:
1498 	mac_perim_exit(mph);
1499 	return (err);
1500 }
1501 
1502 
/*
 * State structure and misc functions used by mac_link_flow_walk().
 */
typedef struct {
	/* callback invoked by mac_link_flow_walk_cb() with a flow's info */
	int	(*ws_func)(mac_flowinfo_t *, void *);
	/* opaque argument passed as the second parameter of ws_func */
	void	*ws_arg;
} flow_walk_state_t;
1510 
1511 static void
1512 mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent)
1513 {
1514 	(void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name,
1515 	    MAXFLOWNAMELEN);
1516 	finfop->fi_link_id = flent->fe_link_id;
1517 	finfop->fi_flow_desc = flent->fe_flow_desc;
1518 	finfop->fi_resource_props = flent->fe_resource_props;
1519 }
1520 
1521 static int
1522 mac_link_flow_walk_cb(flow_entry_t *flent, void *arg)
1523 {
1524 	flow_walk_state_t	*statep = arg;
1525 	mac_flowinfo_t		*finfo;
1526 	int			err;
1527 
1528 	finfo = kmem_zalloc(sizeof (*finfo), KM_SLEEP);
1529 	mac_link_flowinfo_copy(finfo, flent);
1530 	err = statep->ws_func(finfo, statep->ws_arg);
1531 	kmem_free(finfo, sizeof (*finfo));
1532 	return (err);
1533 }
1534 
1535 /*
1536  * mac_link_flow_walk()
1537  * Invokes callback 'func' for all flows belonging to the specified link.
1538  */
1539 int
1540 mac_link_flow_walk(datalink_id_t linkid,
1541     int (*func)(mac_flowinfo_t *, void *), void *arg)
1542 {
1543 	mac_client_impl_t	*mcip;
1544 	mac_perim_handle_t	mph;
1545 	flow_walk_state_t	state;
1546 	dls_dl_handle_t		dlh;
1547 	dls_link_t		*dlp;
1548 	int			err;
1549 
1550 	err = mac_perim_enter_by_linkid(linkid, &mph);
1551 	if (err != 0)
1552 		return (err);
1553 
1554 	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1555 	if (err != 0) {
1556 		mac_perim_exit(mph);
1557 		return (err);
1558 	}
1559 
1560 	mcip = (mac_client_impl_t *)dlp->dl_mch;
1561 	state.ws_func = func;
1562 	state.ws_arg = arg;
1563 
1564 	err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
1565 	    mac_link_flow_walk_cb, &state);
1566 
1567 	dls_devnet_rele_link(dlh, dlp);
1568 	mac_perim_exit(mph);
1569 	return (err);
1570 }
1571 
1572 /*
1573  * mac_link_flow_info()
1574  * Retrieves information about a specific flow.
1575  */
1576 int
1577 mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
1578 {
1579 	flow_entry_t	*flent;
1580 	int		err;
1581 
1582 	err = mac_flow_lookup_byname(flow_name, &flent);
1583 	if (err != 0)
1584 		return (err);
1585 
1586 	mac_link_flowinfo_copy(finfo, flent);
1587 	FLOW_USER_REFRELE(flent);
1588 	return (0);
1589 }
1590 
/*
 * Hash an Ethernet address 'a' and VLAN id 'v' into a table of 's'
 * buckets: the sum of the last three address octets is XORed with the
 * VLAN id and reduced modulo the table size.
 */
#define	HASH_ETHER_VID(a, v, s)					\
	((((uint32_t)(a)[3] + (uint32_t)(a)[4] +		\
	(uint32_t)(a)[5]) ^ (v)) % (s))
1596 
/*
 * Generic layer-2 address hash (the DJB hash): starting from 5381, each
 * of the 'addrlen' address bytes is folded in as hash = hash * 33 + byte
 * using 32-bit arithmetic, and the result is reduced modulo 'htsize'.
 */
static uint32_t
flow_l2_addrhash(uint8_t *addr, size_t addrlen, size_t htsize)
{
	uint32_t	h = 5381;

	for (; addrlen != 0; addrlen--, addr++)
		h = h * 33 + *addr;

	return (h % htsize);
}
1611 
/*
 * True if 'end' lies beyond the write pointer of the current mblk of
 * flow state 's', i.e. the data up to 'end' is not all in this mblk.
 */
#define	PKT_TOO_SMALL(s, end) ((end) > (s)->fs_mp->b_wptr)

/*
 * If 'start' sits exactly at the write pointer of the current mblk,
 * advance the flow state to the continuation mblk and point 'start' at
 * its read pointer.  NOTE: when there is no continuation mblk this macro
 * returns EINVAL from the enclosing function.
 */
#define	CHECK_AND_ADJUST_START_PTR(s, start) {			\
	if ((start) == (s)->fs_mp->b_wptr) {			\
		mblk_t	*cont = (s)->fs_mp->b_cont;		\
		if (cont == NULL)				\
			return (EINVAL);			\
		(s)->fs_mp = cont;				\
		(start) = cont->b_rptr;				\
	}							\
}
1624 
1625 /* ARGSUSED */
1626 static boolean_t
1627 flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1628 {
1629 	flow_l2info_t		*l2 = &s->fs_l2info;
1630 	flow_desc_t		*fd = &flent->fe_flow_desc;
1631 
1632 	return (l2->l2_vid == fd->fd_vid &&
1633 	    bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
1634 }
1635 
1636 /*
1637  * Layer 2 hash function.
1638  * Must be paired with flow_l2_accept() within a set of flow_ops
1639  * because it assumes the dest address is already extracted.
1640  */
1641 static uint32_t
1642 flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
1643 {
1644 	return (flow_l2_addrhash(s->fs_l2info.l2_daddr,
1645 	    ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
1646 }
1647 
1648 /*
1649  * This is the generic layer 2 accept function.
1650  * It makes use of mac_header_info() to extract the header length,
1651  * sap, vlan ID and destination address.
1652  */
1653 static int
1654 flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
1655 {
1656 	boolean_t		is_ether;
1657 	flow_l2info_t		*l2 = &s->fs_l2info;
1658 	mac_header_info_t	mhi;
1659 	int			err;
1660 
1661 	is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
1662 	if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
1663 	    s->fs_mp, &mhi)) != 0) {
1664 		if (err == EINVAL)
1665 			err = ENOBUFS;
1666 
1667 		return (err);
1668 	}
1669 
1670 	l2->l2_start = s->fs_mp->b_rptr;
1671 	l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;
1672 
1673 	if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
1674 	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1675 		struct ether_vlan_header	*evhp =
1676 		    (struct ether_vlan_header *)l2->l2_start;
1677 
1678 		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1679 			return (ENOBUFS);
1680 
1681 		l2->l2_sap = ntohs(evhp->ether_type);
1682 		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1683 		l2->l2_hdrsize = sizeof (*evhp);
1684 	} else {
1685 		l2->l2_sap = mhi.mhi_bindsap;
1686 		l2->l2_vid = 0;
1687 		l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
1688 	}
1689 	return (0);
1690 }
1691 
1692 /*
1693  * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
1694  * accept(). The notable difference is that dest address is now extracted
1695  * by hash() rather than by accept(). This saves a few memory references
1696  * for flow tables that do not care about mac addresses.
1697  */
1698 static uint32_t
1699 flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
1700 {
1701 	flow_l2info_t			*l2 = &s->fs_l2info;
1702 	struct ether_vlan_header	*evhp;
1703 
1704 	evhp = (struct ether_vlan_header *)l2->l2_start;
1705 	l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
1706 	return (HASH_ETHER_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
1707 }
1708 
1709 static uint32_t
1710 flow_ether_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1711 {
1712 	flow_desc_t	*fd = &flent->fe_flow_desc;
1713 
1714 	ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
1715 	return (HASH_ETHER_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
1716 }
1717 
1718 /* ARGSUSED */
1719 static int
1720 flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
1721 {
1722 	flow_l2info_t			*l2 = &s->fs_l2info;
1723 	struct ether_vlan_header	*evhp;
1724 	uint16_t			sap;
1725 
1726 	evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
1727 	l2->l2_start = (uchar_t *)evhp;
1728 
1729 	if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
1730 		return (ENOBUFS);
1731 
1732 	if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
1733 	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1734 		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1735 			return (ENOBUFS);
1736 
1737 		l2->l2_sap = ntohs(evhp->ether_type);
1738 		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1739 		l2->l2_hdrsize = sizeof (struct ether_vlan_header);
1740 	} else {
1741 		l2->l2_sap = sap;
1742 		l2->l2_vid = 0;
1743 		l2->l2_hdrsize = sizeof (struct ether_header);
1744 	}
1745 	return (0);
1746 }
1747 
1748 /*
1749  * Validates a layer 2 flow entry.
1750  */
1751 static int
1752 flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1753 {
1754 	flow_desc_t	*fd = &flent->fe_flow_desc;
1755 
1756 	/*
1757 	 * Dest address is mandatory, and 0 length addresses are not yet
1758 	 * supported.
1759 	 */
1760 	if ((fd->fd_mask & FLOW_LINK_DST) == 0 || fd->fd_mac_len == 0)
1761 		return (EINVAL);
1762 
1763 	if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
1764 		/*
1765 		 * VLAN flows are only supported over ethernet macs.
1766 		 */
1767 		if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
1768 			return (EINVAL);
1769 
1770 		if (fd->fd_vid == 0)
1771 			return (EINVAL);
1772 
1773 	}
1774 	flent->fe_match = flow_l2_match;
1775 	return (0);
1776 }
1777 
1778 /*
1779  * Calculates hash index of flow entry.
1780  */
1781 static uint32_t
1782 flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1783 {
1784 	flow_desc_t	*fd = &flent->fe_flow_desc;
1785 
1786 	ASSERT((fd->fd_mask & FLOW_LINK_VID) == 0 && fd->fd_vid == 0);
1787 	return (flow_l2_addrhash(fd->fd_dst_mac,
1788 	    ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
1789 }
1790 
1791 /*
1792  * This is used for duplicate flow checking.
1793  */
1794 /* ARGSUSED */
1795 static boolean_t
1796 flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
1797 {
1798 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
1799 
1800 	ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
1801 	return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
1802 	    fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
1803 }
1804 
1805 /*
1806  * Generic flow entry insertion function.
1807  * Used by flow tables that do not have ordering requirements.
1808  */
1809 /* ARGSUSED */
1810 static int
1811 flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
1812     flow_entry_t *flent)
1813 {
1814 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
1815 
1816 	if (*headp != NULL) {
1817 		ASSERT(flent->fe_next == NULL);
1818 		flent->fe_next = *headp;
1819 	}
1820 	*headp = flent;
1821 	return (0);
1822 }
1823 
1824 /*
1825  * IP version independent DSField matching function.
1826  */
1827 /* ARGSUSED */
1828 static boolean_t
1829 flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1830 {
1831 	flow_l3info_t	*l3info = &s->fs_l3info;
1832 	flow_desc_t	*fd = &flent->fe_flow_desc;
1833 
1834 	switch (l3info->l3_version) {
1835 	case IPV4_VERSION: {
1836 		ipha_t		*ipha = (ipha_t *)l3info->l3_start;
1837 
1838 		return ((ipha->ipha_type_of_service &
1839 		    fd->fd_dsfield_mask) == fd->fd_dsfield);
1840 	}
1841 	case IPV6_VERSION: {
1842 		ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
1843 
1844 		return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
1845 		    fd->fd_dsfield_mask) == fd->fd_dsfield);
1846 	}
1847 	default:
1848 		return (B_FALSE);
1849 	}
1850 }
1851 
1852 /*
1853  * IP v4 and v6 address matching.
1854  * The netmask only needs to be applied on the packet but not on the
1855  * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets.
1856  */
1857 
1858 /* ARGSUSED */
1859 static boolean_t
1860 flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1861 {
1862 	flow_l3info_t	*l3info = &s->fs_l3info;
1863 	flow_desc_t	*fd = &flent->fe_flow_desc;
1864 	ipha_t		*ipha = (ipha_t *)l3info->l3_start;
1865 	in_addr_t	addr;
1866 
1867 	addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
1868 	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1869 		return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
1870 		    V4_PART_OF_V6(fd->fd_local_addr));
1871 	}
1872 	return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
1873 	    V4_PART_OF_V6(fd->fd_remote_addr));
1874 }
1875 
1876 /* ARGSUSED */
1877 static boolean_t
1878 flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1879 {
1880 	flow_l3info_t	*l3info = &s->fs_l3info;
1881 	flow_desc_t	*fd = &flent->fe_flow_desc;
1882 	ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
1883 	in6_addr_t	*addrp;
1884 
1885 	addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
1886 	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1887 		return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
1888 		    fd->fd_local_addr));
1889 	}
1890 	return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
1891 }
1892 
1893 /* ARGSUSED */
1894 static boolean_t
1895 flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1896 {
1897 	flow_l3info_t	*l3info = &s->fs_l3info;
1898 	flow_desc_t	*fd = &flent->fe_flow_desc;
1899 
1900 	return (l3info->l3_protocol == fd->fd_protocol);
1901 }
1902 
1903 static uint32_t
1904 flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
1905 {
1906 	flow_l3info_t	*l3info = &s->fs_l3info;
1907 	flow_mask_t	mask = ft->ft_mask;
1908 
1909 	if ((mask & FLOW_IP_LOCAL) != 0) {
1910 		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
1911 	} else if ((mask & FLOW_IP_REMOTE) != 0) {
1912 		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
1913 	} else if ((mask & FLOW_IP_DSFIELD) != 0) {
1914 		/*
1915 		 * DSField flents are arranged as a single list.
1916 		 */
1917 		return (0);
1918 	}
1919 	/*
1920 	 * IP addr flents are hashed into two lists, v4 or v6.
1921 	 */
1922 	ASSERT(ft->ft_size >= 2);
1923 	return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
1924 }
1925 
1926 static uint32_t
1927 flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
1928 {
1929 	flow_l3info_t	*l3info = &s->fs_l3info;
1930 
1931 	return (l3info->l3_protocol % ft->ft_size);
1932 }
1933 
1934 /* ARGSUSED */
1935 static int
1936 flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
1937 {
1938 	flow_l2info_t	*l2info = &s->fs_l2info;
1939 	flow_l3info_t	*l3info = &s->fs_l3info;
1940 	uint16_t	sap = l2info->l2_sap;
1941 	uchar_t		*l3_start;
1942 
1943 	l3_start = l2info->l2_start + l2info->l2_hdrsize;
1944 
1945 	/*
1946 	 * Adjust start pointer if we're at the end of an mblk.
1947 	 */
1948 	CHECK_AND_ADJUST_START_PTR(s, l3_start);
1949 
1950 	l3info->l3_start = l3_start;
1951 	if (!OK_32PTR(l3_start))
1952 		return (EINVAL);
1953 
1954 	switch (sap) {
1955 	case ETHERTYPE_IP: {
1956 		ipha_t	*ipha = (ipha_t *)l3_start;
1957 
1958 		if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
1959 			return (ENOBUFS);
1960 
1961 		l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
1962 		l3info->l3_protocol = ipha->ipha_protocol;
1963 		l3info->l3_version = IPV4_VERSION;
1964 		l3info->l3_fragmented =
1965 		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
1966 		break;
1967 	}
1968 	case ETHERTYPE_IPV6: {
1969 		ip6_t		*ip6h = (ip6_t *)l3_start;
1970 		ip6_frag_t	*frag = NULL;
1971 		uint16_t	ip6_hdrlen;
1972 		uint8_t		nexthdr;
1973 
1974 		if (!mac_ip_hdr_length_v6(ip6h, s->fs_mp->b_wptr, &ip6_hdrlen,
1975 		    &nexthdr, &frag)) {
1976 			return (ENOBUFS);
1977 		}
1978 		l3info->l3_hdrsize = ip6_hdrlen;
1979 		l3info->l3_protocol = nexthdr;
1980 		l3info->l3_version = IPV6_VERSION;
1981 		l3info->l3_fragmented = (frag != NULL);
1982 		break;
1983 	}
1984 	default:
1985 		return (EINVAL);
1986 	}
1987 	return (0);
1988 }
1989 
1990 /* ARGSUSED */
1991 static int
1992 flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1993 {
1994 	flow_desc_t	*fd = &flent->fe_flow_desc;
1995 
1996 	switch (fd->fd_protocol) {
1997 	case IPPROTO_TCP:
1998 	case IPPROTO_UDP:
1999 	case IPPROTO_SCTP:
2000 	case IPPROTO_ICMP:
2001 	case IPPROTO_ICMPV6:
2002 		flent->fe_match = flow_ip_proto_match;
2003 		return (0);
2004 	default:
2005 		return (EINVAL);
2006 	}
2007 }
2008 
2009 /* ARGSUSED */
2010 static int
2011 flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2012 {
2013 	flow_desc_t	*fd = &flent->fe_flow_desc;
2014 	flow_mask_t	mask;
2015 	uint8_t		version;
2016 	in6_addr_t	*addr, *netmask;
2017 
2018 	/*
2019 	 * DSField does not require a IP version.
2020 	 */
2021 	if (fd->fd_mask == FLOW_IP_DSFIELD) {
2022 		if (fd->fd_dsfield_mask == 0)
2023 			return (EINVAL);
2024 
2025 		flent->fe_match = flow_ip_dsfield_match;
2026 		return (0);
2027 	}
2028 
2029 	/*
2030 	 * IP addresses must come with a version to avoid ambiguity.
2031 	 */
2032 	if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
2033 		return (EINVAL);
2034 
2035 	version = fd->fd_ipversion;
2036 	if (version != IPV4_VERSION && version != IPV6_VERSION)
2037 		return (EINVAL);
2038 
2039 	mask = fd->fd_mask & ~FLOW_IP_VERSION;
2040 	switch (mask) {
2041 	case FLOW_IP_LOCAL:
2042 		addr = &fd->fd_local_addr;
2043 		netmask = &fd->fd_local_netmask;
2044 		break;
2045 	case FLOW_IP_REMOTE:
2046 		addr = &fd->fd_remote_addr;
2047 		netmask = &fd->fd_remote_netmask;
2048 		break;
2049 	default:
2050 		return (EINVAL);
2051 	}
2052 
2053 	/*
2054 	 * Apply netmask onto specified address.
2055 	 */
2056 	V6_MASK_COPY(*addr, *netmask, *addr);
2057 	if (version == IPV4_VERSION) {
2058 		ipaddr_t	v4addr = V4_PART_OF_V6((*addr));
2059 		ipaddr_t	v4mask = V4_PART_OF_V6((*netmask));
2060 
2061 		if (v4addr == 0 || v4mask == 0)
2062 			return (EINVAL);
2063 		flent->fe_match = flow_ip_v4_match;
2064 	} else {
2065 		if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
2066 		    IN6_IS_ADDR_UNSPECIFIED(netmask))
2067 			return (EINVAL);
2068 		flent->fe_match = flow_ip_v6_match;
2069 	}
2070 	return (0);
2071 }
2072 
2073 static uint32_t
2074 flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2075 {
2076 	flow_desc_t	*fd = &flent->fe_flow_desc;
2077 
2078 	return (fd->fd_protocol % ft->ft_size);
2079 }
2080 
2081 static uint32_t
2082 flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2083 {
2084 	flow_desc_t	*fd = &flent->fe_flow_desc;
2085 
2086 	/*
2087 	 * DSField flents are arranged as a single list.
2088 	 */
2089 	if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2090 		return (0);
2091 
2092 	/*
2093 	 * IP addr flents are hashed into two lists, v4 or v6.
2094 	 */
2095 	ASSERT(ft->ft_size >= 2);
2096 	return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
2097 }
2098 
2099 /* ARGSUSED */
2100 static boolean_t
2101 flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2102 {
2103 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2104 
2105 	return (fd1->fd_protocol == fd2->fd_protocol);
2106 }
2107 
2108 /* ARGSUSED */
2109 static boolean_t
2110 flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2111 {
2112 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2113 	in6_addr_t	*a1, *m1, *a2, *m2;
2114 
2115 	ASSERT(fd1->fd_mask == fd2->fd_mask);
2116 	if (fd1->fd_mask == FLOW_IP_DSFIELD) {
2117 		return (fd1->fd_dsfield == fd2->fd_dsfield &&
2118 		    fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
2119 	}
2120 
2121 	/*
2122 	 * flow_ip_accept_fe() already validated the version.
2123 	 */
2124 	ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
2125 	if (fd1->fd_ipversion != fd2->fd_ipversion)
2126 		return (B_FALSE);
2127 
2128 	switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
2129 	case FLOW_IP_LOCAL:
2130 		a1 = &fd1->fd_local_addr;
2131 		m1 = &fd1->fd_local_netmask;
2132 		a2 = &fd2->fd_local_addr;
2133 		m2 = &fd2->fd_local_netmask;
2134 		break;
2135 	case FLOW_IP_REMOTE:
2136 		a1 = &fd1->fd_remote_addr;
2137 		m1 = &fd1->fd_remote_netmask;
2138 		a2 = &fd2->fd_remote_addr;
2139 		m2 = &fd2->fd_remote_netmask;
2140 		break;
2141 	default:
2142 		/*
2143 		 * This is unreachable given the checks in
2144 		 * flow_ip_accept_fe().
2145 		 */
2146 		return (B_FALSE);
2147 	}
2148 
2149 	if (fd1->fd_ipversion == IPV4_VERSION) {
2150 		return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
2151 		    V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));
2152 
2153 	} else {
2154 		return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
2155 		    IN6_ARE_ADDR_EQUAL(m1, m2));
2156 	}
2157 }
2158 
2159 static int
2160 flow_ip_mask2plen(in6_addr_t *v6mask)
2161 {
2162 	int		bits;
2163 	int		plen = IPV6_ABITS;
2164 	int		i;
2165 
2166 	for (i = 3; i >= 0; i--) {
2167 		if (v6mask->s6_addr32[i] == 0) {
2168 			plen -= 32;
2169 			continue;
2170 		}
2171 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
2172 		if (bits == 0)
2173 			break;
2174 		plen -= bits;
2175 	}
2176 	return (plen);
2177 }
2178 
2179 /* ARGSUSED */
2180 static int
2181 flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
2182     flow_entry_t *flent)
2183 {
2184 	flow_entry_t	**p = headp;
2185 	flow_desc_t	*fd0, *fd;
2186 	in6_addr_t	*m0, *m;
2187 	int		plen0, plen;
2188 
2189 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
2190 
2191 	/*
2192 	 * No special ordering needed for dsfield.
2193 	 */
2194 	fd0 = &flent->fe_flow_desc;
2195 	if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
2196 		if (*p != NULL) {
2197 			ASSERT(flent->fe_next == NULL);
2198 			flent->fe_next = *p;
2199 		}
2200 		*p = flent;
2201 		return (0);
2202 	}
2203 
2204 	/*
2205 	 * IP address flows are arranged in descending prefix length order.
2206 	 */
2207 	m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
2208 	    &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
2209 	plen0 = flow_ip_mask2plen(m0);
2210 	ASSERT(plen0 != 0);
2211 
2212 	for (; *p != NULL; p = &(*p)->fe_next) {
2213 		fd = &(*p)->fe_flow_desc;
2214 
2215 		/*
2216 		 * Normally a dsfield flent shouldn't end up on the same
2217 		 * list as an IP address because flow tables are (for now)
2218 		 * disjoint. If we decide to support both IP and dsfield
2219 		 * in the same table in the future, this check will allow
2220 		 * for that.
2221 		 */
2222 		if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2223 			continue;
2224 
2225 		/*
2226 		 * We also allow for the mixing of local and remote address
2227 		 * flents within one list.
2228 		 */
2229 		m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
2230 		    &fd->fd_local_netmask : &fd->fd_remote_netmask;
2231 		plen = flow_ip_mask2plen(m);
2232 
2233 		if (plen <= plen0)
2234 			break;
2235 	}
2236 	if (*p != NULL) {
2237 		ASSERT(flent->fe_next == NULL);
2238 		flent->fe_next = *p;
2239 	}
2240 	*p = flent;
2241 	return (0);
2242 }
2243 
2244 /*
2245  * Transport layer protocol and port matching functions.
2246  */
2247 
2248 /* ARGSUSED */
2249 static boolean_t
2250 flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2251 {
2252 	flow_l3info_t	*l3info = &s->fs_l3info;
2253 	flow_l4info_t	*l4info = &s->fs_l4info;
2254 	flow_desc_t	*fd = &flent->fe_flow_desc;
2255 
2256 	return (fd->fd_protocol == l3info->l3_protocol &&
2257 	    fd->fd_local_port == l4info->l4_hash_port);
2258 }
2259 
2260 /* ARGSUSED */
2261 static boolean_t
2262 flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2263 {
2264 	flow_l3info_t	*l3info = &s->fs_l3info;
2265 	flow_l4info_t	*l4info = &s->fs_l4info;
2266 	flow_desc_t	*fd = &flent->fe_flow_desc;
2267 
2268 	return (fd->fd_protocol == l3info->l3_protocol &&
2269 	    fd->fd_remote_port == l4info->l4_hash_port);
2270 }
2271 
2272 /*
2273  * Transport hash function.
2274  * Since we only support either local or remote port flows,
2275  * we only need to extract one of the ports to be used for
2276  * matching.
2277  */
2278 static uint32_t
2279 flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
2280 {
2281 	flow_l3info_t	*l3info = &s->fs_l3info;
2282 	flow_l4info_t	*l4info = &s->fs_l4info;
2283 	uint8_t		proto = l3info->l3_protocol;
2284 	boolean_t	dst_or_src;
2285 
2286 	if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
2287 		dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
2288 	} else {
2289 		dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
2290 	}
2291 
2292 	l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
2293 	    l4info->l4_src_port;
2294 
2295 	return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
2296 }
2297 
2298 /*
2299  * Unlike other accept() functions above, we do not need to get the header
2300  * size because this is our highest layer so far. If we want to do support
2301  * other higher layer protocols, we would need to save the l4_hdrsize
2302  * in the code below.
2303  */
2304 
2305 /* ARGSUSED */
2306 static int
2307 flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
2308 {
2309 	flow_l3info_t	*l3info = &s->fs_l3info;
2310 	flow_l4info_t	*l4info = &s->fs_l4info;
2311 	uint8_t		proto = l3info->l3_protocol;
2312 	uchar_t		*l4_start;
2313 
2314 	l4_start = l3info->l3_start + l3info->l3_hdrsize;
2315 
2316 	/*
2317 	 * Adjust start pointer if we're at the end of an mblk.
2318 	 */
2319 	CHECK_AND_ADJUST_START_PTR(s, l4_start);
2320 
2321 	l4info->l4_start = l4_start;
2322 	if (!OK_32PTR(l4_start))
2323 		return (EINVAL);
2324 
2325 	if (l3info->l3_fragmented == B_TRUE)
2326 		return (EINVAL);
2327 
2328 	switch (proto) {
2329 	case IPPROTO_TCP: {
2330 		struct tcphdr	*tcph = (struct tcphdr *)l4_start;
2331 
2332 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
2333 			return (ENOBUFS);
2334 
2335 		l4info->l4_src_port = tcph->th_sport;
2336 		l4info->l4_dst_port = tcph->th_dport;
2337 		break;
2338 	}
2339 	case IPPROTO_UDP: {
2340 		struct udphdr	*udph = (struct udphdr *)l4_start;
2341 
2342 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
2343 			return (ENOBUFS);
2344 
2345 		l4info->l4_src_port = udph->uh_sport;
2346 		l4info->l4_dst_port = udph->uh_dport;
2347 		break;
2348 	}
2349 	case IPPROTO_SCTP: {
2350 		sctp_hdr_t	*sctph = (sctp_hdr_t *)l4_start;
2351 
2352 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
2353 			return (ENOBUFS);
2354 
2355 		l4info->l4_src_port = sctph->sh_sport;
2356 		l4info->l4_dst_port = sctph->sh_dport;
2357 		break;
2358 	}
2359 	default:
2360 		return (EINVAL);
2361 	}
2362 
2363 	return (0);
2364 }
2365 
2366 /*
2367  * Validates transport flow entry.
2368  * The protocol field must be present.
2369  */
2370 
2371 /* ARGSUSED */
2372 static int
2373 flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2374 {
2375 	flow_desc_t	*fd = &flent->fe_flow_desc;
2376 	flow_mask_t	mask = fd->fd_mask;
2377 
2378 	if ((mask & FLOW_IP_PROTOCOL) == 0)
2379 		return (EINVAL);
2380 
2381 	switch (fd->fd_protocol) {
2382 	case IPPROTO_TCP:
2383 	case IPPROTO_UDP:
2384 	case IPPROTO_SCTP:
2385 		break;
2386 	default:
2387 		return (EINVAL);
2388 	}
2389 
2390 	switch (mask & ~FLOW_IP_PROTOCOL) {
2391 	case FLOW_ULP_PORT_LOCAL:
2392 		if (fd->fd_local_port == 0)
2393 			return (EINVAL);
2394 
2395 		flent->fe_match = flow_transport_lport_match;
2396 		break;
2397 	case FLOW_ULP_PORT_REMOTE:
2398 		if (fd->fd_remote_port == 0)
2399 			return (EINVAL);
2400 
2401 		flent->fe_match = flow_transport_rport_match;
2402 		break;
2403 	case 0:
2404 		/*
2405 		 * transport-only flows conflicts with our table type.
2406 		 */
2407 		return (EOPNOTSUPP);
2408 	default:
2409 		return (EINVAL);
2410 	}
2411 
2412 	return (0);
2413 }
2414 
2415 static uint32_t
2416 flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2417 {
2418 	flow_desc_t	*fd = &flent->fe_flow_desc;
2419 	uint16_t	port = 0;
2420 
2421 	port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
2422 	    fd->fd_local_port : fd->fd_remote_port;
2423 
2424 	return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
2425 }
2426 
2427 /* ARGSUSED */
2428 static boolean_t
2429 flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2430 {
2431 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2432 
2433 	if (fd1->fd_protocol != fd2->fd_protocol)
2434 		return (B_FALSE);
2435 
2436 	if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
2437 		return (fd1->fd_local_port == fd2->fd_local_port);
2438 
2439 	if ((fd1->fd_mask & FLOW_ULP_PORT_REMOTE) != 0)
2440 		return (fd1->fd_remote_port == fd2->fd_remote_port);
2441 
2442 	return (B_TRUE);
2443 }
2444 
/*
 * Operations vector built from the flow_l2_* functions.
 * NOTE(review): the slot roles noted below are inferred from the function
 * name suffixes; confirm against the flow_ops_t definition.
 */
static flow_ops_t flow_l2_ops = {
	flow_l2_accept_fe,		/* presumably: validate a flow entry */
	flow_l2_hash_fe,		/* presumably: hash a flow entry */
	flow_l2_match_fe,		/* presumably: compare two entries */
	flow_generic_insert_fe,		/* presumably: link entry into a list */
	flow_l2_hash,			/* presumably: hash a packet */
	{flow_l2_accept}		/* packet accept function(s) */
};
2453 
/*
 * Operations vector for the IP address and DS field tables listed in
 * flow_tab_info_list[]. It is the only vector here that uses
 * flow_ip_insert_fe() rather than flow_generic_insert_fe().
 * NOTE(review): slot roles are inferred from the function name suffixes;
 * confirm against the flow_ops_t definition.
 */
static flow_ops_t flow_ip_ops = {
	flow_ip_accept_fe,
	flow_ip_hash_fe,
	flow_ip_match_fe,
	flow_ip_insert_fe,
	flow_ip_hash,
	{flow_l2_accept, flow_ip_accept}	/* L2 first, then IP */
};
2462 
/*
 * Operations vector for the FLOW_IP_PROTOCOL table listed in
 * flow_tab_info_list[].
 * NOTE(review): slot roles are inferred from the function name suffixes;
 * confirm against the flow_ops_t definition.
 */
static flow_ops_t flow_ip_proto_ops = {
	flow_ip_proto_accept_fe,
	flow_ip_proto_hash_fe,
	flow_ip_proto_match_fe,
	flow_generic_insert_fe,
	flow_ip_proto_hash,
	{flow_l2_accept, flow_ip_accept}	/* L2 first, then IP */
};
2471 
/*
 * Operations vector for the protocol + local/remote port tables listed in
 * flow_tab_info_list[].
 * NOTE(review): that the accept list is run in the order written is
 * assumed, not shown here; confirm against the users of flow_ops_t.
 */
static flow_ops_t flow_transport_ops = {
	flow_transport_accept_fe,	/* validates entry, sets fe_match */
	flow_transport_hash_fe,		/* bucket index for a flow entry */
	flow_transport_match_fe,	/* compares two flow entries */
	flow_generic_insert_fe,
	flow_transport_hash,		/* bucket index for a packet */
	{flow_l2_accept, flow_ip_accept, flow_transport_accept}
};
2480 
/*
 * Table of the flow table types known to this file. Each entry names an
 * operations vector and the exact flow mask it serves; the mask is what
 * mac_flow_tab_info_get() compares against (fti_mask).
 * NOTE(review): the third value is presumably the hash table size for that
 * table type; confirm against the flow_tab_info_t definition.
 */
static flow_tab_info_t flow_tab_info_list[] = {
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
	{&flow_ip_ops, FLOW_IP_DSFIELD, 1},
	{&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024},
	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_REMOTE, 1024}
};
2489 
/* Number of entries in flow_tab_info_list[]. */
#define	FLOW_MAX_TAB_INFO \
	(sizeof (flow_tab_info_list) / sizeof (flow_tab_info_list[0]))
2492 
2493 static flow_tab_info_t *
2494 mac_flow_tab_info_get(flow_mask_t mask)
2495 {
2496 	int	i;
2497 
2498 	for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
2499 		if (mask == flow_tab_info_list[i].fti_mask)
2500 			return (&flow_tab_info_list[i]);
2501 	}
2502 	return (NULL);
2503 }
2504