xref: /titanic_52/usr/src/uts/common/io/mac/mac_flow.c (revision 491f61a1e1c1fc54a47bbcf53dbbbe1293b93b27)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/strsun.h>
28 #include <sys/sdt.h>
29 #include <sys/mac.h>
30 #include <sys/mac_impl.h>
31 #include <sys/mac_client_impl.h>
32 #include <sys/dls.h>
33 #include <sys/dls_impl.h>
34 #include <sys/mac_soft_ring.h>
35 #include <sys/ethernet.h>
36 #include <sys/vlan.h>
37 #include <inet/ip.h>
38 #include <inet/ip6.h>
39 #include <netinet/tcp.h>
40 #include <netinet/udp.h>
41 #include <netinet/sctp.h>
42 
43 /* global flow table, will be a per exclusive-zone table later */
44 static mod_hash_t	*flow_hash;
45 static krwlock_t	flow_tab_lock;
46 
47 static kmem_cache_t	*flow_cache;
48 static kmem_cache_t	*flow_tab_cache;
49 static flow_ops_t	flow_l2_ops;
50 
51 typedef struct {
52 	const char	*fs_name;
53 	uint_t		fs_offset;
54 } flow_stats_info_t;
55 
56 #define	FS_OFF(f)	(offsetof(flow_stats_t, f))
57 static flow_stats_info_t flow_stats_list[] = {
58 	{"rbytes",	FS_OFF(fs_rbytes)},
59 	{"ipackets",	FS_OFF(fs_ipackets)},
60 	{"ierrors",	FS_OFF(fs_ierrors)},
61 	{"obytes",	FS_OFF(fs_obytes)},
62 	{"opackets",	FS_OFF(fs_opackets)},
63 	{"oerrors",	FS_OFF(fs_oerrors)}
64 };
65 #define	FS_SIZE		(sizeof (flow_stats_list) / sizeof (flow_stats_info_t))
66 
67 /*
68  * Checks whether a flow mask is legal.
69  */
70 static flow_tab_info_t	*mac_flow_tab_info_get(flow_mask_t);
71 
72 static void
73 flow_stat_init(kstat_named_t *knp)
74 {
75 	int	i;
76 
77 	for (i = 0; i < FS_SIZE; i++, knp++) {
78 		kstat_named_init(knp, flow_stats_list[i].fs_name,
79 		    KSTAT_DATA_UINT64);
80 	}
81 }
82 
83 static int
84 flow_stat_update(kstat_t *ksp, int rw)
85 {
86 	flow_entry_t	*fep = ksp->ks_private;
87 	flow_stats_t 	*fsp = &fep->fe_flowstats;
88 	kstat_named_t	*knp = ksp->ks_data;
89 	uint64_t	*statp;
90 	int		i;
91 
92 	if (rw != KSTAT_READ)
93 		return (EACCES);
94 
95 	for (i = 0; i < FS_SIZE; i++, knp++) {
96 		statp = (uint64_t *)
97 		    ((uchar_t *)fsp + flow_stats_list[i].fs_offset);
98 
99 		knp->value.ui64 = *statp;
100 	}
101 	return (0);
102 }
103 
104 static void
105 flow_stat_create(flow_entry_t *fep)
106 {
107 	kstat_t		*ksp;
108 	kstat_named_t	*knp;
109 	uint_t		nstats = FS_SIZE;
110 
111 	/*
112 	 * Fow now, flow entries are only manipulated and visible from the
113 	 * global zone.
114 	 */
115 	ksp = kstat_create_zone("unix", 0, (char *)fep->fe_flow_name, "flow",
116 	    KSTAT_TYPE_NAMED, nstats, 0, GLOBAL_ZONEID);
117 	if (ksp == NULL)
118 		return;
119 
120 	ksp->ks_update = flow_stat_update;
121 	ksp->ks_private = fep;
122 	fep->fe_ksp = ksp;
123 
124 	knp = (kstat_named_t *)ksp->ks_data;
125 	flow_stat_init(knp);
126 	kstat_install(ksp);
127 }
128 
129 void
130 flow_stat_destroy(flow_entry_t *fep)
131 {
132 	if (fep->fe_ksp != NULL) {
133 		kstat_delete(fep->fe_ksp);
134 		fep->fe_ksp = NULL;
135 	}
136 }
137 
138 /*
139  * Initialize the flow table
140  */
141 void
142 mac_flow_init()
143 {
144 	flow_cache = kmem_cache_create("flow_entry_cache",
145 	    sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
146 	flow_tab_cache = kmem_cache_create("flow_tab_cache",
147 	    sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
148 	flow_hash = mod_hash_create_extended("flow_hash",
149 	    100, mod_hash_null_keydtor, mod_hash_null_valdtor,
150 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
151 	rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
152 }
153 
154 /*
155  * Cleanup and release the flow table
156  */
157 void
158 mac_flow_fini()
159 {
160 	kmem_cache_destroy(flow_cache);
161 	kmem_cache_destroy(flow_tab_cache);
162 	mod_hash_destroy_hash(flow_hash);
163 	rw_destroy(&flow_tab_lock);
164 }
165 
166 /*
167  * mac_create_flow(): create a flow_entry_t.
168  */
169 int
170 mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
171     void *client_cookie, uint_t type, flow_entry_t **flentp)
172 {
173 	flow_entry_t	*flent = *flentp;
174 	int		err = 0;
175 
176 	if (mrp != NULL) {
177 		err = mac_validate_props(mrp);
178 		if (err != 0)
179 			return (err);
180 	}
181 
182 	if (flent == NULL) {
183 		flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
184 		bzero(flent, sizeof (*flent));
185 		mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
186 		cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);
187 
188 		/* Initialize the receiver function to a safe routine */
189 		flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
190 		flent->fe_index = -1;
191 	}
192 	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
193 
194 	/* This is an initial flow, will be configured later */
195 	if (fd == NULL) {
196 		*flentp = flent;
197 		return (0);
198 	}
199 
200 	flent->fe_client_cookie = client_cookie;
201 	flent->fe_type = type;
202 
203 	/* Save flow desc */
204 	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
205 
206 	if (mrp != NULL) {
207 		/*
208 		 * We have already set fe_resource_props for a Link.
209 		 */
210 		if (type & FLOW_USER) {
211 			bcopy(mrp, &flent->fe_resource_props,
212 			    sizeof (mac_resource_props_t));
213 		}
214 		/*
215 		 * The effective resource list should reflect the priority
216 		 * that we set implicitly.
217 		 */
218 		if (!(mrp->mrp_mask & MRP_PRIORITY))
219 			mrp->mrp_mask |= MRP_PRIORITY;
220 		if (type & FLOW_USER)
221 			mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
222 		else
223 			mrp->mrp_priority = MPL_LINK_DEFAULT;
224 		bcopy(mrp, &flent->fe_effective_props,
225 		    sizeof (mac_resource_props_t));
226 	}
227 	flow_stat_create(flent);
228 
229 	*flentp = flent;
230 	return (0);
231 }
232 
233 /*
234  * Validate flow entry and add it to a flow table.
235  */
236 int
237 mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
238 {
239 	flow_entry_t	**headp, **p;
240 	flow_ops_t	*ops = &ft->ft_ops;
241 	flow_mask_t	mask;
242 	uint32_t	index;
243 	int		err;
244 
245 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
246 
247 	/*
248 	 * Check for invalid bits in mask.
249 	 */
250 	mask = flent->fe_flow_desc.fd_mask;
251 	if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
252 		return (EOPNOTSUPP);
253 
254 	/*
255 	 * Validate flent.
256 	 */
257 	if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
258 		DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
259 		    flow_entry_t *, flent, int, err);
260 		return (err);
261 	}
262 
263 	/*
264 	 * Flent is valid. now calculate hash and insert it
265 	 * into hash table.
266 	 */
267 	index = ops->fo_hash_fe(ft, flent);
268 
269 	/*
270 	 * We do not need a lock up until now because we were
271 	 * not accessing the flow table.
272 	 */
273 	rw_enter(&ft->ft_lock, RW_WRITER);
274 	headp = &ft->ft_table[index];
275 
276 	/*
277 	 * Check for duplicate flow.
278 	 */
279 	for (p = headp; *p != NULL; p = &(*p)->fe_next) {
280 		if ((*p)->fe_flow_desc.fd_mask !=
281 		    flent->fe_flow_desc.fd_mask)
282 			continue;
283 
284 		if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
285 			rw_exit(&ft->ft_lock);
286 			DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
287 			    flow_entry_t *, flent, int, err);
288 			return (EALREADY);
289 		}
290 	}
291 
292 	/*
293 	 * Insert flow to hash list.
294 	 */
295 	err = ops->fo_insert_fe(ft, headp, flent);
296 	if (err != 0) {
297 		rw_exit(&ft->ft_lock);
298 		DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
299 		    flow_entry_t *, flent, int, err);
300 		return (err);
301 	}
302 
303 	/*
304 	 * Save the hash index so it can be used by mac_flow_remove().
305 	 */
306 	flent->fe_index = (int)index;
307 
308 	/*
309 	 * Save the flow tab back reference.
310 	 */
311 	flent->fe_flow_tab = ft;
312 	FLOW_MARK(flent, FE_FLOW_TAB);
313 	ft->ft_flow_count++;
314 	rw_exit(&ft->ft_lock);
315 	return (0);
316 }
317 
318 /*
319  * Remove a flow from a mac client's subflow table
320  */
321 void
322 mac_flow_rem_subflow(flow_entry_t *flent)
323 {
324 	flow_tab_t		*ft = flent->fe_flow_tab;
325 	mac_client_impl_t	*mcip = ft->ft_mcip;
326 	mac_handle_t		mh = (mac_handle_t)ft->ft_mip;
327 
328 	ASSERT(MAC_PERIM_HELD(mh));
329 
330 	mac_flow_remove(ft, flent, B_FALSE);
331 	if (flent->fe_mcip == NULL) {
332 		/*
333 		 * The interface is not yet plumbed and mac_client_flow_add
334 		 * was not done.
335 		 */
336 		if (FLOW_TAB_EMPTY(ft)) {
337 			mac_flow_tab_destroy(ft);
338 			mcip->mci_subflow_tab = NULL;
339 		}
340 	} else {
341 		mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
342 		mac_link_flow_clean((mac_client_handle_t)mcip, flent);
343 	}
344 	mac_fastpath_enable(mh);
345 }
346 
347 /*
348  * Add a flow to a mac client's subflow table and instantiate the flow
349  * in the mac by creating the associated SRSs etc.
350  */
351 int
352 mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
353     boolean_t instantiate_flow)
354 {
355 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
356 	mac_handle_t		mh = (mac_handle_t)mcip->mci_mip;
357 	flow_tab_info_t		*ftinfo;
358 	flow_mask_t		mask;
359 	flow_tab_t		*ft;
360 	int			err;
361 	boolean_t		ft_created = B_FALSE;
362 
363 	ASSERT(MAC_PERIM_HELD(mh));
364 
365 	if ((err = mac_fastpath_disable(mh)) != 0)
366 		return (err);
367 
368 	/*
369 	 * If the subflow table exists already just add the new subflow
370 	 * to the existing table, else we create a new subflow table below.
371 	 */
372 	ft = mcip->mci_subflow_tab;
373 	if (ft == NULL) {
374 		mask = flent->fe_flow_desc.fd_mask;
375 		/*
376 		 * Try to create a new table and then add the subflow to the
377 		 * newly created subflow table
378 		 */
379 		if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) {
380 			mac_fastpath_enable(mh);
381 			return (EOPNOTSUPP);
382 		}
383 
384 		mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
385 		    mcip->mci_mip, &ft);
386 		ft_created = B_TRUE;
387 	}
388 
389 	err = mac_flow_add(ft, flent);
390 	if (err != 0) {
391 		if (ft_created)
392 			mac_flow_tab_destroy(ft);
393 		mac_fastpath_enable(mh);
394 		return (err);
395 	}
396 
397 	if (instantiate_flow) {
398 		/* Now activate the flow by creating its SRSs */
399 		ASSERT(MCIP_DATAPATH_SETUP(mcip));
400 		err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
401 		if (err != 0) {
402 			mac_flow_remove(ft, flent, B_FALSE);
403 			if (ft_created)
404 				mac_flow_tab_destroy(ft);
405 			mac_fastpath_enable(mh);
406 			return (err);
407 		}
408 	} else {
409 		FLOW_MARK(flent, FE_UF_NO_DATAPATH);
410 	}
411 	if (ft_created) {
412 		ASSERT(mcip->mci_subflow_tab == NULL);
413 		ft->ft_mcip = mcip;
414 		mcip->mci_subflow_tab = ft;
415 		if (instantiate_flow)
416 			mac_client_update_classifier(mcip, B_TRUE);
417 	}
418 	return (0);
419 }
420 
421 /*
422  * Remove flow entry from flow table.
423  */
424 void
425 mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
426 {
427 	flow_entry_t	**fp;
428 
429 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
430 	if (!(flent->fe_flags & FE_FLOW_TAB))
431 		return;
432 
433 	rw_enter(&ft->ft_lock, RW_WRITER);
434 	/*
435 	 * If this is a permanent removal from the flow table, mark it
436 	 * CONDEMNED to prevent future references. If this is a temporary
437 	 * removal from the table, say to update the flow descriptor then
438 	 * we don't mark it CONDEMNED
439 	 */
440 	if (!temp)
441 		FLOW_MARK(flent, FE_CONDEMNED);
442 	/*
443 	 * Locate the specified flent.
444 	 */
445 	fp = &ft->ft_table[flent->fe_index];
446 	while (*fp != flent)
447 		fp = &(*fp)->fe_next;
448 
449 	/*
450 	 * The flent must exist. Otherwise it's a bug.
451 	 */
452 	ASSERT(fp != NULL);
453 	*fp = flent->fe_next;
454 	flent->fe_next = NULL;
455 
456 	/*
457 	 * Reset fe_index to -1 so any attempt to call mac_flow_remove()
458 	 * on a flent that is supposed to be in the table (FE_FLOW_TAB)
459 	 * will panic.
460 	 */
461 	flent->fe_index = -1;
462 	FLOW_UNMARK(flent, FE_FLOW_TAB);
463 	ft->ft_flow_count--;
464 	rw_exit(&ft->ft_lock);
465 }
466 
467 /*
468  * This is the flow lookup routine used by the mac sw classifier engine.
469  */
470 int
471 mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
472 {
473 	flow_state_t	s;
474 	flow_entry_t	*flent;
475 	flow_ops_t	*ops = &ft->ft_ops;
476 	boolean_t	retried = B_FALSE;
477 	int		i, err;
478 
479 	s.fs_flags = flags;
480 retry:
481 	s.fs_mp = mp;
482 
483 	/*
484 	 * Walk the list of predeclared accept functions.
485 	 * Each of these would accumulate enough state to allow the next
486 	 * accept routine to make progress.
487 	 */
488 	for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
489 		if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
490 			mblk_t	*last;
491 
492 			/*
493 			 * ENOBUFS indicates that the mp could be too short
494 			 * and may need a pullup.
495 			 */
496 			if (err != ENOBUFS || retried)
497 				return (err);
498 
499 			/*
500 			 * The pullup is done on the last processed mblk, not
501 			 * the starting one. pullup is not done if the mblk
502 			 * has references or if b_cont is NULL.
503 			 */
504 			last = s.fs_mp;
505 			if (DB_REF(last) > 1 || last->b_cont == NULL ||
506 			    pullupmsg(last, -1) == 0)
507 				return (EINVAL);
508 
509 			retried = B_TRUE;
510 			DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
511 			    flow_state_t *, &s);
512 			goto retry;
513 		}
514 	}
515 
516 	/*
517 	 * The packet is considered sane. We may now attempt to
518 	 * find the corresponding flent.
519 	 */
520 	rw_enter(&ft->ft_lock, RW_READER);
521 	flent = ft->ft_table[ops->fo_hash(ft, &s)];
522 	for (; flent != NULL; flent = flent->fe_next) {
523 		if (flent->fe_match(ft, flent, &s)) {
524 			FLOW_TRY_REFHOLD(flent, err);
525 			if (err != 0)
526 				continue;
527 			*flentp = flent;
528 			rw_exit(&ft->ft_lock);
529 			return (0);
530 		}
531 	}
532 	rw_exit(&ft->ft_lock);
533 	return (ENOENT);
534 }
535 
536 /*
537  * Walk flow table.
538  * The caller is assumed to have proper perimeter protection.
539  */
540 int
541 mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
542     void *arg)
543 {
544 	int		err, i, cnt = 0;
545 	flow_entry_t	*flent;
546 
547 	if (ft == NULL)
548 		return (0);
549 
550 	for (i = 0; i < ft->ft_size; i++) {
551 		for (flent = ft->ft_table[i]; flent != NULL;
552 		    flent = flent->fe_next) {
553 			cnt++;
554 			err = (*fn)(flent, arg);
555 			if (err != 0)
556 				return (err);
557 		}
558 	}
559 	VERIFY(cnt == ft->ft_flow_count);
560 	return (0);
561 }
562 
563 /*
564  * Same as the above except a mutex is used for protection here.
565  */
566 int
567 mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
568     void *arg)
569 {
570 	int		err;
571 
572 	if (ft == NULL)
573 		return (0);
574 
575 	rw_enter(&ft->ft_lock, RW_WRITER);
576 	err = mac_flow_walk_nolock(ft, fn, arg);
577 	rw_exit(&ft->ft_lock);
578 	return (err);
579 }
580 
581 static boolean_t	mac_flow_clean(flow_entry_t *);
582 
583 /*
584  * Destroy a flow entry. Called when the last reference on a flow is released.
585  */
586 void
587 mac_flow_destroy(flow_entry_t *flent)
588 {
589 	ASSERT(flent->fe_refcnt == 0);
590 
591 	if ((flent->fe_type & FLOW_USER) != 0) {
592 		ASSERT(mac_flow_clean(flent));
593 	} else {
594 		mac_flow_cleanup(flent);
595 	}
596 
597 	mutex_destroy(&flent->fe_lock);
598 	cv_destroy(&flent->fe_cv);
599 	flow_stat_destroy(flent);
600 	kmem_cache_free(flow_cache, flent);
601 }
602 
603 /*
604  * XXX eric
605  * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
606  * mac_link_flow_modify() should really be moved/reworked into the
607  * two functions below. This would consolidate all the mac property
608  * checking in one place. I'm leaving this alone for now since it's
609  * out of scope of the new flows work.
610  */
611 /* ARGSUSED */
612 uint32_t
613 mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
614 {
615 	uint32_t		changed_mask = 0;
616 	mac_resource_props_t	*fmrp = &flent->fe_effective_props;
617 	int			i;
618 
619 	if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
620 	    (fmrp->mrp_maxbw != mrp->mrp_maxbw)) {
621 		changed_mask |= MRP_MAXBW;
622 		fmrp->mrp_maxbw = mrp->mrp_maxbw;
623 		if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
624 			fmrp->mrp_mask &= ~MRP_MAXBW;
625 		} else {
626 			fmrp->mrp_mask |= MRP_MAXBW;
627 		}
628 	}
629 
630 	if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
631 		if (fmrp->mrp_priority != mrp->mrp_priority)
632 			changed_mask |= MRP_PRIORITY;
633 		if (mrp->mrp_priority == MPL_RESET) {
634 			fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
635 			fmrp->mrp_mask &= ~MRP_PRIORITY;
636 		} else {
637 			fmrp->mrp_priority = mrp->mrp_priority;
638 			fmrp->mrp_mask |= MRP_PRIORITY;
639 		}
640 	}
641 
642 	/* modify fanout */
643 	if ((mrp->mrp_mask & MRP_CPUS) != 0) {
644 		if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
645 		    (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
646 			for (i = 0; i < mrp->mrp_ncpus; i++) {
647 				if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
648 					break;
649 			}
650 			if (i == mrp->mrp_ncpus) {
651 				/*
652 				 * The new set of cpus passed is exactly
653 				 * the same as the existing set.
654 				 */
655 				return (changed_mask);
656 			}
657 		}
658 		changed_mask |= MRP_CPUS;
659 		MAC_COPY_CPUS(mrp, fmrp);
660 	}
661 	return (changed_mask);
662 }
663 
664 void
665 mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
666 {
667 	uint32_t changed_mask;
668 	mac_client_impl_t *mcip = flent->fe_mcip;
669 	mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
670 
671 	ASSERT(flent != NULL);
672 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
673 
674 	rw_enter(&ft->ft_lock, RW_WRITER);
675 
676 	/* Update the cached values inside the subflow entry */
677 	changed_mask = mac_flow_modify_props(flent, mrp);
678 	rw_exit(&ft->ft_lock);
679 	/*
680 	 * Push the changed parameters to the scheduling code in the
681 	 * SRS's, to take effect right away.
682 	 */
683 	if (changed_mask & MRP_MAXBW) {
684 		mac_srs_update_bwlimit(flent, mrp);
685 		/*
686 		 * If bandwidth is changed, we may have to change
687 		 * the number of soft ring to be used for fanout.
688 		 * Call mac_flow_update_fanout() if MAC_BIND_CPU
689 		 * is not set and there is no user supplied cpu
690 		 * info. This applies only to link at this time.
691 		 */
692 		if (!(flent->fe_type & FLOW_USER) &&
693 		    !(changed_mask & MRP_CPUS) &&
694 		    !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
695 			mac_fanout_setup(mcip, flent, mcip_mrp,
696 			    mac_rx_deliver, mcip, NULL);
697 		}
698 	}
699 	if (mrp->mrp_mask & MRP_PRIORITY)
700 		mac_flow_update_priority(mcip, flent);
701 
702 	if (changed_mask & MRP_CPUS)
703 		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL);
704 }
705 
706 /*
707  * This function waits for a certain condition to be met and is generally
708  * used before a destructive or quiescing operation.
709  */
710 void
711 mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
712 {
713 	mutex_enter(&flent->fe_lock);
714 	flent->fe_flags |= FE_WAITER;
715 
716 	switch (event) {
717 	case FLOW_DRIVER_UPCALL:
718 		/*
719 		 * We want to make sure the driver upcalls have finished before
720 		 * we signal the Rx SRS worker to quit.
721 		 */
722 		while (flent->fe_refcnt != 1)
723 			cv_wait(&flent->fe_cv, &flent->fe_lock);
724 		break;
725 
726 	case FLOW_USER_REF:
727 		/*
728 		 * Wait for the fe_user_refcnt to drop to 0. The flow has
729 		 * been removed from the global flow hash.
730 		 */
731 		ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH));
732 		while (flent->fe_user_refcnt != 0)
733 			cv_wait(&flent->fe_cv, &flent->fe_lock);
734 		break;
735 
736 	default:
737 		ASSERT(0);
738 	}
739 
740 	flent->fe_flags &= ~FE_WAITER;
741 	mutex_exit(&flent->fe_lock);
742 }
743 
744 static boolean_t
745 mac_flow_clean(flow_entry_t *flent)
746 {
747 	ASSERT(flent->fe_next == NULL);
748 	ASSERT(flent->fe_tx_srs == NULL);
749 	ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL);
750 	ASSERT(flent->fe_mbg == NULL);
751 
752 	return (B_TRUE);
753 }
754 
755 void
756 mac_flow_cleanup(flow_entry_t *flent)
757 {
758 	if ((flent->fe_type & FLOW_USER) == 0) {
759 		ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) ||
760 		    (flent->fe_mbg != NULL && flent->fe_mcip == NULL));
761 		ASSERT(flent->fe_refcnt == 0);
762 	} else {
763 		ASSERT(flent->fe_refcnt == 1);
764 	}
765 
766 	if (flent->fe_mbg != NULL) {
767 		ASSERT(flent->fe_tx_srs == NULL);
768 		/* This is a multicast or broadcast flow entry */
769 		mac_bcast_grp_free(flent->fe_mbg);
770 		flent->fe_mbg = NULL;
771 	}
772 
773 	if (flent->fe_tx_srs != NULL) {
774 		ASSERT(flent->fe_mbg == NULL);
775 		mac_srs_free(flent->fe_tx_srs);
776 		flent->fe_tx_srs = NULL;
777 	}
778 
779 	/*
780 	 * In the normal case fe_rx_srs_cnt is 1. However in the error case
781 	 * when mac_unicast_add fails we may not have set up any SRS
782 	 * in which case fe_rx_srs_cnt will be zero.
783 	 */
784 	if (flent->fe_rx_srs_cnt != 0) {
785 		ASSERT(flent->fe_rx_srs_cnt == 1);
786 		mac_srs_free(flent->fe_rx_srs[0]);
787 		flent->fe_rx_srs[0] = NULL;
788 		flent->fe_rx_srs_cnt = 0;
789 	}
790 	ASSERT(flent->fe_rx_srs[0] == NULL);
791 }
792 
793 void
794 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
795 {
796 	/*
797 	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
798 	 * Updates to the fe_flow_desc happen under the fe_lock
799 	 * after removing the flent from the flow table
800 	 */
801 	mutex_enter(&flent->fe_lock);
802 	bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
803 	mutex_exit(&flent->fe_lock);
804 }
805 
806 /*
807  * Update a field of a flow entry. The mac perimeter ensures that
808  * this is the only thread doing a modify operation on this mac end point.
809  * So the flow table can't change or disappear. The ft_lock protects access
810  * to the flow entry, and holding the lock ensures that there isn't any thread
811  * accessing the flow entry or attempting a flow table lookup. However
812  * data threads that are using the flow entry based on the old descriptor
813  * will continue to use the flow entry. If strong coherence is required
814  * then the flow will have to be quiesced before the descriptor can be
815  * changed.
816  */
817 void
818 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
819 {
820 	flow_tab_t	*ft = flent->fe_flow_tab;
821 	flow_desc_t	old_desc;
822 	int		err;
823 
824 	if (ft == NULL) {
825 		/*
826 		 * The flow hasn't yet been inserted into the table,
827 		 * so only the caller knows about this flow, however for
828 		 * uniformity we grab the fe_lock here.
829 		 */
830 		mutex_enter(&flent->fe_lock);
831 		bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
832 		mutex_exit(&flent->fe_lock);
833 	}
834 
835 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
836 
837 	/*
838 	 * Need to remove the flow entry from the table and reinsert it,
839 	 * into a potentially diference hash line. The hash depends on
840 	 * the new descriptor fields. However access to fe_desc itself
841 	 * is always under the fe_lock. This helps log and stat functions
842 	 * see a self-consistent fe_flow_desc.
843 	 */
844 	mac_flow_remove(ft, flent, B_TRUE);
845 	old_desc = flent->fe_flow_desc;
846 
847 	mutex_enter(&flent->fe_lock);
848 	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
849 	mutex_exit(&flent->fe_lock);
850 
851 	if (mac_flow_add(ft, flent) != 0) {
852 		/*
853 		 * The add failed say due to an invalid flow descriptor.
854 		 * Undo the update
855 		 */
856 		flent->fe_flow_desc = old_desc;
857 		err = mac_flow_add(ft, flent);
858 		ASSERT(err == 0);
859 	}
860 }
861 
862 void
863 mac_flow_set_name(flow_entry_t *flent, const char *name)
864 {
865 	flow_tab_t	*ft = flent->fe_flow_tab;
866 
867 	if (ft == NULL) {
868 		/*
869 		 *  The flow hasn't yet been inserted into the table,
870 		 * so only the caller knows about this flow
871 		 */
872 		(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
873 	} else {
874 		ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
875 	}
876 
877 	mutex_enter(&flent->fe_lock);
878 	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
879 	mutex_exit(&flent->fe_lock);
880 }
881 
882 /*
883  * Return the client-private cookie that was associated with
884  * the flow when it was created.
885  */
886 void *
887 mac_flow_get_client_cookie(flow_entry_t *flent)
888 {
889 	return (flent->fe_client_cookie);
890 }
891 
892 /*
893  * Forward declarations.
894  */
895 static uint32_t	flow_l2_hash(flow_tab_t *, flow_state_t *);
896 static uint32_t	flow_l2_hash_fe(flow_tab_t *, flow_entry_t *);
897 static int	flow_l2_accept(flow_tab_t *, flow_state_t *);
898 static uint32_t	flow_ether_hash(flow_tab_t *, flow_state_t *);
899 static uint32_t	flow_ether_hash_fe(flow_tab_t *, flow_entry_t *);
900 static int	flow_ether_accept(flow_tab_t *, flow_state_t *);
901 
902 /*
903  * Create flow table.
904  */
905 void
906 mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
907     mac_impl_t *mip, flow_tab_t **ftp)
908 {
909 	flow_tab_t	*ft;
910 	flow_ops_t	*new_ops;
911 
912 	ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
913 	bzero(ft, sizeof (*ft));
914 
915 	ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);
916 
917 	/*
918 	 * We make a copy of the ops vector instead of just pointing to it
919 	 * because we might want to customize the ops vector on a per table
920 	 * basis (e.g. for optimization).
921 	 */
922 	new_ops = &ft->ft_ops;
923 	bcopy(ops, new_ops, sizeof (*ops));
924 	ft->ft_mask = mask;
925 	ft->ft_size = size;
926 	ft->ft_mip = mip;
927 
928 	/*
929 	 * Optimizations for DL_ETHER media.
930 	 */
931 	if (mip->mi_info.mi_nativemedia == DL_ETHER) {
932 		if (new_ops->fo_hash == flow_l2_hash)
933 			new_ops->fo_hash = flow_ether_hash;
934 		if (new_ops->fo_hash_fe == flow_l2_hash_fe)
935 			new_ops->fo_hash_fe = flow_ether_hash_fe;
936 		if (new_ops->fo_accept[0] == flow_l2_accept)
937 			new_ops->fo_accept[0] = flow_ether_accept;
938 	}
939 	*ftp = ft;
940 }
941 
942 void
943 mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
944 {
945 	mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
946 	    1024, mip, ftp);
947 }
948 
949 /*
950  * Destroy flow table.
951  */
952 void
953 mac_flow_tab_destroy(flow_tab_t *ft)
954 {
955 	if (ft == NULL)
956 		return;
957 
958 	ASSERT(ft->ft_flow_count == 0);
959 	kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
960 	bzero(ft, sizeof (*ft));
961 	kmem_cache_free(flow_tab_cache, ft);
962 }
963 
964 /*
965  * Add a new flow entry to the global flow hash table
966  */
967 int
968 mac_flow_hash_add(flow_entry_t *flent)
969 {
970 	int	err;
971 
972 	rw_enter(&flow_tab_lock, RW_WRITER);
973 	err = mod_hash_insert(flow_hash,
974 	    (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
975 	if (err != 0) {
976 		rw_exit(&flow_tab_lock);
977 		return (EEXIST);
978 	}
979 	/* Mark as inserted into the global flow hash table */
980 	FLOW_MARK(flent, FE_G_FLOW_HASH);
981 	rw_exit(&flow_tab_lock);
982 	return (err);
983 }
984 
985 /*
986  * Remove a flow entry from the global flow hash table
987  */
988 void
989 mac_flow_hash_remove(flow_entry_t *flent)
990 {
991 	mod_hash_val_t	val;
992 
993 	rw_enter(&flow_tab_lock, RW_WRITER);
994 	VERIFY(mod_hash_remove(flow_hash,
995 	    (mod_hash_key_t)flent->fe_flow_name, &val) == 0);
996 
997 	/* Clear the mark that says inserted into the global flow hash table */
998 	FLOW_UNMARK(flent, FE_G_FLOW_HASH);
999 	rw_exit(&flow_tab_lock);
1000 }
1001 
1002 /*
1003  * Retrieve a flow entry from the global flow hash table.
1004  */
1005 int
1006 mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
1007 {
1008 	int		err;
1009 	flow_entry_t	*flent;
1010 
1011 	rw_enter(&flow_tab_lock, RW_READER);
1012 	err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
1013 	    (mod_hash_val_t *)&flent);
1014 	if (err != 0) {
1015 		rw_exit(&flow_tab_lock);
1016 		return (ENOENT);
1017 	}
1018 	ASSERT(flent != NULL);
1019 	FLOW_USER_REFHOLD(flent);
1020 	rw_exit(&flow_tab_lock);
1021 
1022 	*flentp = flent;
1023 	return (0);
1024 }
1025 
1026 /*
1027  * Initialize or release mac client flows by walking the subflow table.
1028  * These are typically invoked during plumb/unplumb of links.
1029  */
1030 
1031 static int
1032 mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
1033 {
1034 	mac_client_impl_t	*mcip = arg;
1035 
1036 	if (mac_link_flow_init(arg, flent) != 0) {
1037 		cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
1038 		    flent->fe_flow_name, mcip->mci_name);
1039 	} else {
1040 		FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
1041 	}
1042 	return (0);
1043 }
1044 
1045 void
1046 mac_link_init_flows(mac_client_handle_t mch)
1047 {
1048 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1049 
1050 	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1051 	    mac_link_init_flows_cb, mcip);
1052 	/*
1053 	 * If mac client had subflow(s) configured before plumb, change
1054 	 * function to mac_rx_srs_subflow_process and in case of hardware
1055 	 * classification, disable polling.
1056 	 */
1057 	mac_client_update_classifier(mcip, B_TRUE);
1058 
1059 }
1060 
1061 boolean_t
1062 mac_link_has_flows(mac_client_handle_t mch)
1063 {
1064 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1065 
1066 	if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
1067 		return (B_TRUE);
1068 
1069 	return (B_FALSE);
1070 }
1071 
1072 static int
1073 mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
1074 {
1075 	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
1076 	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1077 	mac_link_flow_clean(arg, flent);
1078 	return (0);
1079 }
1080 
1081 void
1082 mac_link_release_flows(mac_client_handle_t mch)
1083 {
1084 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1085 
1086 	/*
1087 	 * Change the mci_flent callback back to mac_rx_srs_process()
1088 	 * because flows are about to be deactivated.
1089 	 */
1090 	mac_client_update_classifier(mcip, B_FALSE);
1091 	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1092 	    mac_link_release_flows_cb, mcip);
1093 }
1094 
1095 void
1096 mac_rename_flow(flow_entry_t *fep, const char *new_name)
1097 {
1098 	mac_flow_set_name(fep, new_name);
1099 	if (fep->fe_ksp != NULL) {
1100 		flow_stat_destroy(fep);
1101 		flow_stat_create(fep);
1102 	}
1103 }
1104 
1105 /*
1106  * mac_link_flow_init()
1107  * Internal flow interface used for allocating SRSs and related
1108  * data structures. Not meant to be used by mac clients.
1109  */
1110 int
1111 mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
1112 {
1113 	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
1114 	mac_impl_t		*mip = mcip->mci_mip;
1115 	int			err;
1116 
1117 	ASSERT(mch != NULL);
1118 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1119 
1120 	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
1121 		return (err);
1122 
1123 	sub_flow->fe_mcip = mcip;
1124 
1125 	return (0);
1126 }
1127 
1128 /*
1129  * mac_link_flow_add()
1130  * Used by flowadm(1m) or kernel mac clients for creating flows.
1131  */
1132 int
1133 mac_link_flow_add(datalink_id_t linkid, char *flow_name,
1134     flow_desc_t *flow_desc, mac_resource_props_t *mrp)
1135 {
1136 	flow_entry_t		*flent = NULL;
1137 	int			err;
1138 	dls_dl_handle_t		dlh;
1139 	dls_link_t		*dlp;
1140 	boolean_t		link_held = B_FALSE;
1141 	boolean_t		hash_added = B_FALSE;
1142 	mac_perim_handle_t	mph;
1143 
1144 	err = mac_flow_lookup_byname(flow_name, &flent);
1145 	if (err == 0) {
1146 		FLOW_USER_REFRELE(flent);
1147 		return (EEXIST);
1148 	}
1149 
1150 	/*
1151 	 * First create a flow entry given the description provided
1152 	 * by the caller.
1153 	 */
1154 	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
1155 	    FLOW_USER | FLOW_OTHER, &flent);
1156 
1157 	if (err != 0)
1158 		return (err);
1159 
1160 	/*
1161 	 * We've got a local variable referencing this flow now, so we need
1162 	 * to hold it. We'll release this flow before returning.
1163 	 * All failures until we return will undo any action that may internally
1164 	 * held the flow, so the last REFRELE will assure a clean freeing
1165 	 * of resources.
1166 	 */
1167 	FLOW_REFHOLD(flent);
1168 
1169 	flent->fe_link_id = linkid;
1170 	FLOW_MARK(flent, FE_INCIPIENT);
1171 
1172 	err = mac_perim_enter_by_linkid(linkid, &mph);
1173 	if (err != 0) {
1174 		FLOW_FINAL_REFRELE(flent);
1175 		return (err);
1176 	}
1177 
1178 	/*
1179 	 * dls will eventually be merged with mac so it's ok
1180 	 * to call dls' internal functions.
1181 	 */
1182 	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1183 	if (err != 0)
1184 		goto bail;
1185 
1186 	link_held = B_TRUE;
1187 
1188 	/*
1189 	 * Add the flow to the global flow table, this table will be per
1190 	 * exclusive zone so each zone can have its own flow namespace.
1191 	 * RFE 6625651 will fix this.
1192 	 *
1193 	 */
1194 	if ((err = mac_flow_hash_add(flent)) != 0)
1195 		goto bail;
1196 
1197 	hash_added = B_TRUE;
1198 
1199 	/*
1200 	 * do not allow flows to be configured on an anchor VNIC
1201 	 */
1202 	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
1203 		err = ENOTSUP;
1204 		goto bail;
1205 	}
1206 
1207 	/*
1208 	 * Add the subflow to the subflow table. Also instantiate the flow
1209 	 * in the mac if there is an active user (we check if the MAC client's
1210 	 * datapath has been setup).
1211 	 */
1212 	err = mac_flow_add_subflow(dlp->dl_mch, flent,
1213 	    MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
1214 	if (err != 0)
1215 		goto bail;
1216 
1217 	FLOW_UNMARK(flent, FE_INCIPIENT);
1218 	dls_devnet_rele_link(dlh, dlp);
1219 	mac_perim_exit(mph);
1220 	return (0);
1221 
1222 bail:
1223 	if (hash_added)
1224 		mac_flow_hash_remove(flent);
1225 
1226 	if (link_held)
1227 		dls_devnet_rele_link(dlh, dlp);
1228 
1229 	/*
1230 	 * Wait for any transient global flow hash refs to clear
1231 	 * and then release the creation reference on the flow
1232 	 */
1233 	mac_flow_wait(flent, FLOW_USER_REF);
1234 	FLOW_FINAL_REFRELE(flent);
1235 	mac_perim_exit(mph);
1236 	return (err);
1237 }
1238 
1239 /*
1240  * mac_link_flow_clean()
1241  * Internal flow interface used for freeing SRSs and related
1242  * data structures. Not meant to be used by mac clients.
1243  */
1244 void
1245 mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
1246 {
1247 	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
1248 	mac_impl_t		*mip = mcip->mci_mip;
1249 	boolean_t		last_subflow;
1250 
1251 	ASSERT(mch != NULL);
1252 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1253 
1254 	/*
1255 	 * This sub flow entry may fail to be fully initialized by
1256 	 * mac_link_flow_init(). If so, simply return.
1257 	 */
1258 	if (sub_flow->fe_mcip == NULL)
1259 		return;
1260 
1261 	last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
1262 	/*
1263 	 * Tear down the data path
1264 	 */
1265 	mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
1266 	sub_flow->fe_mcip = NULL;
1267 
1268 	/*
1269 	 * Delete the SRSs associated with this subflow. If this is being
1270 	 * driven by flowadm(1M) then the subflow will be deleted by
1271 	 * dls_rem_flow. However if this is a result of the interface being
1272 	 * unplumbed then the subflow itself won't be deleted.
1273 	 */
1274 	mac_flow_cleanup(sub_flow);
1275 
1276 	/*
1277 	 * If all the subflows are gone, renable some of the stuff
1278 	 * we disabled when adding a subflow, polling etc.
1279 	 */
1280 	if (last_subflow) {
1281 		/*
1282 		 * The subflow table itself is not protected by any locks or
1283 		 * refcnts. Hence quiesce the client upfront before clearing
1284 		 * mci_subflow_tab.
1285 		 */
1286 		mac_client_quiesce(mcip);
1287 		mac_client_update_classifier(mcip, B_FALSE);
1288 		mac_flow_tab_destroy(mcip->mci_subflow_tab);
1289 		mcip->mci_subflow_tab = NULL;
1290 		mac_client_restart(mcip);
1291 	}
1292 }
1293 
1294 /*
1295  * mac_link_flow_remove()
1296  * Used by flowadm(1m) or kernel mac clients for removing flows.
1297  */
1298 int
1299 mac_link_flow_remove(char *flow_name)
1300 {
1301 	flow_entry_t		*flent;
1302 	mac_perim_handle_t	mph;
1303 	int			err;
1304 	datalink_id_t		linkid;
1305 
1306 	err = mac_flow_lookup_byname(flow_name, &flent);
1307 	if (err != 0)
1308 		return (err);
1309 
1310 	linkid = flent->fe_link_id;
1311 	FLOW_USER_REFRELE(flent);
1312 
1313 	/*
1314 	 * The perim must be acquired before acquiring any other references
1315 	 * to maintain the lock and perimeter hierarchy. Please note the
1316 	 * FLOW_REFRELE above.
1317 	 */
1318 	err = mac_perim_enter_by_linkid(linkid, &mph);
1319 	if (err != 0)
1320 		return (err);
1321 
1322 	/*
1323 	 * Note the second lookup of the flow, because a concurrent thread
1324 	 * may have removed it already while we were waiting to enter the
1325 	 * link's perimeter.
1326 	 */
1327 	err = mac_flow_lookup_byname(flow_name, &flent);
1328 	if (err != 0) {
1329 		mac_perim_exit(mph);
1330 		return (err);
1331 	}
1332 	FLOW_USER_REFRELE(flent);
1333 
1334 	/*
1335 	 * Remove the flow from the subflow table and deactivate the flow
1336 	 * by quiescing and removings its SRSs
1337 	 */
1338 	mac_flow_rem_subflow(flent);
1339 
1340 	/*
1341 	 * Finally, remove the flow from the global table.
1342 	 */
1343 	mac_flow_hash_remove(flent);
1344 
1345 	/*
1346 	 * Wait for any transient global flow hash refs to clear
1347 	 * and then release the creation reference on the flow
1348 	 */
1349 	mac_flow_wait(flent, FLOW_USER_REF);
1350 	FLOW_FINAL_REFRELE(flent);
1351 
1352 	mac_perim_exit(mph);
1353 
1354 	return (0);
1355 }
1356 
1357 /*
1358  * mac_link_flow_modify()
1359  * Modifies the properties of a flow identified by its name.
1360  */
1361 int
1362 mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
1363 {
1364 	flow_entry_t		*flent;
1365 	mac_client_impl_t 	*mcip;
1366 	int			err = 0;
1367 	mac_perim_handle_t	mph;
1368 	datalink_id_t		linkid;
1369 	flow_tab_t		*flow_tab;
1370 
1371 	err = mac_validate_props(mrp);
1372 	if (err != 0)
1373 		return (err);
1374 
1375 	err = mac_flow_lookup_byname(flow_name, &flent);
1376 	if (err != 0)
1377 		return (err);
1378 
1379 	linkid = flent->fe_link_id;
1380 	FLOW_USER_REFRELE(flent);
1381 
1382 	/*
1383 	 * The perim must be acquired before acquiring any other references
1384 	 * to maintain the lock and perimeter hierarchy. Please note the
1385 	 * FLOW_REFRELE above.
1386 	 */
1387 	err = mac_perim_enter_by_linkid(linkid, &mph);
1388 	if (err != 0)
1389 		return (err);
1390 
1391 	/*
1392 	 * Note the second lookup of the flow, because a concurrent thread
1393 	 * may have removed it already while we were waiting to enter the
1394 	 * link's perimeter.
1395 	 */
1396 	err = mac_flow_lookup_byname(flow_name, &flent);
1397 	if (err != 0) {
1398 		mac_perim_exit(mph);
1399 		return (err);
1400 	}
1401 	FLOW_USER_REFRELE(flent);
1402 
1403 	/*
1404 	 * If this flow is attached to a MAC client, then pass the request
1405 	 * along to the client.
1406 	 * Otherwise, just update the cached values.
1407 	 */
1408 	mcip = flent->fe_mcip;
1409 	mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
1410 	if (mcip != NULL) {
1411 		if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
1412 			err = ENOENT;
1413 		} else {
1414 			mac_flow_modify(flow_tab, flent, mrp);
1415 		}
1416 	} else {
1417 		(void) mac_flow_modify_props(flent, mrp);
1418 	}
1419 
1420 done:
1421 	mac_perim_exit(mph);
1422 	return (err);
1423 }
1424 
1425 
1426 /*
1427  * State structure and misc functions used by mac_link_flow_walk().
1428  */
1429 typedef struct {
1430 	int	(*ws_func)(mac_flowinfo_t *, void *);
1431 	void	*ws_arg;
1432 } flow_walk_state_t;
1433 
1434 static void
1435 mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent)
1436 {
1437 	(void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name,
1438 	    MAXFLOWNAMELEN);
1439 	finfop->fi_link_id = flent->fe_link_id;
1440 	finfop->fi_flow_desc = flent->fe_flow_desc;
1441 	finfop->fi_resource_props = flent->fe_resource_props;
1442 }
1443 
1444 static int
1445 mac_link_flow_walk_cb(flow_entry_t *flent, void *arg)
1446 {
1447 	flow_walk_state_t	*statep = arg;
1448 	mac_flowinfo_t		finfo;
1449 
1450 	mac_link_flowinfo_copy(&finfo, flent);
1451 	return (statep->ws_func(&finfo, statep->ws_arg));
1452 }
1453 
1454 /*
1455  * mac_link_flow_walk()
1456  * Invokes callback 'func' for all flows belonging to the specified link.
1457  */
1458 int
1459 mac_link_flow_walk(datalink_id_t linkid,
1460     int (*func)(mac_flowinfo_t *, void *), void *arg)
1461 {
1462 	mac_client_impl_t	*mcip;
1463 	mac_perim_handle_t	mph;
1464 	flow_walk_state_t	state;
1465 	dls_dl_handle_t		dlh;
1466 	dls_link_t		*dlp;
1467 	int			err;
1468 
1469 	err = mac_perim_enter_by_linkid(linkid, &mph);
1470 	if (err != 0)
1471 		return (err);
1472 
1473 	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1474 	if (err != 0) {
1475 		mac_perim_exit(mph);
1476 		return (err);
1477 	}
1478 
1479 	mcip = (mac_client_impl_t *)dlp->dl_mch;
1480 	state.ws_func = func;
1481 	state.ws_arg = arg;
1482 
1483 	err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
1484 	    mac_link_flow_walk_cb, &state);
1485 
1486 	dls_devnet_rele_link(dlh, dlp);
1487 	mac_perim_exit(mph);
1488 	return (err);
1489 }
1490 
1491 /*
1492  * mac_link_flow_info()
1493  * Retrieves information about a specific flow.
1494  */
1495 int
1496 mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
1497 {
1498 	flow_entry_t	*flent;
1499 	int		err;
1500 
1501 	err = mac_flow_lookup_byname(flow_name, &flent);
1502 	if (err != 0)
1503 		return (err);
1504 
1505 	mac_link_flowinfo_copy(finfo, flent);
1506 	FLOW_USER_REFRELE(flent);
1507 	return (0);
1508 }
1509 
/*
 * Hash function macro that takes an Ethernet address and VLAN id as input.
 * Only the last three octets of the address (indices 3, 4 and 5)
 * contribute: their sum is XORed with the VLAN id 'v' and reduced
 * modulo the table size 's'.
 */
#define	HASH_ETHER_VID(a, v, s)					\
	((((uint32_t)(a)[3] + (uint32_t)(a)[4] +		\
	    (uint32_t)(a)[5]) ^ (v)) % (s))
1515 
/*
 * Generic layer-2 address hashing function that takes an address and address
 * length as input.  This is the DJB hash function: start from 5381 and,
 * for every octet, multiply the running value by 33 and add the octet.
 * The result is reduced modulo the hash table size 'htsize'.
 */
static uint32_t
flow_l2_addrhash(uint8_t *addr, size_t addrlen, size_t htsize)
{
	uint32_t	h = 5381;
	uint8_t		*cp = addr;
	uint8_t		*endp = addr + addrlen;

	while (cp < endp)
		h = h * 33 + *cp++;

	return (h % htsize);
}
1530 
/*
 * True when 'end' lies beyond the write pointer of the mblk currently
 * referenced by the flow state 's'.
 */
#define	PKT_TOO_SMALL(s, end)	((end) > (s)->fs_mp->b_wptr)
1532 
/*
 * If 'start' sits exactly at the write pointer of the current mblk of
 * flow state 's', step to the continuation mblk and restart from its
 * read pointer. NOTE: this macro makes the *enclosing function* return
 * EINVAL when there is no continuation mblk.
 */
#define	CHECK_AND_ADJUST_START_PTR(s, start) {		\
	if ((start) == (s)->fs_mp->b_wptr) {		\
		mblk_t	*cont_mp = (s)->fs_mp->b_cont;	\
							\
		if (cont_mp == NULL)			\
			return (EINVAL);		\
		(s)->fs_mp = cont_mp;			\
		(start) = cont_mp->b_rptr;		\
	}						\
}
1543 
1544 /* ARGSUSED */
1545 static boolean_t
1546 flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1547 {
1548 	flow_l2info_t		*l2 = &s->fs_l2info;
1549 	flow_desc_t		*fd = &flent->fe_flow_desc;
1550 
1551 	return (l2->l2_vid == fd->fd_vid &&
1552 	    bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
1553 }
1554 
1555 /*
1556  * Layer 2 hash function.
1557  * Must be paired with flow_l2_accept() within a set of flow_ops
1558  * because it assumes the dest address is already extracted.
1559  */
1560 static uint32_t
1561 flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
1562 {
1563 	return (flow_l2_addrhash(s->fs_l2info.l2_daddr,
1564 	    ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
1565 }
1566 
1567 /*
1568  * This is the generic layer 2 accept function.
1569  * It makes use of mac_header_info() to extract the header length,
1570  * sap, vlan ID and destination address.
1571  */
1572 static int
1573 flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
1574 {
1575 	boolean_t		is_ether;
1576 	flow_l2info_t		*l2 = &s->fs_l2info;
1577 	mac_header_info_t	mhi;
1578 	int			err;
1579 
1580 	is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
1581 	if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
1582 	    s->fs_mp, &mhi)) != 0) {
1583 		if (err == EINVAL)
1584 			err = ENOBUFS;
1585 
1586 		return (err);
1587 	}
1588 
1589 	l2->l2_start = s->fs_mp->b_rptr;
1590 	l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;
1591 
1592 	if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
1593 	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1594 		struct ether_vlan_header	*evhp =
1595 		    (struct ether_vlan_header *)l2->l2_start;
1596 
1597 		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1598 			return (ENOBUFS);
1599 
1600 		l2->l2_sap = ntohs(evhp->ether_type);
1601 		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1602 		l2->l2_hdrsize = sizeof (*evhp);
1603 	} else {
1604 		l2->l2_sap = mhi.mhi_bindsap;
1605 		l2->l2_vid = 0;
1606 		l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
1607 	}
1608 	return (0);
1609 }
1610 
1611 /*
1612  * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
1613  * accept(). The notable difference is that dest address is now extracted
1614  * by hash() rather than by accept(). This saves a few memory references
1615  * for flow tables that do not care about mac addresses.
1616  */
1617 static uint32_t
1618 flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
1619 {
1620 	flow_l2info_t			*l2 = &s->fs_l2info;
1621 	struct ether_vlan_header	*evhp;
1622 
1623 	evhp = (struct ether_vlan_header *)l2->l2_start;
1624 	l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
1625 	return (HASH_ETHER_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
1626 }
1627 
1628 static uint32_t
1629 flow_ether_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1630 {
1631 	flow_desc_t	*fd = &flent->fe_flow_desc;
1632 
1633 	ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
1634 	return (HASH_ETHER_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
1635 }
1636 
1637 /* ARGSUSED */
1638 static int
1639 flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
1640 {
1641 	flow_l2info_t			*l2 = &s->fs_l2info;
1642 	struct ether_vlan_header	*evhp;
1643 	uint16_t			sap;
1644 
1645 	evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
1646 	l2->l2_start = (uchar_t *)evhp;
1647 
1648 	if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
1649 		return (ENOBUFS);
1650 
1651 	if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
1652 	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1653 		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1654 			return (ENOBUFS);
1655 
1656 		l2->l2_sap = ntohs(evhp->ether_type);
1657 		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1658 		l2->l2_hdrsize = sizeof (struct ether_vlan_header);
1659 	} else {
1660 		l2->l2_sap = sap;
1661 		l2->l2_vid = 0;
1662 		l2->l2_hdrsize = sizeof (struct ether_header);
1663 	}
1664 	return (0);
1665 }
1666 
1667 /*
1668  * Validates a layer 2 flow entry.
1669  */
1670 static int
1671 flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1672 {
1673 	flow_desc_t	*fd = &flent->fe_flow_desc;
1674 
1675 	/*
1676 	 * Dest address is mandatory, and 0 length addresses are not yet
1677 	 * supported.
1678 	 */
1679 	if ((fd->fd_mask & FLOW_LINK_DST) == 0 || fd->fd_mac_len == 0)
1680 		return (EINVAL);
1681 
1682 	if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
1683 		/*
1684 		 * VLAN flows are only supported over ethernet macs.
1685 		 */
1686 		if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
1687 			return (EINVAL);
1688 
1689 		if (fd->fd_vid == 0)
1690 			return (EINVAL);
1691 
1692 	}
1693 	flent->fe_match = flow_l2_match;
1694 	return (0);
1695 }
1696 
1697 /*
1698  * Calculates hash index of flow entry.
1699  */
1700 static uint32_t
1701 flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1702 {
1703 	flow_desc_t	*fd = &flent->fe_flow_desc;
1704 
1705 	ASSERT((fd->fd_mask & FLOW_LINK_VID) == 0 && fd->fd_vid == 0);
1706 	return (flow_l2_addrhash(fd->fd_dst_mac,
1707 	    ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
1708 }
1709 
1710 /*
1711  * This is used for duplicate flow checking.
1712  */
1713 /* ARGSUSED */
1714 static boolean_t
1715 flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
1716 {
1717 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
1718 
1719 	ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
1720 	return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
1721 	    fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
1722 }
1723 
1724 /*
1725  * Generic flow entry insertion function.
1726  * Used by flow tables that do not have ordering requirements.
1727  */
1728 /* ARGSUSED */
1729 static int
1730 flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
1731     flow_entry_t *flent)
1732 {
1733 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
1734 
1735 	if (*headp != NULL) {
1736 		ASSERT(flent->fe_next == NULL);
1737 		flent->fe_next = *headp;
1738 	}
1739 	*headp = flent;
1740 	return (0);
1741 }
1742 
1743 /*
1744  * IP version independent DSField matching function.
1745  */
1746 /* ARGSUSED */
1747 static boolean_t
1748 flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1749 {
1750 	flow_l3info_t	*l3info = &s->fs_l3info;
1751 	flow_desc_t	*fd = &flent->fe_flow_desc;
1752 
1753 	switch (l3info->l3_version) {
1754 	case IPV4_VERSION: {
1755 		ipha_t		*ipha = (ipha_t *)l3info->l3_start;
1756 
1757 		return ((ipha->ipha_type_of_service &
1758 		    fd->fd_dsfield_mask) == fd->fd_dsfield);
1759 	}
1760 	case IPV6_VERSION: {
1761 		ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
1762 
1763 		return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
1764 		    fd->fd_dsfield_mask) == fd->fd_dsfield);
1765 	}
1766 	default:
1767 		return (B_FALSE);
1768 	}
1769 }
1770 
1771 /*
1772  * IP v4 and v6 address matching.
1773  * The netmask only needs to be applied on the packet but not on the
1774  * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets.
1775  */
1776 
1777 /* ARGSUSED */
1778 static boolean_t
1779 flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1780 {
1781 	flow_l3info_t	*l3info = &s->fs_l3info;
1782 	flow_desc_t	*fd = &flent->fe_flow_desc;
1783 	ipha_t		*ipha = (ipha_t *)l3info->l3_start;
1784 	in_addr_t	addr;
1785 
1786 	addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
1787 	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1788 		return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
1789 		    V4_PART_OF_V6(fd->fd_local_addr));
1790 	}
1791 	return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
1792 	    V4_PART_OF_V6(fd->fd_remote_addr));
1793 }
1794 
1795 /* ARGSUSED */
1796 static boolean_t
1797 flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1798 {
1799 	flow_l3info_t	*l3info = &s->fs_l3info;
1800 	flow_desc_t	*fd = &flent->fe_flow_desc;
1801 	ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
1802 	in6_addr_t	*addrp;
1803 
1804 	addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
1805 	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1806 		return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
1807 		    fd->fd_local_addr));
1808 	}
1809 	return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
1810 }
1811 
1812 /* ARGSUSED */
1813 static boolean_t
1814 flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1815 {
1816 	flow_l3info_t	*l3info = &s->fs_l3info;
1817 	flow_desc_t	*fd = &flent->fe_flow_desc;
1818 
1819 	return (l3info->l3_protocol == fd->fd_protocol);
1820 }
1821 
1822 static uint32_t
1823 flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
1824 {
1825 	flow_l3info_t	*l3info = &s->fs_l3info;
1826 	flow_mask_t	mask = ft->ft_mask;
1827 
1828 	if ((mask & FLOW_IP_LOCAL) != 0) {
1829 		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
1830 	} else if ((mask & FLOW_IP_REMOTE) != 0) {
1831 		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
1832 	} else if ((mask & FLOW_IP_DSFIELD) != 0) {
1833 		/*
1834 		 * DSField flents are arranged as a single list.
1835 		 */
1836 		return (0);
1837 	}
1838 	/*
1839 	 * IP addr flents are hashed into two lists, v4 or v6.
1840 	 */
1841 	ASSERT(ft->ft_size >= 2);
1842 	return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
1843 }
1844 
1845 static uint32_t
1846 flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
1847 {
1848 	flow_l3info_t	*l3info = &s->fs_l3info;
1849 
1850 	return (l3info->l3_protocol % ft->ft_size);
1851 }
1852 
1853 /* ARGSUSED */
1854 static int
1855 flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
1856 {
1857 	flow_l2info_t	*l2info = &s->fs_l2info;
1858 	flow_l3info_t	*l3info = &s->fs_l3info;
1859 	uint16_t	sap = l2info->l2_sap;
1860 	uchar_t		*l3_start;
1861 
1862 	l3_start = l2info->l2_start + l2info->l2_hdrsize;
1863 
1864 	/*
1865 	 * Adjust start pointer if we're at the end of an mblk.
1866 	 */
1867 	CHECK_AND_ADJUST_START_PTR(s, l3_start);
1868 
1869 	l3info->l3_start = l3_start;
1870 	if (!OK_32PTR(l3_start))
1871 		return (EINVAL);
1872 
1873 	switch (sap) {
1874 	case ETHERTYPE_IP: {
1875 		ipha_t	*ipha = (ipha_t *)l3_start;
1876 
1877 		if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
1878 			return (ENOBUFS);
1879 
1880 		l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
1881 		l3info->l3_protocol = ipha->ipha_protocol;
1882 		l3info->l3_version = IPV4_VERSION;
1883 		l3info->l3_fragmented =
1884 		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
1885 		break;
1886 	}
1887 	case ETHERTYPE_IPV6: {
1888 		ip6_t   *ip6h = (ip6_t *)l3_start;
1889 		uint16_t ip6_hdrlen;
1890 		uint8_t	 nexthdr;
1891 
1892 		if (!mac_ip_hdr_length_v6(s->fs_mp, ip6h, &ip6_hdrlen,
1893 		    &nexthdr)) {
1894 			return (ENOBUFS);
1895 		}
1896 		l3info->l3_hdrsize = ip6_hdrlen;
1897 		l3info->l3_protocol = nexthdr;
1898 		l3info->l3_version = IPV6_VERSION;
1899 		l3info->l3_fragmented = B_FALSE;
1900 		break;
1901 	}
1902 	default:
1903 		return (EINVAL);
1904 	}
1905 	return (0);
1906 }
1907 
1908 /* ARGSUSED */
1909 static int
1910 flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1911 {
1912 	flow_desc_t	*fd = &flent->fe_flow_desc;
1913 
1914 	switch (fd->fd_protocol) {
1915 	case IPPROTO_TCP:
1916 	case IPPROTO_UDP:
1917 	case IPPROTO_SCTP:
1918 	case IPPROTO_ICMP:
1919 	case IPPROTO_ICMPV6:
1920 		flent->fe_match = flow_ip_proto_match;
1921 		return (0);
1922 	default:
1923 		return (EINVAL);
1924 	}
1925 }
1926 
1927 /* ARGSUSED */
1928 static int
1929 flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1930 {
1931 	flow_desc_t	*fd = &flent->fe_flow_desc;
1932 	flow_mask_t	mask;
1933 	uint8_t		version;
1934 	in6_addr_t	*addr, *netmask;
1935 
1936 	/*
1937 	 * DSField does not require a IP version.
1938 	 */
1939 	if (fd->fd_mask == FLOW_IP_DSFIELD) {
1940 		if (fd->fd_dsfield_mask == 0)
1941 			return (EINVAL);
1942 
1943 		flent->fe_match = flow_ip_dsfield_match;
1944 		return (0);
1945 	}
1946 
1947 	/*
1948 	 * IP addresses must come with a version to avoid ambiguity.
1949 	 */
1950 	if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
1951 		return (EINVAL);
1952 
1953 	version = fd->fd_ipversion;
1954 	if (version != IPV4_VERSION && version != IPV6_VERSION)
1955 		return (EINVAL);
1956 
1957 	mask = fd->fd_mask & ~FLOW_IP_VERSION;
1958 	switch (mask) {
1959 	case FLOW_IP_LOCAL:
1960 		addr = &fd->fd_local_addr;
1961 		netmask = &fd->fd_local_netmask;
1962 		break;
1963 	case FLOW_IP_REMOTE:
1964 		addr = &fd->fd_remote_addr;
1965 		netmask = &fd->fd_remote_netmask;
1966 		break;
1967 	default:
1968 		return (EINVAL);
1969 	}
1970 
1971 	/*
1972 	 * Apply netmask onto specified address.
1973 	 */
1974 	V6_MASK_COPY(*addr, *netmask, *addr);
1975 	if (version == IPV4_VERSION) {
1976 		ipaddr_t	v4addr = V4_PART_OF_V6((*addr));
1977 		ipaddr_t	v4mask = V4_PART_OF_V6((*netmask));
1978 
1979 		if (v4addr == 0 || v4mask == 0)
1980 			return (EINVAL);
1981 		flent->fe_match = flow_ip_v4_match;
1982 	} else {
1983 		if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
1984 		    IN6_IS_ADDR_UNSPECIFIED(netmask))
1985 			return (EINVAL);
1986 		flent->fe_match = flow_ip_v6_match;
1987 	}
1988 	return (0);
1989 }
1990 
1991 static uint32_t
1992 flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1993 {
1994 	flow_desc_t	*fd = &flent->fe_flow_desc;
1995 
1996 	return (fd->fd_protocol % ft->ft_size);
1997 }
1998 
1999 static uint32_t
2000 flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2001 {
2002 	flow_desc_t	*fd = &flent->fe_flow_desc;
2003 
2004 	/*
2005 	 * DSField flents are arranged as a single list.
2006 	 */
2007 	if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2008 		return (0);
2009 
2010 	/*
2011 	 * IP addr flents are hashed into two lists, v4 or v6.
2012 	 */
2013 	ASSERT(ft->ft_size >= 2);
2014 	return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
2015 }
2016 
2017 /* ARGSUSED */
2018 static boolean_t
2019 flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2020 {
2021 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2022 
2023 	return (fd1->fd_protocol == fd2->fd_protocol);
2024 }
2025 
2026 /* ARGSUSED */
2027 static boolean_t
2028 flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2029 {
2030 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2031 	in6_addr_t	*a1, *m1, *a2, *m2;
2032 
2033 	ASSERT(fd1->fd_mask == fd2->fd_mask);
2034 	if (fd1->fd_mask == FLOW_IP_DSFIELD) {
2035 		return (fd1->fd_dsfield == fd2->fd_dsfield &&
2036 		    fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
2037 	}
2038 
2039 	/*
2040 	 * flow_ip_accept_fe() already validated the version.
2041 	 */
2042 	ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
2043 	if (fd1->fd_ipversion != fd2->fd_ipversion)
2044 		return (B_FALSE);
2045 
2046 	switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
2047 	case FLOW_IP_LOCAL:
2048 		a1 = &fd1->fd_local_addr;
2049 		m1 = &fd1->fd_local_netmask;
2050 		a2 = &fd2->fd_local_addr;
2051 		m2 = &fd2->fd_local_netmask;
2052 		break;
2053 	case FLOW_IP_REMOTE:
2054 		a1 = &fd1->fd_remote_addr;
2055 		m1 = &fd1->fd_remote_netmask;
2056 		a2 = &fd2->fd_remote_addr;
2057 		m2 = &fd2->fd_remote_netmask;
2058 		break;
2059 	default:
2060 		/*
2061 		 * This is unreachable given the checks in
2062 		 * flow_ip_accept_fe().
2063 		 */
2064 		return (B_FALSE);
2065 	}
2066 
2067 	if (fd1->fd_ipversion == IPV4_VERSION) {
2068 		return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
2069 		    V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));
2070 
2071 	} else {
2072 		return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
2073 		    IN6_ARE_ADDR_EQUAL(m1, m2));
2074 	}
2075 }
2076 
2077 static int
2078 flow_ip_mask2plen(in6_addr_t *v6mask)
2079 {
2080 	int		bits;
2081 	int		plen = IPV6_ABITS;
2082 	int		i;
2083 
2084 	for (i = 3; i >= 0; i--) {
2085 		if (v6mask->s6_addr32[i] == 0) {
2086 			plen -= 32;
2087 			continue;
2088 		}
2089 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
2090 		if (bits == 0)
2091 			break;
2092 		plen -= bits;
2093 	}
2094 	return (plen);
2095 }
2096 
2097 /* ARGSUSED */
2098 static int
2099 flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
2100     flow_entry_t *flent)
2101 {
2102 	flow_entry_t	**p = headp;
2103 	flow_desc_t	*fd0, *fd;
2104 	in6_addr_t	*m0, *m;
2105 	int		plen0, plen;
2106 
2107 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
2108 
2109 	/*
2110 	 * No special ordering needed for dsfield.
2111 	 */
2112 	fd0 = &flent->fe_flow_desc;
2113 	if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
2114 		if (*p != NULL) {
2115 			ASSERT(flent->fe_next == NULL);
2116 			flent->fe_next = *p;
2117 		}
2118 		*p = flent;
2119 		return (0);
2120 	}
2121 
2122 	/*
2123 	 * IP address flows are arranged in descending prefix length order.
2124 	 */
2125 	m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
2126 	    &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
2127 	plen0 = flow_ip_mask2plen(m0);
2128 	ASSERT(plen0 != 0);
2129 
2130 	for (; *p != NULL; p = &(*p)->fe_next) {
2131 		fd = &(*p)->fe_flow_desc;
2132 
2133 		/*
2134 		 * Normally a dsfield flent shouldn't end up on the same
2135 		 * list as an IP address because flow tables are (for now)
2136 		 * disjoint. If we decide to support both IP and dsfield
2137 		 * in the same table in the future, this check will allow
2138 		 * for that.
2139 		 */
2140 		if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2141 			continue;
2142 
2143 		/*
2144 		 * We also allow for the mixing of local and remote address
2145 		 * flents within one list.
2146 		 */
2147 		m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
2148 		    &fd->fd_local_netmask : &fd->fd_remote_netmask;
2149 		plen = flow_ip_mask2plen(m);
2150 
2151 		if (plen <= plen0)
2152 			break;
2153 	}
2154 	if (*p != NULL) {
2155 		ASSERT(flent->fe_next == NULL);
2156 		flent->fe_next = *p;
2157 	}
2158 	*p = flent;
2159 	return (0);
2160 }
2161 
2162 /*
2163  * Transport layer protocol and port matching functions.
2164  */
2165 
2166 /* ARGSUSED */
2167 static boolean_t
2168 flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2169 {
2170 	flow_l3info_t	*l3info = &s->fs_l3info;
2171 	flow_l4info_t	*l4info = &s->fs_l4info;
2172 	flow_desc_t	*fd = &flent->fe_flow_desc;
2173 
2174 	return (fd->fd_protocol == l3info->l3_protocol &&
2175 	    fd->fd_local_port == l4info->l4_hash_port);
2176 }
2177 
2178 /* ARGSUSED */
2179 static boolean_t
2180 flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2181 {
2182 	flow_l3info_t	*l3info = &s->fs_l3info;
2183 	flow_l4info_t	*l4info = &s->fs_l4info;
2184 	flow_desc_t	*fd = &flent->fe_flow_desc;
2185 
2186 	return (fd->fd_protocol == l3info->l3_protocol &&
2187 	    fd->fd_remote_port == l4info->l4_hash_port);
2188 }
2189 
2190 /*
2191  * Transport hash function.
2192  * Since we only support either local or remote port flows,
2193  * we only need to extract one of the ports to be used for
2194  * matching.
2195  */
2196 static uint32_t
2197 flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
2198 {
2199 	flow_l3info_t	*l3info = &s->fs_l3info;
2200 	flow_l4info_t	*l4info = &s->fs_l4info;
2201 	uint8_t		proto = l3info->l3_protocol;
2202 	boolean_t	dst_or_src;
2203 
2204 	if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
2205 		dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
2206 	} else {
2207 		dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
2208 	}
2209 
2210 	l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
2211 	    l4info->l4_src_port;
2212 
2213 	return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
2214 }
2215 
2216 /*
2217  * Unlike other accept() functions above, we do not need to get the header
2218  * size because this is our highest layer so far. If we want to do support
2219  * other higher layer protocols, we would need to save the l4_hdrsize
2220  * in the code below.
2221  */
2222 
2223 /* ARGSUSED */
2224 static int
2225 flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
2226 {
2227 	flow_l3info_t	*l3info = &s->fs_l3info;
2228 	flow_l4info_t	*l4info = &s->fs_l4info;
2229 	uint8_t		proto = l3info->l3_protocol;
2230 	uchar_t		*l4_start;
2231 
2232 	l4_start = l3info->l3_start + l3info->l3_hdrsize;
2233 
2234 	/*
2235 	 * Adjust start pointer if we're at the end of an mblk.
2236 	 */
2237 	CHECK_AND_ADJUST_START_PTR(s, l4_start);
2238 
2239 	l4info->l4_start = l4_start;
2240 	if (!OK_32PTR(l4_start))
2241 		return (EINVAL);
2242 
2243 	if (l3info->l3_fragmented == B_TRUE)
2244 		return (EINVAL);
2245 
2246 	switch (proto) {
2247 	case IPPROTO_TCP: {
2248 		struct tcphdr	*tcph = (struct tcphdr *)l4_start;
2249 
2250 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
2251 			return (ENOBUFS);
2252 
2253 		l4info->l4_src_port = tcph->th_sport;
2254 		l4info->l4_dst_port = tcph->th_dport;
2255 		break;
2256 	}
2257 	case IPPROTO_UDP: {
2258 		struct udphdr	*udph = (struct udphdr *)l4_start;
2259 
2260 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
2261 			return (ENOBUFS);
2262 
2263 		l4info->l4_src_port = udph->uh_sport;
2264 		l4info->l4_dst_port = udph->uh_dport;
2265 		break;
2266 	}
2267 	case IPPROTO_SCTP: {
2268 		sctp_hdr_t	*sctph = (sctp_hdr_t *)l4_start;
2269 
2270 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
2271 			return (ENOBUFS);
2272 
2273 		l4info->l4_src_port = sctph->sh_sport;
2274 		l4info->l4_dst_port = sctph->sh_dport;
2275 		break;
2276 	}
2277 	default:
2278 		return (EINVAL);
2279 	}
2280 
2281 	return (0);
2282 }
2283 
2284 /*
2285  * Validates transport flow entry.
2286  * The protocol field must be present.
2287  */
2288 
2289 /* ARGSUSED */
2290 static int
2291 flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2292 {
2293 	flow_desc_t	*fd = &flent->fe_flow_desc;
2294 	flow_mask_t	mask = fd->fd_mask;
2295 
2296 	if ((mask & FLOW_IP_PROTOCOL) == 0)
2297 		return (EINVAL);
2298 
2299 	switch (fd->fd_protocol) {
2300 	case IPPROTO_TCP:
2301 	case IPPROTO_UDP:
2302 	case IPPROTO_SCTP:
2303 		break;
2304 	default:
2305 		return (EINVAL);
2306 	}
2307 
2308 	switch (mask & ~FLOW_IP_PROTOCOL) {
2309 	case FLOW_ULP_PORT_LOCAL:
2310 		if (fd->fd_local_port == 0)
2311 			return (EINVAL);
2312 
2313 		flent->fe_match = flow_transport_lport_match;
2314 		break;
2315 	case FLOW_ULP_PORT_REMOTE:
2316 		if (fd->fd_remote_port == 0)
2317 			return (EINVAL);
2318 
2319 		flent->fe_match = flow_transport_rport_match;
2320 		break;
2321 	case 0:
2322 		/*
2323 		 * transport-only flows conflicts with our table type.
2324 		 */
2325 		return (EOPNOTSUPP);
2326 	default:
2327 		return (EINVAL);
2328 	}
2329 
2330 	return (0);
2331 }
2332 
2333 static uint32_t
2334 flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2335 {
2336 	flow_desc_t	*fd = &flent->fe_flow_desc;
2337 	uint16_t	port = 0;
2338 
2339 	port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
2340 	    fd->fd_local_port : fd->fd_remote_port;
2341 
2342 	return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
2343 }
2344 
2345 /* ARGSUSED */
2346 static boolean_t
2347 flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2348 {
2349 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2350 
2351 	if (fd1->fd_protocol != fd2->fd_protocol)
2352 		return (B_FALSE);
2353 
2354 	if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
2355 		return (fd1->fd_local_port == fd2->fd_local_port);
2356 
2357 	if ((fd1->fd_mask & FLOW_ULP_PORT_REMOTE) != 0)
2358 		return (fd1->fd_remote_port == fd2->fd_remote_port);
2359 
2360 	return (B_TRUE);
2361 }
2362 
2363 static flow_ops_t flow_l2_ops = {
2364 	flow_l2_accept_fe,
2365 	flow_l2_hash_fe,
2366 	flow_l2_match_fe,
2367 	flow_generic_insert_fe,
2368 	flow_l2_hash,
2369 	{flow_l2_accept}
2370 };
2371 
2372 static flow_ops_t flow_ip_ops = {
2373 	flow_ip_accept_fe,
2374 	flow_ip_hash_fe,
2375 	flow_ip_match_fe,
2376 	flow_ip_insert_fe,
2377 	flow_ip_hash,
2378 	{flow_l2_accept, flow_ip_accept}
2379 };
2380 
2381 static flow_ops_t flow_ip_proto_ops = {
2382 	flow_ip_proto_accept_fe,
2383 	flow_ip_proto_hash_fe,
2384 	flow_ip_proto_match_fe,
2385 	flow_generic_insert_fe,
2386 	flow_ip_proto_hash,
2387 	{flow_l2_accept, flow_ip_accept}
2388 };
2389 
2390 static flow_ops_t flow_transport_ops = {
2391 	flow_transport_accept_fe,
2392 	flow_transport_hash_fe,
2393 	flow_transport_match_fe,
2394 	flow_generic_insert_fe,
2395 	flow_transport_hash,
2396 	{flow_l2_accept, flow_ip_accept, flow_transport_accept}
2397 };
2398 
2399 static flow_tab_info_t flow_tab_info_list[] = {
2400 	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
2401 	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
2402 	{&flow_ip_ops, FLOW_IP_DSFIELD, 1},
2403 	{&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
2404 	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024},
2405 	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_REMOTE, 1024}
2406 };
2407 
2408 #define	FLOW_MAX_TAB_INFO \
2409 	((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t))
2410 
2411 static flow_tab_info_t *
2412 mac_flow_tab_info_get(flow_mask_t mask)
2413 {
2414 	int	i;
2415 
2416 	for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
2417 		if (mask == flow_tab_info_list[i].fti_mask)
2418 			return (&flow_tab_info_list[i]);
2419 	}
2420 	return (NULL);
2421 }
2422