xref: /illumos-gate/usr/src/uts/common/io/mac/mac_flow.c (revision 3d393ee6c37fa10ac512ed6d36109ad616dc7c1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/strsun.h>
28 #include <sys/sdt.h>
29 #include <sys/mac.h>
30 #include <sys/mac_impl.h>
31 #include <sys/mac_client_impl.h>
32 #include <sys/dls.h>
33 #include <sys/dls_impl.h>
34 #include <sys/mac_soft_ring.h>
35 #include <sys/ethernet.h>
36 #include <sys/vlan.h>
37 #include <inet/ip.h>
38 #include <inet/ip6.h>
39 #include <netinet/tcp.h>
40 #include <netinet/udp.h>
41 #include <netinet/sctp.h>
42 
43 /* global flow table, will be a per exclusive-zone table later */
44 static mod_hash_t	*flow_hash;
45 static krwlock_t	flow_tab_lock;
46 
47 static kmem_cache_t	*flow_cache;
48 static kmem_cache_t	*flow_tab_cache;
49 static flow_ops_t	flow_l2_ops;
50 
51 typedef struct {
52 	const char	*fs_name;
53 	uint_t		fs_offset;
54 } flow_stats_info_t;
55 
56 #define	FS_OFF(f)	(offsetof(flow_stats_t, f))
57 static flow_stats_info_t flow_stats_list[] = {
58 	{"rbytes",	FS_OFF(fs_rbytes)},
59 	{"ipackets",	FS_OFF(fs_ipackets)},
60 	{"ierrors",	FS_OFF(fs_ierrors)},
61 	{"obytes",	FS_OFF(fs_obytes)},
62 	{"opackets",	FS_OFF(fs_opackets)},
63 	{"oerrors",	FS_OFF(fs_oerrors)}
64 };
65 #define	FS_SIZE		(sizeof (flow_stats_list) / sizeof (flow_stats_info_t))
66 
67 /*
68  * Checks whether a flow mask is legal.
69  */
70 static flow_tab_info_t	*mac_flow_tab_info_get(flow_mask_t);
71 
72 static void
73 flow_stat_init(kstat_named_t *knp)
74 {
75 	int	i;
76 
77 	for (i = 0; i < FS_SIZE; i++, knp++) {
78 		kstat_named_init(knp, flow_stats_list[i].fs_name,
79 		    KSTAT_DATA_UINT64);
80 	}
81 }
82 
83 static int
84 flow_stat_update(kstat_t *ksp, int rw)
85 {
86 	flow_entry_t		*fep = ksp->ks_private;
87 	flow_stats_t 		*fsp = &fep->fe_flowstats;
88 	kstat_named_t		*knp = ksp->ks_data;
89 	uint64_t		*statp;
90 	zoneid_t		zid;
91 	int			i;
92 
93 	if (rw != KSTAT_READ)
94 		return (EACCES);
95 
96 	zid = getzoneid();
97 	if (zid != GLOBAL_ZONEID && zid != fep->fe_zoneid) {
98 		for (i = 0; i < FS_SIZE; i++, knp++)
99 			knp->value.ui64 = 0;
100 
101 		return (0);
102 	}
103 
104 	for (i = 0; i < FS_SIZE; i++, knp++) {
105 		statp = (uint64_t *)
106 		    ((uchar_t *)fsp + flow_stats_list[i].fs_offset);
107 
108 		knp->value.ui64 = *statp;
109 	}
110 	return (0);
111 }
112 
113 static void
114 flow_stat_create(flow_entry_t *fep)
115 {
116 	kstat_t		*ksp;
117 	kstat_named_t	*knp;
118 	uint_t		nstats = FS_SIZE;
119 
120 	ksp = kstat_create("unix", 0, (char *)fep->fe_flow_name, "flow",
121 	    KSTAT_TYPE_NAMED, nstats, 0);
122 	if (ksp == NULL)
123 		return;
124 
125 	ksp->ks_update = flow_stat_update;
126 	ksp->ks_private = fep;
127 	fep->fe_ksp = ksp;
128 
129 	knp = (kstat_named_t *)ksp->ks_data;
130 	flow_stat_init(knp);
131 	kstat_install(ksp);
132 }
133 
134 void
135 flow_stat_destroy(flow_entry_t *fep)
136 {
137 	if (fep->fe_ksp != NULL) {
138 		kstat_delete(fep->fe_ksp);
139 		fep->fe_ksp = NULL;
140 	}
141 }
142 
143 /*
144  * Initialize the flow table
145  */
146 void
147 mac_flow_init()
148 {
149 	flow_cache = kmem_cache_create("flow_entry_cache",
150 	    sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
151 	flow_tab_cache = kmem_cache_create("flow_tab_cache",
152 	    sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
153 	flow_hash = mod_hash_create_extended("flow_hash",
154 	    100, mod_hash_null_keydtor, mod_hash_null_valdtor,
155 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
156 	rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
157 }
158 
159 /*
160  * Cleanup and release the flow table
161  */
162 void
163 mac_flow_fini()
164 {
165 	kmem_cache_destroy(flow_cache);
166 	kmem_cache_destroy(flow_tab_cache);
167 	mod_hash_destroy_hash(flow_hash);
168 	rw_destroy(&flow_tab_lock);
169 }
170 
171 /*
172  * mac_create_flow(): create a flow_entry_t.
173  */
174 int
175 mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
176     void *client_cookie, uint_t type, flow_entry_t **flentp)
177 {
178 	flow_entry_t	*flent = *flentp;
179 	int		err = 0;
180 
181 	if (mrp != NULL) {
182 		err = mac_validate_props(mrp);
183 		if (err != 0)
184 			return (err);
185 	}
186 
187 	if (flent == NULL) {
188 		flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
189 		bzero(flent, sizeof (*flent));
190 		mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
191 		cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);
192 
193 		/* Initialize the receiver function to a safe routine */
194 		flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
195 		flent->fe_index = -1;
196 	}
197 	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
198 
199 	/* This is an initial flow, will be configured later */
200 	if (fd == NULL) {
201 		*flentp = flent;
202 		return (0);
203 	}
204 
205 	flent->fe_client_cookie = client_cookie;
206 	flent->fe_type = type;
207 
208 	/*
209 	 * As flow creation is only allowed in global zone, this will
210 	 * always set fe_zoneid to GLOBAL_ZONEID, and dls_add_flow() will
211 	 * later set the right value.
212 	 */
213 	flent->fe_zoneid = getzoneid();
214 
215 	/* Save flow desc */
216 	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
217 
218 	if (mrp != NULL) {
219 		/*
220 		 * We have already set fe_resource_props for a Link.
221 		 */
222 		if (type & FLOW_USER) {
223 			bcopy(mrp, &flent->fe_resource_props,
224 			    sizeof (mac_resource_props_t));
225 		}
226 		/*
227 		 * The effective resource list should reflect the priority
228 		 * that we set implicitly.
229 		 */
230 		if (!(mrp->mrp_mask & MRP_PRIORITY))
231 			mrp->mrp_mask |= MRP_PRIORITY;
232 		if (type & FLOW_USER)
233 			mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
234 		else
235 			mrp->mrp_priority = MPL_LINK_DEFAULT;
236 		bcopy(mrp, &flent->fe_effective_props,
237 		    sizeof (mac_resource_props_t));
238 	}
239 	flow_stat_create(flent);
240 
241 	*flentp = flent;
242 	return (0);
243 }
244 
245 /*
246  * Validate flow entry and add it to a flow table.
247  */
248 int
249 mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
250 {
251 	flow_entry_t	**headp, **p;
252 	flow_ops_t	*ops = &ft->ft_ops;
253 	flow_mask_t	mask;
254 	uint32_t	index;
255 	int		err;
256 
257 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
258 
259 	/*
260 	 * Check for invalid bits in mask.
261 	 */
262 	mask = flent->fe_flow_desc.fd_mask;
263 	if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
264 		return (EOPNOTSUPP);
265 
266 	/*
267 	 * Validate flent.
268 	 */
269 	if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
270 		DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
271 		    flow_entry_t *, flent, int, err);
272 		return (err);
273 	}
274 
275 	/*
276 	 * Flent is valid. now calculate hash and insert it
277 	 * into hash table.
278 	 */
279 	index = ops->fo_hash_fe(ft, flent);
280 
281 	/*
282 	 * We do not need a lock up until now because we were
283 	 * not accessing the flow table.
284 	 */
285 	rw_enter(&ft->ft_lock, RW_WRITER);
286 	headp = &ft->ft_table[index];
287 
288 	/*
289 	 * Check for duplicate flow.
290 	 */
291 	for (p = headp; *p != NULL; p = &(*p)->fe_next) {
292 		if ((*p)->fe_flow_desc.fd_mask !=
293 		    flent->fe_flow_desc.fd_mask)
294 			continue;
295 
296 		if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
297 			rw_exit(&ft->ft_lock);
298 			DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
299 			    flow_entry_t *, flent, int, err);
300 			return (EALREADY);
301 		}
302 	}
303 
304 	/*
305 	 * Insert flow to hash list.
306 	 */
307 	err = ops->fo_insert_fe(ft, headp, flent);
308 	if (err != 0) {
309 		rw_exit(&ft->ft_lock);
310 		DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
311 		    flow_entry_t *, flent, int, err);
312 		return (err);
313 	}
314 
315 	/*
316 	 * Save the hash index so it can be used by mac_flow_remove().
317 	 */
318 	flent->fe_index = (int)index;
319 
320 	/*
321 	 * Save the flow tab back reference.
322 	 */
323 	flent->fe_flow_tab = ft;
324 	FLOW_MARK(flent, FE_FLOW_TAB);
325 	ft->ft_flow_count++;
326 	rw_exit(&ft->ft_lock);
327 	return (0);
328 }
329 
330 /*
331  * Remove a flow from a mac client's subflow table
332  */
333 void
334 mac_flow_rem_subflow(flow_entry_t *flent)
335 {
336 	flow_tab_t		*ft = flent->fe_flow_tab;
337 	mac_client_impl_t	*mcip = ft->ft_mcip;
338 
339 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
340 
341 	mac_flow_remove(ft, flent, B_FALSE);
342 	if (flent->fe_mcip == NULL) {
343 		/*
344 		 * The interface is not yet plumbed and mac_client_flow_add
345 		 * was not done.
346 		 */
347 		if (FLOW_TAB_EMPTY(ft)) {
348 			mac_flow_tab_destroy(ft);
349 			mcip->mci_subflow_tab = NULL;
350 		}
351 		return;
352 	}
353 	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
354 	mac_link_flow_clean((mac_client_handle_t)mcip, flent);
355 }
356 
357 /*
358  * Add a flow to a mac client's subflow table and instantiate the flow
359  * in the mac by creating the associated SRSs etc.
360  */
361 int
362 mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
363     boolean_t instantiate_flow)
364 {
365 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
366 	flow_tab_info_t		*ftinfo;
367 	flow_mask_t		mask;
368 	flow_tab_t		*ft;
369 	int			err;
370 	boolean_t		ft_created = B_FALSE;
371 
372 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
373 
374 	/*
375 	 * If the subflow table exists already just add the new subflow
376 	 * to the existing table, else we create a new subflow table below.
377 	 */
378 	ft = mcip->mci_subflow_tab;
379 	if (ft == NULL) {
380 		mask = flent->fe_flow_desc.fd_mask;
381 		/*
382 		 * Try to create a new table and then add the subflow to the
383 		 * newly created subflow table
384 		 */
385 		if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL)
386 			return (EOPNOTSUPP);
387 
388 		mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
389 		    mcip->mci_mip, &ft);
390 		ft_created = B_TRUE;
391 	}
392 
393 	err = mac_flow_add(ft, flent);
394 	if (err != 0) {
395 		if (ft_created)
396 			mac_flow_tab_destroy(ft);
397 		return (err);
398 	}
399 
400 	if (instantiate_flow) {
401 		/* Now activate the flow by creating its SRSs */
402 		ASSERT(MCIP_DATAPATH_SETUP(mcip));
403 		err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
404 		if (err != 0) {
405 			mac_flow_remove(ft, flent, B_FALSE);
406 			if (ft_created)
407 				mac_flow_tab_destroy(ft);
408 			return (err);
409 		}
410 	} else {
411 		FLOW_MARK(flent, FE_UF_NO_DATAPATH);
412 	}
413 	if (ft_created) {
414 		ASSERT(mcip->mci_subflow_tab == NULL);
415 		ft->ft_mcip = mcip;
416 		mcip->mci_subflow_tab = ft;
417 		if (instantiate_flow)
418 			mac_client_update_classifier(mcip, B_TRUE);
419 	}
420 	return (0);
421 }
422 
423 /*
424  * Remove flow entry from flow table.
425  */
426 void
427 mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
428 {
429 	flow_entry_t	**fp;
430 
431 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
432 	if (!(flent->fe_flags & FE_FLOW_TAB))
433 		return;
434 
435 	rw_enter(&ft->ft_lock, RW_WRITER);
436 	/*
437 	 * If this is a permanent removal from the flow table, mark it
438 	 * CONDEMNED to prevent future references. If this is a temporary
439 	 * removal from the table, say to update the flow descriptor then
440 	 * we don't mark it CONDEMNED
441 	 */
442 	if (!temp)
443 		FLOW_MARK(flent, FE_CONDEMNED);
444 	/*
445 	 * Locate the specified flent.
446 	 */
447 	fp = &ft->ft_table[flent->fe_index];
448 	while (*fp != flent)
449 		fp = &(*fp)->fe_next;
450 
451 	/*
452 	 * The flent must exist. Otherwise it's a bug.
453 	 */
454 	ASSERT(fp != NULL);
455 	*fp = flent->fe_next;
456 	flent->fe_next = NULL;
457 
458 	/*
459 	 * Reset fe_index to -1 so any attempt to call mac_flow_remove()
460 	 * on a flent that is supposed to be in the table (FE_FLOW_TAB)
461 	 * will panic.
462 	 */
463 	flent->fe_index = -1;
464 	FLOW_UNMARK(flent, FE_FLOW_TAB);
465 	ft->ft_flow_count--;
466 	rw_exit(&ft->ft_lock);
467 }
468 
469 /*
470  * This is the flow lookup routine used by the mac sw classifier engine.
471  */
472 int
473 mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
474 {
475 	flow_state_t	s;
476 	flow_entry_t	*flent;
477 	flow_ops_t	*ops = &ft->ft_ops;
478 	boolean_t	retried = B_FALSE;
479 	int		i, err;
480 
481 	s.fs_flags = flags;
482 	s.fs_mp = mp;
483 retry:
484 
485 	/*
486 	 * Walk the list of predeclared accept functions.
487 	 * Each of these would accumulate enough state to allow the next
488 	 * accept routine to make progress.
489 	 */
490 	for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
491 		if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
492 			/*
493 			 * ENOBUFS indicates that the mp could be too short
494 			 * and may need a pullup.
495 			 */
496 			if (err != ENOBUFS || retried)
497 				return (err);
498 
499 			/*
500 			 * Don't modify the mblk if there are references to it.
501 			 * Also, there is no point pulling up if b_cont is NULL.
502 			 */
503 			if (DB_REF(mp) > 1 || mp->b_cont == NULL ||
504 			    pullupmsg(mp, -1) == 0)
505 				return (EINVAL);
506 
507 			retried = B_TRUE;
508 			DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
509 			    flow_state_t *, &s);
510 			goto retry;
511 		}
512 	}
513 
514 	/*
515 	 * The packet is considered sane. We may now attempt to
516 	 * find the corresponding flent.
517 	 */
518 	rw_enter(&ft->ft_lock, RW_READER);
519 	flent = ft->ft_table[ops->fo_hash(ft, &s)];
520 	for (; flent != NULL; flent = flent->fe_next) {
521 		if (flent->fe_match(ft, flent, &s)) {
522 			FLOW_TRY_REFHOLD(flent, err);
523 			if (err != 0)
524 				continue;
525 			*flentp = flent;
526 			rw_exit(&ft->ft_lock);
527 			return (0);
528 		}
529 	}
530 	rw_exit(&ft->ft_lock);
531 	return (ENOENT);
532 }
533 
534 /*
535  * Walk flow table.
536  * The caller is assumed to have proper perimeter protection.
537  */
538 int
539 mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
540     void *arg)
541 {
542 	int		err, i, cnt = 0;
543 	flow_entry_t	*flent;
544 
545 	if (ft == NULL)
546 		return (0);
547 
548 	for (i = 0; i < ft->ft_size; i++) {
549 		for (flent = ft->ft_table[i]; flent != NULL;
550 		    flent = flent->fe_next) {
551 			cnt++;
552 			err = (*fn)(flent, arg);
553 			if (err != 0)
554 				return (err);
555 		}
556 	}
557 	VERIFY(cnt == ft->ft_flow_count);
558 	return (0);
559 }
560 
561 /*
562  * Same as the above except a mutex is used for protection here.
563  */
564 int
565 mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
566     void *arg)
567 {
568 	int		err;
569 
570 	if (ft == NULL)
571 		return (0);
572 
573 	rw_enter(&ft->ft_lock, RW_WRITER);
574 	err = mac_flow_walk_nolock(ft, fn, arg);
575 	rw_exit(&ft->ft_lock);
576 	return (err);
577 }
578 
579 static boolean_t	mac_flow_clean(flow_entry_t *);
580 
581 /*
582  * Destroy a flow entry. Called when the last reference on a flow is released.
583  */
584 void
585 mac_flow_destroy(flow_entry_t *flent)
586 {
587 	ASSERT(flent->fe_refcnt == 0);
588 
589 	if ((flent->fe_type & FLOW_USER) != 0) {
590 		ASSERT(mac_flow_clean(flent));
591 	} else {
592 		mac_flow_cleanup(flent);
593 	}
594 
595 	mutex_destroy(&flent->fe_lock);
596 	cv_destroy(&flent->fe_cv);
597 	flow_stat_destroy(flent);
598 	kmem_cache_free(flow_cache, flent);
599 }
600 
601 /*
602  * XXX eric
603  * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
604  * mac_link_flow_modify() should really be moved/reworked into the
605  * two functions below. This would consolidate all the mac property
606  * checking in one place. I'm leaving this alone for now since it's
607  * out of scope of the new flows work.
608  */
609 /* ARGSUSED */
610 uint32_t
611 mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
612 {
613 	uint32_t		changed_mask = 0;
614 	mac_resource_props_t	*fmrp = &flent->fe_effective_props;
615 	int			i;
616 
617 	if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
618 	    (fmrp->mrp_maxbw != mrp->mrp_maxbw)) {
619 		changed_mask |= MRP_MAXBW;
620 		fmrp->mrp_maxbw = mrp->mrp_maxbw;
621 		if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
622 			fmrp->mrp_mask &= ~MRP_MAXBW;
623 		} else {
624 			fmrp->mrp_mask |= MRP_MAXBW;
625 		}
626 	}
627 
628 	if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
629 		if (fmrp->mrp_priority != mrp->mrp_priority)
630 			changed_mask |= MRP_PRIORITY;
631 		if (mrp->mrp_priority == MPL_RESET) {
632 			fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
633 			fmrp->mrp_mask &= ~MRP_PRIORITY;
634 		} else {
635 			fmrp->mrp_priority = mrp->mrp_priority;
636 			fmrp->mrp_mask |= MRP_PRIORITY;
637 		}
638 	}
639 
640 	/* modify fanout */
641 	if ((mrp->mrp_mask & MRP_CPUS) != 0) {
642 		if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
643 		    (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
644 			for (i = 0; i < mrp->mrp_ncpus; i++) {
645 				if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
646 					break;
647 			}
648 			if (i == mrp->mrp_ncpus) {
649 				/*
650 				 * The new set of cpus passed is exactly
651 				 * the same as the existing set.
652 				 */
653 				return (changed_mask);
654 			}
655 		}
656 		changed_mask |= MRP_CPUS;
657 		MAC_COPY_CPUS(mrp, fmrp);
658 	}
659 	return (changed_mask);
660 }
661 
662 void
663 mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
664 {
665 	uint32_t changed_mask;
666 	mac_client_impl_t *mcip = flent->fe_mcip;
667 	mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
668 
669 	ASSERT(flent != NULL);
670 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
671 
672 	rw_enter(&ft->ft_lock, RW_WRITER);
673 
674 	/* Update the cached values inside the subflow entry */
675 	changed_mask = mac_flow_modify_props(flent, mrp);
676 	rw_exit(&ft->ft_lock);
677 	/*
678 	 * Push the changed parameters to the scheduling code in the
679 	 * SRS's, to take effect right away.
680 	 */
681 	if (changed_mask & MRP_MAXBW) {
682 		mac_srs_update_bwlimit(flent, mrp);
683 		/*
684 		 * If bandwidth is changed, we may have to change
685 		 * the number of soft ring to be used for fanout.
686 		 * Call mac_flow_update_fanout() if MAC_BIND_CPU
687 		 * is not set and there is no user supplied cpu
688 		 * info. This applies only to link at this time.
689 		 */
690 		if (!(flent->fe_type & FLOW_USER) &&
691 		    !(changed_mask & MRP_CPUS) &&
692 		    !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
693 			mac_fanout_setup(mcip, flent, mcip_mrp,
694 			    mac_rx_deliver, mcip, NULL);
695 		}
696 	}
697 	if (mrp->mrp_mask & MRP_PRIORITY)
698 		mac_flow_update_priority(mcip, flent);
699 
700 	if (changed_mask & MRP_CPUS)
701 		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL);
702 }
703 
704 /*
705  * This function waits for a certain condition to be met and is generally
706  * used before a destructive or quiescing operation.
707  */
708 void
709 mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
710 {
711 	mutex_enter(&flent->fe_lock);
712 	flent->fe_flags |= FE_WAITER;
713 
714 	switch (event) {
715 	case FLOW_DRIVER_UPCALL:
716 		/*
717 		 * We want to make sure the driver upcalls have finished before
718 		 * we signal the Rx SRS worker to quit.
719 		 */
720 		while (flent->fe_refcnt != 1)
721 			cv_wait(&flent->fe_cv, &flent->fe_lock);
722 		break;
723 
724 	case FLOW_USER_REF:
725 		/*
726 		 * Wait for the fe_user_refcnt to drop to 0. The flow has
727 		 * been removed from the global flow hash.
728 		 */
729 		ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH));
730 		while (flent->fe_user_refcnt != 0)
731 			cv_wait(&flent->fe_cv, &flent->fe_lock);
732 		break;
733 
734 	default:
735 		ASSERT(0);
736 	}
737 
738 	flent->fe_flags &= ~FE_WAITER;
739 	mutex_exit(&flent->fe_lock);
740 }
741 
742 static boolean_t
743 mac_flow_clean(flow_entry_t *flent)
744 {
745 	ASSERT(flent->fe_next == NULL);
746 	ASSERT(flent->fe_tx_srs == NULL);
747 	ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL);
748 	ASSERT(flent->fe_mbg == NULL);
749 
750 	return (B_TRUE);
751 }
752 
753 void
754 mac_flow_cleanup(flow_entry_t *flent)
755 {
756 	if ((flent->fe_type & FLOW_USER) == 0) {
757 		ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) ||
758 		    (flent->fe_mbg != NULL && flent->fe_mcip == NULL));
759 		ASSERT(flent->fe_refcnt == 0);
760 	} else {
761 		ASSERT(flent->fe_refcnt == 1);
762 	}
763 
764 	if (flent->fe_mbg != NULL) {
765 		ASSERT(flent->fe_tx_srs == NULL);
766 		/* This is a multicast or broadcast flow entry */
767 		mac_bcast_grp_free(flent->fe_mbg);
768 		flent->fe_mbg = NULL;
769 	}
770 
771 	if (flent->fe_tx_srs != NULL) {
772 		ASSERT(flent->fe_mbg == NULL);
773 		mac_srs_free(flent->fe_tx_srs);
774 		flent->fe_tx_srs = NULL;
775 	}
776 
777 	/*
778 	 * In the normal case fe_rx_srs_cnt is 1. However in the error case
779 	 * when mac_unicast_add fails we may not have set up any SRS
780 	 * in which case fe_rx_srs_cnt will be zero.
781 	 */
782 	if (flent->fe_rx_srs_cnt != 0) {
783 		ASSERT(flent->fe_rx_srs_cnt == 1);
784 		mac_srs_free(flent->fe_rx_srs[0]);
785 		flent->fe_rx_srs[0] = NULL;
786 		flent->fe_rx_srs_cnt = 0;
787 	}
788 	ASSERT(flent->fe_rx_srs[0] == NULL);
789 }
790 
791 void
792 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
793 {
794 	/*
795 	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
796 	 * Updates to the fe_flow_desc happen under the fe_lock
797 	 * after removing the flent from the flow table
798 	 */
799 	mutex_enter(&flent->fe_lock);
800 	bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
801 	mutex_exit(&flent->fe_lock);
802 }
803 
804 /*
805  * Update a field of a flow entry. The mac perimeter ensures that
806  * this is the only thread doing a modify operation on this mac end point.
807  * So the flow table can't change or disappear. The ft_lock protects access
808  * to the flow entry, and holding the lock ensures that there isn't any thread
809  * accessing the flow entry or attempting a flow table lookup. However
810  * data threads that are using the flow entry based on the old descriptor
811  * will continue to use the flow entry. If strong coherence is required
812  * then the flow will have to be quiesced before the descriptor can be
813  * changed.
814  */
815 void
816 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
817 {
818 	flow_tab_t	*ft = flent->fe_flow_tab;
819 	flow_desc_t	old_desc;
820 	int		err;
821 
822 	if (ft == NULL) {
823 		/*
824 		 * The flow hasn't yet been inserted into the table,
825 		 * so only the caller knows about this flow, however for
826 		 * uniformity we grab the fe_lock here.
827 		 */
828 		mutex_enter(&flent->fe_lock);
829 		bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
830 		mutex_exit(&flent->fe_lock);
831 	}
832 
833 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
834 
835 	/*
836 	 * Need to remove the flow entry from the table and reinsert it,
837 	 * into a potentially diference hash line. The hash depends on
838 	 * the new descriptor fields. However access to fe_desc itself
839 	 * is always under the fe_lock. This helps log and stat functions
840 	 * see a self-consistent fe_flow_desc.
841 	 */
842 	mac_flow_remove(ft, flent, B_TRUE);
843 	old_desc = flent->fe_flow_desc;
844 
845 	mutex_enter(&flent->fe_lock);
846 	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
847 	mutex_exit(&flent->fe_lock);
848 
849 	if (mac_flow_add(ft, flent) != 0) {
850 		/*
851 		 * The add failed say due to an invalid flow descriptor.
852 		 * Undo the update
853 		 */
854 		flent->fe_flow_desc = old_desc;
855 		err = mac_flow_add(ft, flent);
856 		ASSERT(err == 0);
857 	}
858 }
859 
860 void
861 mac_flow_set_name(flow_entry_t *flent, const char *name)
862 {
863 	flow_tab_t	*ft = flent->fe_flow_tab;
864 
865 	if (ft == NULL) {
866 		/*
867 		 *  The flow hasn't yet been inserted into the table,
868 		 * so only the caller knows about this flow
869 		 */
870 		(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
871 	} else {
872 		ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
873 	}
874 
875 	mutex_enter(&flent->fe_lock);
876 	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
877 	mutex_exit(&flent->fe_lock);
878 }
879 
880 /*
881  * Return the client-private cookie that was associated with
882  * the flow when it was created.
883  */
884 void *
885 mac_flow_get_client_cookie(flow_entry_t *flent)
886 {
887 	return (flent->fe_client_cookie);
888 }
889 
890 /*
891  * Forward declarations.
892  */
893 static uint32_t	flow_l2_hash(flow_tab_t *, flow_state_t *);
894 static int	flow_l2_accept(flow_tab_t *, flow_state_t *);
895 static uint32_t	flow_ether_hash(flow_tab_t *, flow_state_t *);
896 static int	flow_ether_accept(flow_tab_t *, flow_state_t *);
897 
898 /*
899  * Create flow table.
900  */
901 void
902 mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
903     mac_impl_t *mip, flow_tab_t **ftp)
904 {
905 	flow_tab_t	*ft;
906 	flow_ops_t	*new_ops;
907 
908 	ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
909 	bzero(ft, sizeof (*ft));
910 
911 	ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);
912 
913 	/*
914 	 * We make a copy of the ops vector instead of just pointing to it
915 	 * because we might want to customize the ops vector on a per table
916 	 * basis (e.g. for optimization).
917 	 */
918 	new_ops = &ft->ft_ops;
919 	bcopy(ops, new_ops, sizeof (*ops));
920 	ft->ft_mask = mask;
921 	ft->ft_size = size;
922 	ft->ft_mip = mip;
923 
924 	/*
925 	 * Optimization for DL_ETHER media.
926 	 */
927 	if (mip->mi_info.mi_nativemedia == DL_ETHER) {
928 		if (new_ops->fo_hash == flow_l2_hash)
929 			new_ops->fo_hash = flow_ether_hash;
930 
931 		if (new_ops->fo_accept[0] == flow_l2_accept)
932 			new_ops->fo_accept[0] = flow_ether_accept;
933 
934 	}
935 	*ftp = ft;
936 }
937 
938 void
939 mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
940 {
941 	mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
942 	    1024, mip, ftp);
943 }
944 
945 /*
946  * Destroy flow table.
947  */
948 void
949 mac_flow_tab_destroy(flow_tab_t *ft)
950 {
951 	if (ft == NULL)
952 		return;
953 
954 	ASSERT(ft->ft_flow_count == 0);
955 	kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
956 	bzero(ft, sizeof (*ft));
957 	kmem_cache_free(flow_tab_cache, ft);
958 }
959 
960 /*
961  * Add a new flow entry to the global flow hash table
962  */
963 int
964 mac_flow_hash_add(flow_entry_t *flent)
965 {
966 	int	err;
967 
968 	rw_enter(&flow_tab_lock, RW_WRITER);
969 	err = mod_hash_insert(flow_hash,
970 	    (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
971 	if (err != 0) {
972 		rw_exit(&flow_tab_lock);
973 		return (EEXIST);
974 	}
975 	/* Mark as inserted into the global flow hash table */
976 	FLOW_MARK(flent, FE_G_FLOW_HASH);
977 	rw_exit(&flow_tab_lock);
978 	return (err);
979 }
980 
981 /*
982  * Remove a flow entry from the global flow hash table
983  */
984 void
985 mac_flow_hash_remove(flow_entry_t *flent)
986 {
987 	mod_hash_val_t	val;
988 
989 	rw_enter(&flow_tab_lock, RW_WRITER);
990 	VERIFY(mod_hash_remove(flow_hash,
991 	    (mod_hash_key_t)flent->fe_flow_name, &val) == 0);
992 
993 	/* Clear the mark that says inserted into the global flow hash table */
994 	FLOW_UNMARK(flent, FE_G_FLOW_HASH);
995 	rw_exit(&flow_tab_lock);
996 }
997 
998 /*
999  * Retrieve a flow entry from the global flow hash table.
1000  */
1001 int
1002 mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
1003 {
1004 	int		err;
1005 	flow_entry_t	*flent;
1006 
1007 	rw_enter(&flow_tab_lock, RW_READER);
1008 	err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
1009 	    (mod_hash_val_t *)&flent);
1010 	if (err != 0) {
1011 		rw_exit(&flow_tab_lock);
1012 		return (ENOENT);
1013 	}
1014 	ASSERT(flent != NULL);
1015 	FLOW_USER_REFHOLD(flent);
1016 	rw_exit(&flow_tab_lock);
1017 
1018 	*flentp = flent;
1019 	return (0);
1020 }
1021 
1022 /*
1023  * Initialize or release mac client flows by walking the subflow table.
1024  * These are typically invoked during plumb/unplumb of links.
1025  */
1026 
1027 static int
1028 mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
1029 {
1030 	mac_client_impl_t	*mcip = arg;
1031 
1032 	if (mac_link_flow_init(arg, flent) != 0) {
1033 		cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
1034 		    flent->fe_flow_name, mcip->mci_name);
1035 	} else {
1036 		FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
1037 	}
1038 	return (0);
1039 }
1040 
1041 void
1042 mac_link_init_flows(mac_client_handle_t mch)
1043 {
1044 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1045 
1046 	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1047 	    mac_link_init_flows_cb, mcip);
1048 	/*
1049 	 * If mac client had subflow(s) configured before plumb, change
1050 	 * function to mac_rx_srs_subflow_process and in case of hardware
1051 	 * classification, disable polling.
1052 	 */
1053 	mac_client_update_classifier(mcip, B_TRUE);
1054 
1055 }
1056 
1057 boolean_t
1058 mac_link_has_flows(mac_client_handle_t mch)
1059 {
1060 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1061 
1062 	if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
1063 		return (B_TRUE);
1064 
1065 	return (B_FALSE);
1066 }
1067 
1068 static int
1069 mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
1070 {
1071 	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
1072 	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1073 	mac_link_flow_clean(arg, flent);
1074 	return (0);
1075 }
1076 
1077 void
1078 mac_link_release_flows(mac_client_handle_t mch)
1079 {
1080 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1081 
1082 	/*
1083 	 * Change the mci_flent callback back to mac_rx_srs_process()
1084 	 * because flows are about to be deactivated.
1085 	 */
1086 	mac_client_update_classifier(mcip, B_FALSE);
1087 	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1088 	    mac_link_release_flows_cb, mcip);
1089 }
1090 
1091 void
1092 mac_rename_flow(flow_entry_t *fep, const char *new_name)
1093 {
1094 	mac_flow_set_name(fep, new_name);
1095 	if (fep->fe_ksp != NULL) {
1096 		flow_stat_destroy(fep);
1097 		flow_stat_create(fep);
1098 	}
1099 }
1100 
1101 /*
1102  * mac_link_flow_init()
1103  * Internal flow interface used for allocating SRSs and related
1104  * data structures. Not meant to be used by mac clients.
1105  */
1106 int
1107 mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
1108 {
1109 	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
1110 	mac_impl_t		*mip = mcip->mci_mip;
1111 	int			err;
1112 
1113 	ASSERT(mch != NULL);
1114 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1115 
1116 	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
1117 		return (err);
1118 
1119 	sub_flow->fe_mcip = mcip;
1120 
1121 	return (0);
1122 }
1123 
/*
 * mac_link_flow_add()
 * Used by flowadm(1M) or kernel mac clients for creating flows.
 *
 * Creates a flow named 'flow_name' from 'flow_desc' and 'mrp', adds it to
 * the global flow table and to the subflow table of the link 'linkid'.
 *
 * Returns 0 on success, EEXIST if a flow with this name already exists,
 * ENOTSUP if the link is an anchor VNIC, or the error returned by one of
 * the functions called below. On failure every step taken so far is undone
 * and the flow entry is released.
 */
int
mac_link_flow_add(datalink_id_t linkid, char *flow_name,
    flow_desc_t *flow_desc, mac_resource_props_t *mrp)
{
	flow_entry_t		*flent = NULL;
	int			err;
	dls_dl_handle_t		dlh;
	dls_link_t		*dlp;
	boolean_t		link_held = B_FALSE;
	boolean_t		hash_added = B_FALSE;
	mac_perim_handle_t	mph;

	/*
	 * Flow names must be unique: if the lookup succeeds, drop the
	 * reference it returned and fail.
	 */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err == 0) {
		FLOW_USER_REFRELE(flent);
		return (EEXIST);
	}

	/*
	 * First create a flow entry given the description provided
	 * by the caller.
	 */
	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
	    FLOW_USER | FLOW_OTHER, &flent);

	if (err != 0)
		return (err);

	/*
	 * We've got a local variable referencing this flow now, so we need
	 * to hold it. We'll release this flow before returning.
	 * All failures until we return will undo any action that may have
	 * internally held the flow, so the last REFRELE will ensure a clean
	 * freeing of resources.
	 */
	FLOW_REFHOLD(flent);

	flent->fe_link_id = linkid;
	FLOW_MARK(flent, FE_INCIPIENT);

	/*
	 * Nothing else has been set up yet, so on failure only the flow
	 * entry itself has to be released.
	 */
	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0) {
		FLOW_FINAL_REFRELE(flent);
		return (err);
	}

	/*
	 * dls will eventually be merged with mac so it's ok
	 * to call dls' internal functions.
	 */
	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
	if (err != 0)
		goto bail;

	/* Tell the bail path that the link hold must be dropped. */
	link_held = B_TRUE;

	/*
	 * Add the flow to the global flow table, this table will be per
	 * exclusive zone so each zone can have its own flow namespace.
	 * RFE 6625651 will fix this.
	 */
	if ((err = mac_flow_hash_add(flent)) != 0)
		goto bail;

	/* Tell the bail path that the hash entry must be removed. */
	hash_added = B_TRUE;

	/*
	 * Do not allow flows to be configured on an anchor VNIC.
	 */
	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
		err = ENOTSUP;
		goto bail;
	}

	/*
	 * Save the zoneid of the underlying link in the flow entry,
	 * this is needed to prevent a non-global zone from getting
	 * statistics information of the global zone.
	 */
	flent->fe_zoneid = dlp->dl_zid;

	/*
	 * Add the subflow to the subflow table. Also instantiate the flow
	 * in the mac if there is an active DLS user. The dl_mah is set when
	 * dls_active_set() is called, typically during interface plumb.
	 */
	err = mac_flow_add_subflow(dlp->dl_mch, flent, dlp->dl_mah != NULL);
	if (err != 0)
		goto bail;

	/* Success: the flow is fully set up, so clear the incipient mark. */
	FLOW_UNMARK(flent, FE_INCIPIENT);
	dls_devnet_rele_link(dlh, dlp);
	mac_perim_exit(mph);
	return (0);

bail:
	/* Undo, in reverse order, whatever was done before the failure. */
	if (hash_added)
		mac_flow_hash_remove(flent);

	if (link_held)
		dls_devnet_rele_link(dlh, dlp);

	/*
	 * Wait for any transient global flow hash refs to clear
	 * and then release the creation reference on the flow.
	 */
	mac_flow_wait(flent, FLOW_USER_REF);
	FLOW_FINAL_REFRELE(flent);
	mac_perim_exit(mph);
	return (err);
}
1240 
1241 /*
1242  * mac_link_flow_clean()
1243  * Internal flow interface used for freeing SRSs and related
1244  * data structures. Not meant to be used by mac clients.
1245  */
1246 void
1247 mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
1248 {
1249 	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
1250 	mac_impl_t		*mip = mcip->mci_mip;
1251 	boolean_t		last_subflow;
1252 
1253 	ASSERT(mch != NULL);
1254 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1255 
1256 	/*
1257 	 * This sub flow entry may fail to be fully initialized by
1258 	 * mac_link_flow_init(). If so, simply return.
1259 	 */
1260 	if (sub_flow->fe_mcip == NULL)
1261 		return;
1262 
1263 	last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
1264 	/*
1265 	 * Tear down the data path
1266 	 */
1267 	mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
1268 	sub_flow->fe_mcip = NULL;
1269 
1270 	/*
1271 	 * Delete the SRSs associated with this subflow. If this is being
1272 	 * driven by flowadm(1M) then the subflow will be deleted by
1273 	 * dls_rem_flow. However if this is a result of the interface being
1274 	 * unplumbed then the subflow itself won't be deleted.
1275 	 */
1276 	mac_flow_cleanup(sub_flow);
1277 
1278 	/*
1279 	 * If all the subflows are gone, renable some of the stuff
1280 	 * we disabled when adding a subflow, polling etc.
1281 	 */
1282 	if (last_subflow) {
1283 		/*
1284 		 * The subflow table itself is not protected by any locks or
1285 		 * refcnts. Hence quiesce the client upfront before clearing
1286 		 * mci_subflow_tab.
1287 		 */
1288 		mac_client_quiesce(mcip);
1289 		mac_client_update_classifier(mcip, B_FALSE);
1290 		mac_flow_tab_destroy(mcip->mci_subflow_tab);
1291 		mcip->mci_subflow_tab = NULL;
1292 		mac_client_restart(mcip);
1293 	}
1294 }
1295 
/*
 * mac_link_flow_remove()
 * Used by flowadm(1M) or kernel mac clients for removing flows.
 *
 * Looks up the flow by name, removes it from its subflow table and from
 * the global flow table, and releases the creation reference.
 *
 * Returns 0 on success, otherwise the error from the name lookup or from
 * entering the link's perimeter.
 */
int
mac_link_flow_remove(char *flow_name)
{
	flow_entry_t		*flent;
	mac_perim_handle_t	mph;
	int			err;
	datalink_id_t		linkid;

	/* The first lookup is only used to learn which link the flow is on. */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	linkid = flent->fe_link_id;
	FLOW_USER_REFRELE(flent);

	/*
	 * The perim must be acquired before acquiring any other references
	 * to maintain the lock and perimeter hierarchy. Please note the
	 * FLOW_USER_REFRELE above.
	 */
	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	/*
	 * Note the second lookup of the flow, because a concurrent thread
	 * may have removed it already while we were waiting to enter the
	 * link's perimeter.
	 */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}
	/* flent is still used below after this lookup reference is dropped. */
	FLOW_USER_REFRELE(flent);

	/*
	 * Remove the flow from the subflow table and deactivate the flow
	 * by quiescing and removing its SRSs.
	 */
	mac_flow_rem_subflow(flent);

	/*
	 * Finally, remove the flow from the global table.
	 */
	mac_flow_hash_remove(flent);

	/*
	 * Wait for any transient global flow hash refs to clear
	 * and then release the creation reference on the flow.
	 */
	mac_flow_wait(flent, FLOW_USER_REF);
	FLOW_FINAL_REFRELE(flent);

	mac_perim_exit(mph);

	return (0);
}
1358 
1359 /*
1360  * mac_link_flow_modify()
1361  * Modifies the properties of a flow identified by its name.
1362  */
1363 int
1364 mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
1365 {
1366 	flow_entry_t		*flent;
1367 	mac_client_impl_t 	*mcip;
1368 	int			err = 0;
1369 	mac_perim_handle_t	mph;
1370 	datalink_id_t		linkid;
1371 	flow_tab_t		*flow_tab;
1372 
1373 	err = mac_validate_props(mrp);
1374 	if (err != 0)
1375 		return (err);
1376 
1377 	err = mac_flow_lookup_byname(flow_name, &flent);
1378 	if (err != 0)
1379 		return (err);
1380 
1381 	linkid = flent->fe_link_id;
1382 	FLOW_USER_REFRELE(flent);
1383 
1384 	/*
1385 	 * The perim must be acquired before acquiring any other references
1386 	 * to maintain the lock and perimeter hierarchy. Please note the
1387 	 * FLOW_REFRELE above.
1388 	 */
1389 	err = mac_perim_enter_by_linkid(linkid, &mph);
1390 	if (err != 0)
1391 		return (err);
1392 
1393 	/*
1394 	 * Note the second lookup of the flow, because a concurrent thread
1395 	 * may have removed it already while we were waiting to enter the
1396 	 * link's perimeter.
1397 	 */
1398 	err = mac_flow_lookup_byname(flow_name, &flent);
1399 	if (err != 0) {
1400 		mac_perim_exit(mph);
1401 		return (err);
1402 	}
1403 	FLOW_USER_REFRELE(flent);
1404 
1405 	/*
1406 	 * If this flow is attached to a MAC client, then pass the request
1407 	 * along to the client.
1408 	 * Otherwise, just update the cached values.
1409 	 */
1410 	mcip = flent->fe_mcip;
1411 	mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
1412 	if (mcip != NULL) {
1413 		if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
1414 			err = ENOENT;
1415 		} else {
1416 			mac_flow_modify(flow_tab, flent, mrp);
1417 		}
1418 	} else {
1419 		(void) mac_flow_modify_props(flent, mrp);
1420 	}
1421 
1422 done:
1423 	mac_perim_exit(mph);
1424 	return (err);
1425 }
1426 
1427 
1428 /*
1429  * State structure and misc functions used by mac_link_flow_walk().
1430  */
1431 typedef struct {
1432 	int	(*ws_func)(mac_flowinfo_t *, void *);
1433 	void	*ws_arg;
1434 } flow_walk_state_t;
1435 
1436 static void
1437 mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent)
1438 {
1439 	(void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name,
1440 	    MAXFLOWNAMELEN);
1441 	finfop->fi_link_id = flent->fe_link_id;
1442 	finfop->fi_flow_desc = flent->fe_flow_desc;
1443 	finfop->fi_resource_props = flent->fe_resource_props;
1444 }
1445 
1446 static int
1447 mac_link_flow_walk_cb(flow_entry_t *flent, void *arg)
1448 {
1449 	flow_walk_state_t	*statep = arg;
1450 	mac_flowinfo_t		finfo;
1451 
1452 	mac_link_flowinfo_copy(&finfo, flent);
1453 	return (statep->ws_func(&finfo, statep->ws_arg));
1454 }
1455 
1456 /*
1457  * mac_link_flow_walk()
1458  * Invokes callback 'func' for all flows belonging to the specified link.
1459  */
1460 int
1461 mac_link_flow_walk(datalink_id_t linkid,
1462     int (*func)(mac_flowinfo_t *, void *), void *arg)
1463 {
1464 	mac_client_impl_t	*mcip;
1465 	mac_perim_handle_t	mph;
1466 	flow_walk_state_t	state;
1467 	dls_dl_handle_t		dlh;
1468 	dls_link_t		*dlp;
1469 	int			err;
1470 
1471 	err = mac_perim_enter_by_linkid(linkid, &mph);
1472 	if (err != 0)
1473 		return (err);
1474 
1475 	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1476 	if (err != 0) {
1477 		mac_perim_exit(mph);
1478 		return (err);
1479 	}
1480 
1481 	mcip = (mac_client_impl_t *)dlp->dl_mch;
1482 	state.ws_func = func;
1483 	state.ws_arg = arg;
1484 
1485 	err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
1486 	    mac_link_flow_walk_cb, &state);
1487 
1488 	dls_devnet_rele_link(dlh, dlp);
1489 	mac_perim_exit(mph);
1490 	return (err);
1491 }
1492 
1493 /*
1494  * mac_link_flow_info()
1495  * Retrieves information about a specific flow.
1496  */
1497 int
1498 mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
1499 {
1500 	flow_entry_t	*flent;
1501 	int		err;
1502 
1503 	err = mac_flow_lookup_byname(flow_name, &flent);
1504 	if (err != 0)
1505 		return (err);
1506 
1507 	mac_link_flowinfo_copy(finfo, flent);
1508 	FLOW_USER_REFRELE(flent);
1509 	return (0);
1510 }
1511 
/*
 * Hash of a MAC address and a VLAN ID: the sum of address bytes 3, 4 and 5
 * is XORed with 'v' and reduced modulo 's'. 'a' must therefore have at
 * least six bytes and 's' must be non-zero.
 */
#define	HASH_MAC_VID(a, v, s) \
	((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s))

/*
 * True when the write pointer of the flow state's mblk (fs_mp->b_wptr)
 * lies before 'end', i.e. the bytes up to 'end' are not all in that mblk.
 */
#define	PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end))
1516 
1517 /* ARGSUSED */
1518 static boolean_t
1519 flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1520 {
1521 	flow_l2info_t		*l2 = &s->fs_l2info;
1522 	flow_desc_t		*fd = &flent->fe_flow_desc;
1523 
1524 	return (l2->l2_vid == fd->fd_vid &&
1525 	    bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
1526 }
1527 
1528 /*
1529  * Layer 2 hash function.
1530  * Must be paired with flow_l2_accept() within a set of flow_ops
1531  * because it assumes the dest address is already extracted.
1532  */
1533 static uint32_t
1534 flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
1535 {
1536 	flow_l2info_t		*l2 = &s->fs_l2info;
1537 
1538 	return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
1539 }
1540 
1541 /*
1542  * This is the generic layer 2 accept function.
1543  * It makes use of mac_header_info() to extract the header length,
1544  * sap, vlan ID and destination address.
1545  */
1546 static int
1547 flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
1548 {
1549 	boolean_t		is_ether;
1550 	flow_l2info_t		*l2 = &s->fs_l2info;
1551 	mac_header_info_t	mhi;
1552 	int			err;
1553 
1554 	is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
1555 	if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
1556 	    s->fs_mp, &mhi)) != 0) {
1557 		if (err == EINVAL)
1558 			err = ENOBUFS;
1559 
1560 		return (err);
1561 	}
1562 
1563 	l2->l2_start = s->fs_mp->b_rptr;
1564 	l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;
1565 
1566 	if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
1567 	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1568 		struct ether_vlan_header	*evhp =
1569 		    (struct ether_vlan_header *)l2->l2_start;
1570 
1571 		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1572 			return (ENOBUFS);
1573 
1574 		l2->l2_sap = ntohs(evhp->ether_type);
1575 		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1576 		l2->l2_hdrsize = sizeof (*evhp);
1577 	} else {
1578 		l2->l2_sap = mhi.mhi_bindsap;
1579 		l2->l2_vid = 0;
1580 		l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
1581 	}
1582 	return (0);
1583 }
1584 
1585 /*
1586  * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
1587  * accept(). The notable difference is that dest address is now extracted
1588  * by hash() rather than by accept(). This saves a few memory references
1589  * for flow tables that do not care about mac addresses.
1590  */
1591 static uint32_t
1592 flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
1593 {
1594 	flow_l2info_t			*l2 = &s->fs_l2info;
1595 	struct ether_vlan_header	*evhp;
1596 
1597 	evhp = (struct ether_vlan_header *)l2->l2_start;
1598 	l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
1599 	return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
1600 }
1601 
1602 /* ARGSUSED */
1603 static int
1604 flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
1605 {
1606 	flow_l2info_t			*l2 = &s->fs_l2info;
1607 	struct ether_vlan_header	*evhp;
1608 	uint16_t			sap;
1609 
1610 	evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
1611 	l2->l2_start = (uchar_t *)evhp;
1612 
1613 	if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
1614 		return (ENOBUFS);
1615 
1616 	if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
1617 	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1618 		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1619 			return (ENOBUFS);
1620 
1621 		l2->l2_sap = ntohs(evhp->ether_type);
1622 		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1623 		l2->l2_hdrsize = sizeof (struct ether_vlan_header);
1624 	} else {
1625 		l2->l2_sap = sap;
1626 		l2->l2_vid = 0;
1627 		l2->l2_hdrsize = sizeof (struct ether_header);
1628 	}
1629 	return (0);
1630 }
1631 
1632 /*
1633  * Validates a layer 2 flow entry.
1634  */
1635 static int
1636 flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1637 {
1638 	int		i;
1639 	flow_desc_t	*fd = &flent->fe_flow_desc;
1640 
1641 	/*
1642 	 * Dest address is mandatory.
1643 	 */
1644 	if ((fd->fd_mask & FLOW_LINK_DST) == 0)
1645 		return (EINVAL);
1646 
1647 	for (i = 0; i < fd->fd_mac_len; i++) {
1648 		if (fd->fd_dst_mac[i] != 0)
1649 			break;
1650 	}
1651 	if (i == fd->fd_mac_len || fd->fd_mac_len < ETHERADDRL)
1652 		return (EINVAL);
1653 
1654 	if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
1655 		/*
1656 		 * VLAN flows are only supported over ethernet macs.
1657 		 */
1658 		if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
1659 			return (EINVAL);
1660 
1661 		if (fd->fd_vid == 0)
1662 			return (EINVAL);
1663 
1664 	}
1665 	flent->fe_match = flow_l2_match;
1666 	return (0);
1667 }
1668 
1669 /*
1670  * Calculates hash index of flow entry.
1671  */
1672 static uint32_t
1673 flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1674 {
1675 	flow_desc_t	*fd = &flent->fe_flow_desc;
1676 
1677 	ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
1678 	return (HASH_MAC_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
1679 }
1680 
1681 /*
1682  * This is used for duplicate flow checking.
1683  */
1684 /* ARGSUSED */
1685 static boolean_t
1686 flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
1687 {
1688 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
1689 
1690 	ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
1691 	return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
1692 	    fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
1693 }
1694 
1695 /*
1696  * Generic flow entry insertion function.
1697  * Used by flow tables that do not have ordering requirements.
1698  */
1699 /* ARGSUSED */
1700 static int
1701 flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
1702     flow_entry_t *flent)
1703 {
1704 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
1705 
1706 	if (*headp != NULL) {
1707 		ASSERT(flent->fe_next == NULL);
1708 		flent->fe_next = *headp;
1709 	}
1710 	*headp = flent;
1711 	return (0);
1712 }
1713 
1714 /*
1715  * IP version independent DSField matching function.
1716  */
1717 /* ARGSUSED */
1718 static boolean_t
1719 flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1720 {
1721 	flow_l3info_t	*l3info = &s->fs_l3info;
1722 	flow_desc_t	*fd = &flent->fe_flow_desc;
1723 
1724 	switch (l3info->l3_version) {
1725 	case IPV4_VERSION: {
1726 		ipha_t		*ipha = (ipha_t *)l3info->l3_start;
1727 
1728 		return ((ipha->ipha_type_of_service &
1729 		    fd->fd_dsfield_mask) == fd->fd_dsfield);
1730 	}
1731 	case IPV6_VERSION: {
1732 		ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
1733 
1734 		return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
1735 		    fd->fd_dsfield_mask) == fd->fd_dsfield);
1736 	}
1737 	default:
1738 		return (B_FALSE);
1739 	}
1740 }
1741 
1742 /*
1743  * IP v4 and v6 address matching.
1744  * The netmask only needs to be applied on the packet but not on the
1745  * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets.
1746  */
1747 
1748 /* ARGSUSED */
1749 static boolean_t
1750 flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1751 {
1752 	flow_l3info_t	*l3info = &s->fs_l3info;
1753 	flow_desc_t	*fd = &flent->fe_flow_desc;
1754 	ipha_t		*ipha = (ipha_t *)l3info->l3_start;
1755 	in_addr_t	addr;
1756 
1757 	addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
1758 	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1759 		return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
1760 		    V4_PART_OF_V6(fd->fd_local_addr));
1761 	}
1762 	return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
1763 	    V4_PART_OF_V6(fd->fd_remote_addr));
1764 }
1765 
1766 /* ARGSUSED */
1767 static boolean_t
1768 flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1769 {
1770 	flow_l3info_t	*l3info = &s->fs_l3info;
1771 	flow_desc_t	*fd = &flent->fe_flow_desc;
1772 	ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
1773 	in6_addr_t	*addrp;
1774 
1775 	addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
1776 	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1777 		return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
1778 		    fd->fd_local_addr));
1779 	}
1780 	return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
1781 }
1782 
1783 /* ARGSUSED */
1784 static boolean_t
1785 flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1786 {
1787 	flow_l3info_t	*l3info = &s->fs_l3info;
1788 	flow_desc_t	*fd = &flent->fe_flow_desc;
1789 
1790 	return (l3info->l3_protocol == fd->fd_protocol);
1791 }
1792 
1793 static uint32_t
1794 flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
1795 {
1796 	flow_l3info_t	*l3info = &s->fs_l3info;
1797 	flow_mask_t	mask = ft->ft_mask;
1798 
1799 	if ((mask & FLOW_IP_LOCAL) != 0) {
1800 		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
1801 	} else if ((mask & FLOW_IP_REMOTE) != 0) {
1802 		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
1803 	} else if ((mask & FLOW_IP_DSFIELD) != 0) {
1804 		/*
1805 		 * DSField flents are arranged as a single list.
1806 		 */
1807 		return (0);
1808 	}
1809 	/*
1810 	 * IP addr flents are hashed into two lists, v4 or v6.
1811 	 */
1812 	ASSERT(ft->ft_size >= 2);
1813 	return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
1814 }
1815 
1816 static uint32_t
1817 flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
1818 {
1819 	flow_l3info_t	*l3info = &s->fs_l3info;
1820 
1821 	return (l3info->l3_protocol % ft->ft_size);
1822 }
1823 
1824 /* ARGSUSED */
1825 static int
1826 flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
1827 {
1828 	flow_l2info_t	*l2info = &s->fs_l2info;
1829 	flow_l3info_t	*l3info = &s->fs_l3info;
1830 	uint16_t	sap = l2info->l2_sap;
1831 	uchar_t		*l3_start;
1832 
1833 	l3info->l3_start = l3_start = l2info->l2_start + l2info->l2_hdrsize;
1834 	if (!OK_32PTR(l3_start))
1835 		return (EINVAL);
1836 
1837 	switch (sap) {
1838 	case ETHERTYPE_IP: {
1839 		ipha_t	*ipha = (ipha_t *)l3_start;
1840 
1841 		if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
1842 			return (ENOBUFS);
1843 
1844 		l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
1845 		l3info->l3_protocol = ipha->ipha_protocol;
1846 		l3info->l3_version = IPV4_VERSION;
1847 		l3info->l3_fragmented =
1848 		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
1849 		break;
1850 	}
1851 	case ETHERTYPE_IPV6: {
1852 		ip6_t   *ip6h = (ip6_t *)l3_start;
1853 		uint16_t ip6_hdrlen;
1854 		uint8_t	 nexthdr;
1855 
1856 		if (!mac_ip_hdr_length_v6(s->fs_mp, ip6h, &ip6_hdrlen,
1857 		    &nexthdr)) {
1858 			return (ENOBUFS);
1859 		}
1860 		l3info->l3_hdrsize = ip6_hdrlen;
1861 		l3info->l3_protocol = nexthdr;
1862 		l3info->l3_version = IPV6_VERSION;
1863 		l3info->l3_fragmented = B_FALSE;
1864 		break;
1865 	}
1866 	default:
1867 		return (EINVAL);
1868 	}
1869 	return (0);
1870 }
1871 
1872 /* ARGSUSED */
1873 static int
1874 flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1875 {
1876 	flow_desc_t	*fd = &flent->fe_flow_desc;
1877 
1878 	switch (fd->fd_protocol) {
1879 	case IPPROTO_TCP:
1880 	case IPPROTO_UDP:
1881 	case IPPROTO_SCTP:
1882 	case IPPROTO_ICMP:
1883 	case IPPROTO_ICMPV6:
1884 		flent->fe_match = flow_ip_proto_match;
1885 		return (0);
1886 	default:
1887 		return (EINVAL);
1888 	}
1889 }
1890 
1891 /* ARGSUSED */
1892 static int
1893 flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1894 {
1895 	flow_desc_t	*fd = &flent->fe_flow_desc;
1896 	flow_mask_t	mask;
1897 	uint8_t		version;
1898 	in6_addr_t	*addr, *netmask;
1899 
1900 	/*
1901 	 * DSField does not require a IP version.
1902 	 */
1903 	if (fd->fd_mask == FLOW_IP_DSFIELD) {
1904 		if (fd->fd_dsfield_mask == 0)
1905 			return (EINVAL);
1906 
1907 		flent->fe_match = flow_ip_dsfield_match;
1908 		return (0);
1909 	}
1910 
1911 	/*
1912 	 * IP addresses must come with a version to avoid ambiguity.
1913 	 */
1914 	if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
1915 		return (EINVAL);
1916 
1917 	version = fd->fd_ipversion;
1918 	if (version != IPV4_VERSION && version != IPV6_VERSION)
1919 		return (EINVAL);
1920 
1921 	mask = fd->fd_mask & ~FLOW_IP_VERSION;
1922 	switch (mask) {
1923 	case FLOW_IP_LOCAL:
1924 		addr = &fd->fd_local_addr;
1925 		netmask = &fd->fd_local_netmask;
1926 		break;
1927 	case FLOW_IP_REMOTE:
1928 		addr = &fd->fd_remote_addr;
1929 		netmask = &fd->fd_remote_netmask;
1930 		break;
1931 	default:
1932 		return (EINVAL);
1933 	}
1934 
1935 	/*
1936 	 * Apply netmask onto specified address.
1937 	 */
1938 	V6_MASK_COPY(*addr, *netmask, *addr);
1939 	if (version == IPV4_VERSION) {
1940 		ipaddr_t	v4addr = V4_PART_OF_V6((*addr));
1941 		ipaddr_t	v4mask = V4_PART_OF_V6((*netmask));
1942 
1943 		if (v4addr == 0 || v4mask == 0)
1944 			return (EINVAL);
1945 		flent->fe_match = flow_ip_v4_match;
1946 	} else {
1947 		if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
1948 		    IN6_IS_ADDR_UNSPECIFIED(netmask))
1949 			return (EINVAL);
1950 		flent->fe_match = flow_ip_v6_match;
1951 	}
1952 	return (0);
1953 }
1954 
1955 static uint32_t
1956 flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1957 {
1958 	flow_desc_t	*fd = &flent->fe_flow_desc;
1959 
1960 	return (fd->fd_protocol % ft->ft_size);
1961 }
1962 
1963 static uint32_t
1964 flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1965 {
1966 	flow_desc_t	*fd = &flent->fe_flow_desc;
1967 
1968 	/*
1969 	 * DSField flents are arranged as a single list.
1970 	 */
1971 	if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
1972 		return (0);
1973 
1974 	/*
1975 	 * IP addr flents are hashed into two lists, v4 or v6.
1976 	 */
1977 	ASSERT(ft->ft_size >= 2);
1978 	return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
1979 }
1980 
1981 /* ARGSUSED */
1982 static boolean_t
1983 flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
1984 {
1985 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
1986 
1987 	return (fd1->fd_protocol == fd2->fd_protocol);
1988 }
1989 
1990 /* ARGSUSED */
1991 static boolean_t
1992 flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
1993 {
1994 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
1995 	in6_addr_t	*a1, *m1, *a2, *m2;
1996 
1997 	ASSERT(fd1->fd_mask == fd2->fd_mask);
1998 	if (fd1->fd_mask == FLOW_IP_DSFIELD) {
1999 		return (fd1->fd_dsfield == fd2->fd_dsfield &&
2000 		    fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
2001 	}
2002 
2003 	/*
2004 	 * flow_ip_accept_fe() already validated the version.
2005 	 */
2006 	ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
2007 	if (fd1->fd_ipversion != fd2->fd_ipversion)
2008 		return (B_FALSE);
2009 
2010 	switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
2011 	case FLOW_IP_LOCAL:
2012 		a1 = &fd1->fd_local_addr;
2013 		m1 = &fd1->fd_local_netmask;
2014 		a2 = &fd2->fd_local_addr;
2015 		m2 = &fd2->fd_local_netmask;
2016 		break;
2017 	case FLOW_IP_REMOTE:
2018 		a1 = &fd1->fd_remote_addr;
2019 		m1 = &fd1->fd_remote_netmask;
2020 		a2 = &fd2->fd_remote_addr;
2021 		m2 = &fd2->fd_remote_netmask;
2022 		break;
2023 	default:
2024 		/*
2025 		 * This is unreachable given the checks in
2026 		 * flow_ip_accept_fe().
2027 		 */
2028 		return (B_FALSE);
2029 	}
2030 
2031 	if (fd1->fd_ipversion == IPV4_VERSION) {
2032 		return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
2033 		    V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));
2034 
2035 	} else {
2036 		return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
2037 		    IN6_ARE_ADDR_EQUAL(m1, m2));
2038 	}
2039 }
2040 
2041 static int
2042 flow_ip_mask2plen(in6_addr_t *v6mask)
2043 {
2044 	int		bits;
2045 	int		plen = IPV6_ABITS;
2046 	int		i;
2047 
2048 	for (i = 3; i >= 0; i--) {
2049 		if (v6mask->s6_addr32[i] == 0) {
2050 			plen -= 32;
2051 			continue;
2052 		}
2053 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
2054 		if (bits == 0)
2055 			break;
2056 		plen -= bits;
2057 	}
2058 	return (plen);
2059 }
2060 
2061 /* ARGSUSED */
2062 static int
2063 flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
2064     flow_entry_t *flent)
2065 {
2066 	flow_entry_t	**p = headp;
2067 	flow_desc_t	*fd0, *fd;
2068 	in6_addr_t	*m0, *m;
2069 	int		plen0, plen;
2070 
2071 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
2072 
2073 	/*
2074 	 * No special ordering needed for dsfield.
2075 	 */
2076 	fd0 = &flent->fe_flow_desc;
2077 	if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
2078 		if (*p != NULL) {
2079 			ASSERT(flent->fe_next == NULL);
2080 			flent->fe_next = *p;
2081 		}
2082 		*p = flent;
2083 		return (0);
2084 	}
2085 
2086 	/*
2087 	 * IP address flows are arranged in descending prefix length order.
2088 	 */
2089 	m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
2090 	    &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
2091 	plen0 = flow_ip_mask2plen(m0);
2092 	ASSERT(plen0 != 0);
2093 
2094 	for (; *p != NULL; p = &(*p)->fe_next) {
2095 		fd = &(*p)->fe_flow_desc;
2096 
2097 		/*
2098 		 * Normally a dsfield flent shouldn't end up on the same
2099 		 * list as an IP address because flow tables are (for now)
2100 		 * disjoint. If we decide to support both IP and dsfield
2101 		 * in the same table in the future, this check will allow
2102 		 * for that.
2103 		 */
2104 		if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2105 			continue;
2106 
2107 		/*
2108 		 * We also allow for the mixing of local and remote address
2109 		 * flents within one list.
2110 		 */
2111 		m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
2112 		    &fd->fd_local_netmask : &fd->fd_remote_netmask;
2113 		plen = flow_ip_mask2plen(m);
2114 
2115 		if (plen <= plen0)
2116 			break;
2117 	}
2118 	if (*p != NULL) {
2119 		ASSERT(flent->fe_next == NULL);
2120 		flent->fe_next = *p;
2121 	}
2122 	*p = flent;
2123 	return (0);
2124 }
2125 
2126 /*
2127  * Transport layer protocol and port matching functions.
2128  */
2129 
2130 /* ARGSUSED */
2131 static boolean_t
2132 flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2133 {
2134 	flow_l3info_t	*l3info = &s->fs_l3info;
2135 	flow_l4info_t	*l4info = &s->fs_l4info;
2136 	flow_desc_t	*fd = &flent->fe_flow_desc;
2137 
2138 	return (fd->fd_protocol == l3info->l3_protocol &&
2139 	    fd->fd_local_port == l4info->l4_hash_port);
2140 }
2141 
2142 /* ARGSUSED */
2143 static boolean_t
2144 flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2145 {
2146 	flow_l3info_t	*l3info = &s->fs_l3info;
2147 	flow_l4info_t	*l4info = &s->fs_l4info;
2148 	flow_desc_t	*fd = &flent->fe_flow_desc;
2149 
2150 	return (fd->fd_protocol == l3info->l3_protocol &&
2151 	    fd->fd_remote_port == l4info->l4_hash_port);
2152 }
2153 
2154 /*
2155  * Transport hash function.
2156  * Since we only support either local or remote port flows,
2157  * we only need to extract one of the ports to be used for
2158  * matching.
2159  */
2160 static uint32_t
2161 flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
2162 {
2163 	flow_l3info_t	*l3info = &s->fs_l3info;
2164 	flow_l4info_t	*l4info = &s->fs_l4info;
2165 	uint8_t		proto = l3info->l3_protocol;
2166 	boolean_t	dst_or_src;
2167 
2168 	if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
2169 		dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
2170 	} else {
2171 		dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
2172 	}
2173 
2174 	l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
2175 	    l4info->l4_src_port;
2176 
2177 	return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
2178 }
2179 
2180 /*
2181  * Unlike other accept() functions above, we do not need to get the header
2182  * size because this is our highest layer so far. If we want to do support
2183  * other higher layer protocols, we would need to save the l4_hdrsize
2184  * in the code below.
2185  */
2186 
2187 /* ARGSUSED */
2188 static int
2189 flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
2190 {
2191 	flow_l3info_t	*l3info = &s->fs_l3info;
2192 	flow_l4info_t	*l4info = &s->fs_l4info;
2193 	uint8_t		proto = l3info->l3_protocol;
2194 	uchar_t		*l4_start;
2195 
2196 	l4info->l4_start = l4_start = l3info->l3_start + l3info->l3_hdrsize;
2197 	if (!OK_32PTR(l4_start))
2198 		return (EINVAL);
2199 
2200 	if (l3info->l3_fragmented == B_TRUE)
2201 		return (EINVAL);
2202 
2203 	switch (proto) {
2204 	case IPPROTO_TCP: {
2205 		struct tcphdr	*tcph = (struct tcphdr *)l4_start;
2206 
2207 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
2208 			return (ENOBUFS);
2209 
2210 		l4info->l4_src_port = tcph->th_sport;
2211 		l4info->l4_dst_port = tcph->th_dport;
2212 		break;
2213 	}
2214 	case IPPROTO_UDP: {
2215 		struct udphdr	*udph = (struct udphdr *)l4_start;
2216 
2217 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
2218 			return (ENOBUFS);
2219 
2220 		l4info->l4_src_port = udph->uh_sport;
2221 		l4info->l4_dst_port = udph->uh_dport;
2222 		break;
2223 	}
2224 	case IPPROTO_SCTP: {
2225 		sctp_hdr_t	*sctph = (sctp_hdr_t *)l4_start;
2226 
2227 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
2228 			return (ENOBUFS);
2229 
2230 		l4info->l4_src_port = sctph->sh_sport;
2231 		l4info->l4_dst_port = sctph->sh_dport;
2232 		break;
2233 	}
2234 	default:
2235 		return (EINVAL);
2236 	}
2237 
2238 	return (0);
2239 }
2240 
2241 /*
2242  * Validates transport flow entry.
2243  * The protocol field must be present.
2244  */
2245 
2246 /* ARGSUSED */
2247 static int
2248 flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2249 {
2250 	flow_desc_t	*fd = &flent->fe_flow_desc;
2251 	flow_mask_t	mask = fd->fd_mask;
2252 
2253 	if ((mask & FLOW_IP_PROTOCOL) == 0)
2254 		return (EINVAL);
2255 
2256 	switch (fd->fd_protocol) {
2257 	case IPPROTO_TCP:
2258 	case IPPROTO_UDP:
2259 	case IPPROTO_SCTP:
2260 		break;
2261 	default:
2262 		return (EINVAL);
2263 	}
2264 
2265 	switch (mask & ~FLOW_IP_PROTOCOL) {
2266 	case FLOW_ULP_PORT_LOCAL:
2267 		if (fd->fd_local_port == 0)
2268 			return (EINVAL);
2269 
2270 		flent->fe_match = flow_transport_lport_match;
2271 		break;
2272 	case FLOW_ULP_PORT_REMOTE:
2273 		if (fd->fd_remote_port == 0)
2274 			return (EINVAL);
2275 
2276 		flent->fe_match = flow_transport_rport_match;
2277 		break;
2278 	case 0:
2279 		/*
2280 		 * transport-only flows conflicts with our table type.
2281 		 */
2282 		return (EOPNOTSUPP);
2283 	default:
2284 		return (EINVAL);
2285 	}
2286 
2287 	return (0);
2288 }
2289 
2290 static uint32_t
2291 flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2292 {
2293 	flow_desc_t	*fd = &flent->fe_flow_desc;
2294 	uint16_t	port = 0;
2295 
2296 	port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
2297 	    fd->fd_local_port : fd->fd_remote_port;
2298 
2299 	return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
2300 }
2301 
2302 /* ARGSUSED */
2303 static boolean_t
2304 flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2305 {
2306 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2307 
2308 	if (fd1->fd_protocol != fd2->fd_protocol)
2309 		return (B_FALSE);
2310 
2311 	if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
2312 		return (fd1->fd_local_port == fd2->fd_local_port);
2313 
2314 	return (fd1->fd_remote_port == fd2->fd_remote_port);
2315 }
2316 
2317 static flow_ops_t flow_l2_ops = {
2318 	flow_l2_accept_fe,
2319 	flow_l2_hash_fe,
2320 	flow_l2_match_fe,
2321 	flow_generic_insert_fe,
2322 	flow_l2_hash,
2323 	{flow_l2_accept}
2324 };
2325 
2326 static flow_ops_t flow_ip_ops = {
2327 	flow_ip_accept_fe,
2328 	flow_ip_hash_fe,
2329 	flow_ip_match_fe,
2330 	flow_ip_insert_fe,
2331 	flow_ip_hash,
2332 	{flow_l2_accept, flow_ip_accept}
2333 };
2334 
2335 static flow_ops_t flow_ip_proto_ops = {
2336 	flow_ip_proto_accept_fe,
2337 	flow_ip_proto_hash_fe,
2338 	flow_ip_proto_match_fe,
2339 	flow_generic_insert_fe,
2340 	flow_ip_proto_hash,
2341 	{flow_l2_accept, flow_ip_accept}
2342 };
2343 
2344 static flow_ops_t flow_transport_ops = {
2345 	flow_transport_accept_fe,
2346 	flow_transport_hash_fe,
2347 	flow_transport_match_fe,
2348 	flow_generic_insert_fe,
2349 	flow_transport_hash,
2350 	{flow_l2_accept, flow_ip_accept, flow_transport_accept}
2351 };
2352 
2353 static flow_tab_info_t flow_tab_info_list[] = {
2354 	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
2355 	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
2356 	{&flow_ip_ops, FLOW_IP_DSFIELD, 1},
2357 	{&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
2358 	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024}
2359 };
2360 
/* Number of entries in flow_tab_info_list. */
#define	FLOW_MAX_TAB_INFO \
	(sizeof (flow_tab_info_list) / sizeof (flow_tab_info_list[0]))
2363 
2364 static flow_tab_info_t *
2365 mac_flow_tab_info_get(flow_mask_t mask)
2366 {
2367 	int	i;
2368 
2369 	for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
2370 		if (mask == flow_tab_info_list[i].fti_mask)
2371 			return (&flow_tab_info_list[i]);
2372 	}
2373 	return (NULL);
2374 }
2375