xref: /illumos-gate/usr/src/uts/common/io/mac/mac_flow.c (revision c211fc479225fa54805cf480633bf6689ca9a2db)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/strsun.h>
28 #include <sys/sdt.h>
29 #include <sys/mac.h>
30 #include <sys/mac_impl.h>
31 #include <sys/mac_client_impl.h>
32 #include <sys/dls.h>
33 #include <sys/dls_impl.h>
34 #include <sys/mac_soft_ring.h>
35 #include <sys/ethernet.h>
36 #include <sys/vlan.h>
37 #include <inet/ip.h>
38 #include <inet/ip6.h>
39 #include <netinet/tcp.h>
40 #include <netinet/udp.h>
41 #include <netinet/sctp.h>
42 
43 /* global flow table, will be a per exclusive-zone table later */
44 static mod_hash_t	*flow_hash;
45 static krwlock_t	flow_tab_lock;
46 
47 static kmem_cache_t	*flow_cache;
48 static kmem_cache_t	*flow_tab_cache;
49 static flow_ops_t	flow_l2_ops;
50 
51 typedef struct {
52 	const char	*fs_name;
53 	uint_t		fs_offset;
54 } flow_stats_info_t;
55 
56 #define	FS_OFF(f)	(offsetof(flow_stats_t, f))
57 static flow_stats_info_t flow_stats_list[] = {
58 	{"rbytes",	FS_OFF(fs_rbytes)},
59 	{"ipackets",	FS_OFF(fs_ipackets)},
60 	{"ierrors",	FS_OFF(fs_ierrors)},
61 	{"obytes",	FS_OFF(fs_obytes)},
62 	{"opackets",	FS_OFF(fs_opackets)},
63 	{"oerrors",	FS_OFF(fs_oerrors)}
64 };
65 #define	FS_SIZE		(sizeof (flow_stats_list) / sizeof (flow_stats_info_t))
66 
67 /*
68  * Checks whether a flow mask is legal.
69  */
70 static flow_tab_info_t	*mac_flow_tab_info_get(flow_mask_t);
71 
72 static void
73 flow_stat_init(kstat_named_t *knp)
74 {
75 	int	i;
76 
77 	for (i = 0; i < FS_SIZE; i++, knp++) {
78 		kstat_named_init(knp, flow_stats_list[i].fs_name,
79 		    KSTAT_DATA_UINT64);
80 	}
81 }
82 
83 static int
84 flow_stat_update(kstat_t *ksp, int rw)
85 {
86 	flow_entry_t		*fep = ksp->ks_private;
87 	flow_stats_t 		*fsp = &fep->fe_flowstats;
88 	kstat_named_t		*knp = ksp->ks_data;
89 	uint64_t		*statp;
90 	zoneid_t		zid;
91 	int			i;
92 
93 	if (rw != KSTAT_READ)
94 		return (EACCES);
95 
96 	zid = getzoneid();
97 	if (zid != GLOBAL_ZONEID && zid != fep->fe_zoneid) {
98 		for (i = 0; i < FS_SIZE; i++, knp++)
99 			knp->value.ui64 = 0;
100 
101 		return (0);
102 	}
103 
104 	for (i = 0; i < FS_SIZE; i++, knp++) {
105 		statp = (uint64_t *)
106 		    ((uchar_t *)fsp + flow_stats_list[i].fs_offset);
107 
108 		knp->value.ui64 = *statp;
109 	}
110 	return (0);
111 }
112 
113 static void
114 flow_stat_create(flow_entry_t *fep)
115 {
116 	kstat_t		*ksp;
117 	kstat_named_t	*knp;
118 	uint_t		nstats = FS_SIZE;
119 
120 	ksp = kstat_create("unix", 0, (char *)fep->fe_flow_name, "flow",
121 	    KSTAT_TYPE_NAMED, nstats, 0);
122 	if (ksp == NULL)
123 		return;
124 
125 	ksp->ks_update = flow_stat_update;
126 	ksp->ks_private = fep;
127 	fep->fe_ksp = ksp;
128 
129 	knp = (kstat_named_t *)ksp->ks_data;
130 	flow_stat_init(knp);
131 	kstat_install(ksp);
132 }
133 
134 void
135 flow_stat_destroy(flow_entry_t *fep)
136 {
137 	if (fep->fe_ksp != NULL) {
138 		kstat_delete(fep->fe_ksp);
139 		fep->fe_ksp = NULL;
140 	}
141 }
142 
143 /*
144  * Initialize the flow table
145  */
146 void
147 mac_flow_init()
148 {
149 	flow_cache = kmem_cache_create("flow_entry_cache",
150 	    sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
151 	flow_tab_cache = kmem_cache_create("flow_tab_cache",
152 	    sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
153 	flow_hash = mod_hash_create_extended("flow_hash",
154 	    100, mod_hash_null_keydtor, mod_hash_null_valdtor,
155 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
156 	rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
157 }
158 
159 /*
160  * Cleanup and release the flow table
161  */
162 void
163 mac_flow_fini()
164 {
165 	kmem_cache_destroy(flow_cache);
166 	kmem_cache_destroy(flow_tab_cache);
167 	mod_hash_destroy_hash(flow_hash);
168 	rw_destroy(&flow_tab_lock);
169 }
170 
171 /*
172  * mac_create_flow(): create a flow_entry_t.
173  */
174 int
175 mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
176     void *client_cookie, uint_t type, flow_entry_t **flentp)
177 {
178 	flow_entry_t	*flent = *flentp;
179 	int		err = 0;
180 
181 	if (mrp != NULL) {
182 		err = mac_validate_props(mrp);
183 		if (err != 0)
184 			return (err);
185 	}
186 
187 	if (flent == NULL) {
188 		flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
189 		bzero(flent, sizeof (*flent));
190 		mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
191 		cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);
192 
193 		/* Initialize the receiver function to a safe routine */
194 		flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
195 		flent->fe_index = -1;
196 	}
197 	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
198 
199 	/* This is an initial flow, will be configured later */
200 	if (fd == NULL) {
201 		*flentp = flent;
202 		return (0);
203 	}
204 
205 	flent->fe_client_cookie = client_cookie;
206 	flent->fe_type = type;
207 
208 	/*
209 	 * As flow creation is only allowed in global zone, this will
210 	 * always set fe_zoneid to GLOBAL_ZONEID, and dls_add_flow() will
211 	 * later set the right value.
212 	 */
213 	flent->fe_zoneid = getzoneid();
214 
215 	/* Save flow desc */
216 	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
217 
218 	if (mrp != NULL) {
219 		/*
220 		 * We have already set fe_resource_props for a Link.
221 		 */
222 		if (type & FLOW_USER) {
223 			bcopy(mrp, &flent->fe_resource_props,
224 			    sizeof (mac_resource_props_t));
225 		}
226 		/*
227 		 * The effective resource list should reflect the priority
228 		 * that we set implicitly.
229 		 */
230 		if (!(mrp->mrp_mask & MRP_PRIORITY))
231 			mrp->mrp_mask |= MRP_PRIORITY;
232 		if (type & FLOW_USER)
233 			mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
234 		else
235 			mrp->mrp_priority = MPL_LINK_DEFAULT;
236 		bcopy(mrp, &flent->fe_effective_props,
237 		    sizeof (mac_resource_props_t));
238 	}
239 	flow_stat_create(flent);
240 
241 	*flentp = flent;
242 	return (0);
243 }
244 
245 /*
246  * Validate flow entry and add it to a flow table.
247  */
248 int
249 mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
250 {
251 	flow_entry_t	**headp, **p;
252 	flow_ops_t	*ops = &ft->ft_ops;
253 	flow_mask_t	mask;
254 	uint32_t	index;
255 	int		err;
256 
257 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
258 
259 	/*
260 	 * Check for invalid bits in mask.
261 	 */
262 	mask = flent->fe_flow_desc.fd_mask;
263 	if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
264 		return (EOPNOTSUPP);
265 
266 	/*
267 	 * Validate flent.
268 	 */
269 	if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
270 		DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
271 		    flow_entry_t *, flent, int, err);
272 		return (err);
273 	}
274 
275 	/*
276 	 * Flent is valid. now calculate hash and insert it
277 	 * into hash table.
278 	 */
279 	index = ops->fo_hash_fe(ft, flent);
280 
281 	/*
282 	 * We do not need a lock up until now because we were
283 	 * not accessing the flow table.
284 	 */
285 	rw_enter(&ft->ft_lock, RW_WRITER);
286 	headp = &ft->ft_table[index];
287 
288 	/*
289 	 * Check for duplicate flow.
290 	 */
291 	for (p = headp; *p != NULL; p = &(*p)->fe_next) {
292 		if ((*p)->fe_flow_desc.fd_mask !=
293 		    flent->fe_flow_desc.fd_mask)
294 			continue;
295 
296 		if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
297 			rw_exit(&ft->ft_lock);
298 			DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
299 			    flow_entry_t *, flent, int, err);
300 			return (EALREADY);
301 		}
302 	}
303 
304 	/*
305 	 * Insert flow to hash list.
306 	 */
307 	err = ops->fo_insert_fe(ft, headp, flent);
308 	if (err != 0) {
309 		rw_exit(&ft->ft_lock);
310 		DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
311 		    flow_entry_t *, flent, int, err);
312 		return (err);
313 	}
314 
315 	/*
316 	 * Save the hash index so it can be used by mac_flow_remove().
317 	 */
318 	flent->fe_index = (int)index;
319 
320 	/*
321 	 * Save the flow tab back reference.
322 	 */
323 	flent->fe_flow_tab = ft;
324 	FLOW_MARK(flent, FE_FLOW_TAB);
325 	ft->ft_flow_count++;
326 	rw_exit(&ft->ft_lock);
327 	return (0);
328 }
329 
330 /*
331  * Remove a flow from a mac client's subflow table
332  */
333 void
334 mac_flow_rem_subflow(flow_entry_t *flent)
335 {
336 	flow_tab_t		*ft = flent->fe_flow_tab;
337 	mac_client_impl_t	*mcip = ft->ft_mcip;
338 
339 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
340 
341 	mac_flow_remove(ft, flent, B_FALSE);
342 	if (flent->fe_mcip == NULL) {
343 		/*
344 		 * The interface is not yet plumbed and mac_client_flow_add
345 		 * was not done.
346 		 */
347 		if (FLOW_TAB_EMPTY(ft)) {
348 			mac_flow_tab_destroy(ft);
349 			mcip->mci_subflow_tab = NULL;
350 		}
351 		return;
352 	}
353 	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
354 	mac_link_flow_clean((mac_client_handle_t)mcip, flent);
355 }
356 
357 /*
358  * Add a flow to a mac client's subflow table and instantiate the flow
359  * in the mac by creating the associated SRSs etc.
360  */
361 int
362 mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
363     boolean_t instantiate_flow)
364 {
365 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
366 	flow_tab_info_t		*ftinfo;
367 	flow_mask_t		mask;
368 	flow_tab_t		*ft;
369 	int			err;
370 	boolean_t		ft_created = B_FALSE;
371 
372 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
373 
374 	/*
375 	 * If the subflow table exists already just add the new subflow
376 	 * to the existing table, else we create a new subflow table below.
377 	 */
378 	ft = mcip->mci_subflow_tab;
379 	if (ft == NULL) {
380 		mask = flent->fe_flow_desc.fd_mask;
381 		/*
382 		 * Try to create a new table and then add the subflow to the
383 		 * newly created subflow table
384 		 */
385 		if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL)
386 			return (EOPNOTSUPP);
387 
388 		mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
389 		    mcip->mci_mip, &ft);
390 		ft_created = B_TRUE;
391 	}
392 
393 	err = mac_flow_add(ft, flent);
394 	if (err != 0) {
395 		if (ft_created)
396 			mac_flow_tab_destroy(ft);
397 		return (err);
398 	}
399 
400 	if (instantiate_flow) {
401 		/* Now activate the flow by creating its SRSs */
402 		ASSERT(MCIP_DATAPATH_SETUP(mcip));
403 		err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
404 		if (err != 0) {
405 			mac_flow_remove(ft, flent, B_FALSE);
406 			if (ft_created)
407 				mac_flow_tab_destroy(ft);
408 			return (err);
409 		}
410 	} else {
411 		FLOW_MARK(flent, FE_UF_NO_DATAPATH);
412 	}
413 	if (ft_created) {
414 		ASSERT(mcip->mci_subflow_tab == NULL);
415 		ft->ft_mcip = mcip;
416 		mcip->mci_subflow_tab = ft;
417 		if (instantiate_flow)
418 			mac_client_update_classifier(mcip, B_TRUE);
419 	}
420 	return (0);
421 }
422 
423 /*
424  * Remove flow entry from flow table.
425  */
426 void
427 mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
428 {
429 	flow_entry_t	**fp;
430 
431 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
432 	if (!(flent->fe_flags & FE_FLOW_TAB))
433 		return;
434 
435 	rw_enter(&ft->ft_lock, RW_WRITER);
436 	/*
437 	 * If this is a permanent removal from the flow table, mark it
438 	 * CONDEMNED to prevent future references. If this is a temporary
439 	 * removal from the table, say to update the flow descriptor then
440 	 * we don't mark it CONDEMNED
441 	 */
442 	if (!temp)
443 		FLOW_MARK(flent, FE_CONDEMNED);
444 	/*
445 	 * Locate the specified flent.
446 	 */
447 	fp = &ft->ft_table[flent->fe_index];
448 	while (*fp != flent)
449 		fp = &(*fp)->fe_next;
450 
451 	/*
452 	 * The flent must exist. Otherwise it's a bug.
453 	 */
454 	ASSERT(fp != NULL);
455 	*fp = flent->fe_next;
456 	flent->fe_next = NULL;
457 
458 	/*
459 	 * Reset fe_index to -1 so any attempt to call mac_flow_remove()
460 	 * on a flent that is supposed to be in the table (FE_FLOW_TAB)
461 	 * will panic.
462 	 */
463 	flent->fe_index = -1;
464 	FLOW_UNMARK(flent, FE_FLOW_TAB);
465 	ft->ft_flow_count--;
466 	rw_exit(&ft->ft_lock);
467 }
468 
469 /*
470  * This is the flow lookup routine used by the mac sw classifier engine.
471  */
472 int
473 mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
474 {
475 	flow_state_t	s;
476 	flow_entry_t	*flent;
477 	flow_ops_t	*ops = &ft->ft_ops;
478 	boolean_t	retried = B_FALSE;
479 	int		i, err;
480 
481 	s.fs_flags = flags;
482 retry:
483 	s.fs_mp = mp;
484 
485 	/*
486 	 * Walk the list of predeclared accept functions.
487 	 * Each of these would accumulate enough state to allow the next
488 	 * accept routine to make progress.
489 	 */
490 	for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
491 		if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
492 			mblk_t	*last;
493 
494 			/*
495 			 * ENOBUFS indicates that the mp could be too short
496 			 * and may need a pullup.
497 			 */
498 			if (err != ENOBUFS || retried)
499 				return (err);
500 
501 			/*
502 			 * The pullup is done on the last processed mblk, not
503 			 * the starting one. pullup is not done if the mblk
504 			 * has references or if b_cont is NULL.
505 			 */
506 			last = s.fs_mp;
507 			if (DB_REF(last) > 1 || last->b_cont == NULL ||
508 			    pullupmsg(last, -1) == 0)
509 				return (EINVAL);
510 
511 			retried = B_TRUE;
512 			DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
513 			    flow_state_t *, &s);
514 			goto retry;
515 		}
516 	}
517 
518 	/*
519 	 * The packet is considered sane. We may now attempt to
520 	 * find the corresponding flent.
521 	 */
522 	rw_enter(&ft->ft_lock, RW_READER);
523 	flent = ft->ft_table[ops->fo_hash(ft, &s)];
524 	for (; flent != NULL; flent = flent->fe_next) {
525 		if (flent->fe_match(ft, flent, &s)) {
526 			FLOW_TRY_REFHOLD(flent, err);
527 			if (err != 0)
528 				continue;
529 			*flentp = flent;
530 			rw_exit(&ft->ft_lock);
531 			return (0);
532 		}
533 	}
534 	rw_exit(&ft->ft_lock);
535 	return (ENOENT);
536 }
537 
538 /*
539  * Walk flow table.
540  * The caller is assumed to have proper perimeter protection.
541  */
542 int
543 mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
544     void *arg)
545 {
546 	int		err, i, cnt = 0;
547 	flow_entry_t	*flent;
548 
549 	if (ft == NULL)
550 		return (0);
551 
552 	for (i = 0; i < ft->ft_size; i++) {
553 		for (flent = ft->ft_table[i]; flent != NULL;
554 		    flent = flent->fe_next) {
555 			cnt++;
556 			err = (*fn)(flent, arg);
557 			if (err != 0)
558 				return (err);
559 		}
560 	}
561 	VERIFY(cnt == ft->ft_flow_count);
562 	return (0);
563 }
564 
565 /*
566  * Same as the above except a mutex is used for protection here.
567  */
568 int
569 mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
570     void *arg)
571 {
572 	int		err;
573 
574 	if (ft == NULL)
575 		return (0);
576 
577 	rw_enter(&ft->ft_lock, RW_WRITER);
578 	err = mac_flow_walk_nolock(ft, fn, arg);
579 	rw_exit(&ft->ft_lock);
580 	return (err);
581 }
582 
583 static boolean_t	mac_flow_clean(flow_entry_t *);
584 
585 /*
586  * Destroy a flow entry. Called when the last reference on a flow is released.
587  */
588 void
589 mac_flow_destroy(flow_entry_t *flent)
590 {
591 	ASSERT(flent->fe_refcnt == 0);
592 
593 	if ((flent->fe_type & FLOW_USER) != 0) {
594 		ASSERT(mac_flow_clean(flent));
595 	} else {
596 		mac_flow_cleanup(flent);
597 	}
598 
599 	mutex_destroy(&flent->fe_lock);
600 	cv_destroy(&flent->fe_cv);
601 	flow_stat_destroy(flent);
602 	kmem_cache_free(flow_cache, flent);
603 }
604 
605 /*
606  * XXX eric
607  * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
608  * mac_link_flow_modify() should really be moved/reworked into the
609  * two functions below. This would consolidate all the mac property
610  * checking in one place. I'm leaving this alone for now since it's
611  * out of scope of the new flows work.
612  */
613 /* ARGSUSED */
614 uint32_t
615 mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
616 {
617 	uint32_t		changed_mask = 0;
618 	mac_resource_props_t	*fmrp = &flent->fe_effective_props;
619 	int			i;
620 
621 	if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
622 	    (fmrp->mrp_maxbw != mrp->mrp_maxbw)) {
623 		changed_mask |= MRP_MAXBW;
624 		fmrp->mrp_maxbw = mrp->mrp_maxbw;
625 		if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
626 			fmrp->mrp_mask &= ~MRP_MAXBW;
627 		} else {
628 			fmrp->mrp_mask |= MRP_MAXBW;
629 		}
630 	}
631 
632 	if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
633 		if (fmrp->mrp_priority != mrp->mrp_priority)
634 			changed_mask |= MRP_PRIORITY;
635 		if (mrp->mrp_priority == MPL_RESET) {
636 			fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
637 			fmrp->mrp_mask &= ~MRP_PRIORITY;
638 		} else {
639 			fmrp->mrp_priority = mrp->mrp_priority;
640 			fmrp->mrp_mask |= MRP_PRIORITY;
641 		}
642 	}
643 
644 	/* modify fanout */
645 	if ((mrp->mrp_mask & MRP_CPUS) != 0) {
646 		if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
647 		    (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
648 			for (i = 0; i < mrp->mrp_ncpus; i++) {
649 				if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
650 					break;
651 			}
652 			if (i == mrp->mrp_ncpus) {
653 				/*
654 				 * The new set of cpus passed is exactly
655 				 * the same as the existing set.
656 				 */
657 				return (changed_mask);
658 			}
659 		}
660 		changed_mask |= MRP_CPUS;
661 		MAC_COPY_CPUS(mrp, fmrp);
662 	}
663 	return (changed_mask);
664 }
665 
666 void
667 mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
668 {
669 	uint32_t changed_mask;
670 	mac_client_impl_t *mcip = flent->fe_mcip;
671 	mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
672 
673 	ASSERT(flent != NULL);
674 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
675 
676 	rw_enter(&ft->ft_lock, RW_WRITER);
677 
678 	/* Update the cached values inside the subflow entry */
679 	changed_mask = mac_flow_modify_props(flent, mrp);
680 	rw_exit(&ft->ft_lock);
681 	/*
682 	 * Push the changed parameters to the scheduling code in the
683 	 * SRS's, to take effect right away.
684 	 */
685 	if (changed_mask & MRP_MAXBW) {
686 		mac_srs_update_bwlimit(flent, mrp);
687 		/*
688 		 * If bandwidth is changed, we may have to change
689 		 * the number of soft ring to be used for fanout.
690 		 * Call mac_flow_update_fanout() if MAC_BIND_CPU
691 		 * is not set and there is no user supplied cpu
692 		 * info. This applies only to link at this time.
693 		 */
694 		if (!(flent->fe_type & FLOW_USER) &&
695 		    !(changed_mask & MRP_CPUS) &&
696 		    !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
697 			mac_fanout_setup(mcip, flent, mcip_mrp,
698 			    mac_rx_deliver, mcip, NULL);
699 		}
700 	}
701 	if (mrp->mrp_mask & MRP_PRIORITY)
702 		mac_flow_update_priority(mcip, flent);
703 
704 	if (changed_mask & MRP_CPUS)
705 		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL);
706 }
707 
708 /*
709  * This function waits for a certain condition to be met and is generally
710  * used before a destructive or quiescing operation.
711  */
712 void
713 mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
714 {
715 	mutex_enter(&flent->fe_lock);
716 	flent->fe_flags |= FE_WAITER;
717 
718 	switch (event) {
719 	case FLOW_DRIVER_UPCALL:
720 		/*
721 		 * We want to make sure the driver upcalls have finished before
722 		 * we signal the Rx SRS worker to quit.
723 		 */
724 		while (flent->fe_refcnt != 1)
725 			cv_wait(&flent->fe_cv, &flent->fe_lock);
726 		break;
727 
728 	case FLOW_USER_REF:
729 		/*
730 		 * Wait for the fe_user_refcnt to drop to 0. The flow has
731 		 * been removed from the global flow hash.
732 		 */
733 		ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH));
734 		while (flent->fe_user_refcnt != 0)
735 			cv_wait(&flent->fe_cv, &flent->fe_lock);
736 		break;
737 
738 	default:
739 		ASSERT(0);
740 	}
741 
742 	flent->fe_flags &= ~FE_WAITER;
743 	mutex_exit(&flent->fe_lock);
744 }
745 
746 static boolean_t
747 mac_flow_clean(flow_entry_t *flent)
748 {
749 	ASSERT(flent->fe_next == NULL);
750 	ASSERT(flent->fe_tx_srs == NULL);
751 	ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL);
752 	ASSERT(flent->fe_mbg == NULL);
753 
754 	return (B_TRUE);
755 }
756 
757 void
758 mac_flow_cleanup(flow_entry_t *flent)
759 {
760 	if ((flent->fe_type & FLOW_USER) == 0) {
761 		ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) ||
762 		    (flent->fe_mbg != NULL && flent->fe_mcip == NULL));
763 		ASSERT(flent->fe_refcnt == 0);
764 	} else {
765 		ASSERT(flent->fe_refcnt == 1);
766 	}
767 
768 	if (flent->fe_mbg != NULL) {
769 		ASSERT(flent->fe_tx_srs == NULL);
770 		/* This is a multicast or broadcast flow entry */
771 		mac_bcast_grp_free(flent->fe_mbg);
772 		flent->fe_mbg = NULL;
773 	}
774 
775 	if (flent->fe_tx_srs != NULL) {
776 		ASSERT(flent->fe_mbg == NULL);
777 		mac_srs_free(flent->fe_tx_srs);
778 		flent->fe_tx_srs = NULL;
779 	}
780 
781 	/*
782 	 * In the normal case fe_rx_srs_cnt is 1. However in the error case
783 	 * when mac_unicast_add fails we may not have set up any SRS
784 	 * in which case fe_rx_srs_cnt will be zero.
785 	 */
786 	if (flent->fe_rx_srs_cnt != 0) {
787 		ASSERT(flent->fe_rx_srs_cnt == 1);
788 		mac_srs_free(flent->fe_rx_srs[0]);
789 		flent->fe_rx_srs[0] = NULL;
790 		flent->fe_rx_srs_cnt = 0;
791 	}
792 	ASSERT(flent->fe_rx_srs[0] == NULL);
793 }
794 
795 void
796 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
797 {
798 	/*
799 	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
800 	 * Updates to the fe_flow_desc happen under the fe_lock
801 	 * after removing the flent from the flow table
802 	 */
803 	mutex_enter(&flent->fe_lock);
804 	bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
805 	mutex_exit(&flent->fe_lock);
806 }
807 
808 /*
809  * Update a field of a flow entry. The mac perimeter ensures that
810  * this is the only thread doing a modify operation on this mac end point.
811  * So the flow table can't change or disappear. The ft_lock protects access
812  * to the flow entry, and holding the lock ensures that there isn't any thread
813  * accessing the flow entry or attempting a flow table lookup. However
814  * data threads that are using the flow entry based on the old descriptor
815  * will continue to use the flow entry. If strong coherence is required
816  * then the flow will have to be quiesced before the descriptor can be
817  * changed.
818  */
819 void
820 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
821 {
822 	flow_tab_t	*ft = flent->fe_flow_tab;
823 	flow_desc_t	old_desc;
824 	int		err;
825 
826 	if (ft == NULL) {
827 		/*
828 		 * The flow hasn't yet been inserted into the table,
829 		 * so only the caller knows about this flow, however for
830 		 * uniformity we grab the fe_lock here.
831 		 */
832 		mutex_enter(&flent->fe_lock);
833 		bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
834 		mutex_exit(&flent->fe_lock);
835 	}
836 
837 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
838 
839 	/*
840 	 * Need to remove the flow entry from the table and reinsert it,
841 	 * into a potentially diference hash line. The hash depends on
842 	 * the new descriptor fields. However access to fe_desc itself
843 	 * is always under the fe_lock. This helps log and stat functions
844 	 * see a self-consistent fe_flow_desc.
845 	 */
846 	mac_flow_remove(ft, flent, B_TRUE);
847 	old_desc = flent->fe_flow_desc;
848 
849 	mutex_enter(&flent->fe_lock);
850 	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
851 	mutex_exit(&flent->fe_lock);
852 
853 	if (mac_flow_add(ft, flent) != 0) {
854 		/*
855 		 * The add failed say due to an invalid flow descriptor.
856 		 * Undo the update
857 		 */
858 		flent->fe_flow_desc = old_desc;
859 		err = mac_flow_add(ft, flent);
860 		ASSERT(err == 0);
861 	}
862 }
863 
864 void
865 mac_flow_set_name(flow_entry_t *flent, const char *name)
866 {
867 	flow_tab_t	*ft = flent->fe_flow_tab;
868 
869 	if (ft == NULL) {
870 		/*
871 		 *  The flow hasn't yet been inserted into the table,
872 		 * so only the caller knows about this flow
873 		 */
874 		(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
875 	} else {
876 		ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
877 	}
878 
879 	mutex_enter(&flent->fe_lock);
880 	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
881 	mutex_exit(&flent->fe_lock);
882 }
883 
884 /*
885  * Return the client-private cookie that was associated with
886  * the flow when it was created.
887  */
888 void *
889 mac_flow_get_client_cookie(flow_entry_t *flent)
890 {
891 	return (flent->fe_client_cookie);
892 }
893 
894 /*
895  * Forward declarations.
896  */
897 static uint32_t	flow_l2_hash(flow_tab_t *, flow_state_t *);
898 static int	flow_l2_accept(flow_tab_t *, flow_state_t *);
899 static uint32_t	flow_ether_hash(flow_tab_t *, flow_state_t *);
900 static int	flow_ether_accept(flow_tab_t *, flow_state_t *);
901 
902 /*
903  * Create flow table.
904  */
905 void
906 mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
907     mac_impl_t *mip, flow_tab_t **ftp)
908 {
909 	flow_tab_t	*ft;
910 	flow_ops_t	*new_ops;
911 
912 	ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
913 	bzero(ft, sizeof (*ft));
914 
915 	ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);
916 
917 	/*
918 	 * We make a copy of the ops vector instead of just pointing to it
919 	 * because we might want to customize the ops vector on a per table
920 	 * basis (e.g. for optimization).
921 	 */
922 	new_ops = &ft->ft_ops;
923 	bcopy(ops, new_ops, sizeof (*ops));
924 	ft->ft_mask = mask;
925 	ft->ft_size = size;
926 	ft->ft_mip = mip;
927 
928 	/*
929 	 * Optimization for DL_ETHER media.
930 	 */
931 	if (mip->mi_info.mi_nativemedia == DL_ETHER) {
932 		if (new_ops->fo_hash == flow_l2_hash)
933 			new_ops->fo_hash = flow_ether_hash;
934 
935 		if (new_ops->fo_accept[0] == flow_l2_accept)
936 			new_ops->fo_accept[0] = flow_ether_accept;
937 
938 	}
939 	*ftp = ft;
940 }
941 
942 void
943 mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
944 {
945 	mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
946 	    1024, mip, ftp);
947 }
948 
949 /*
950  * Destroy flow table.
951  */
952 void
953 mac_flow_tab_destroy(flow_tab_t *ft)
954 {
955 	if (ft == NULL)
956 		return;
957 
958 	ASSERT(ft->ft_flow_count == 0);
959 	kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
960 	bzero(ft, sizeof (*ft));
961 	kmem_cache_free(flow_tab_cache, ft);
962 }
963 
964 /*
965  * Add a new flow entry to the global flow hash table
966  */
967 int
968 mac_flow_hash_add(flow_entry_t *flent)
969 {
970 	int	err;
971 
972 	rw_enter(&flow_tab_lock, RW_WRITER);
973 	err = mod_hash_insert(flow_hash,
974 	    (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
975 	if (err != 0) {
976 		rw_exit(&flow_tab_lock);
977 		return (EEXIST);
978 	}
979 	/* Mark as inserted into the global flow hash table */
980 	FLOW_MARK(flent, FE_G_FLOW_HASH);
981 	rw_exit(&flow_tab_lock);
982 	return (err);
983 }
984 
985 /*
986  * Remove a flow entry from the global flow hash table
987  */
988 void
989 mac_flow_hash_remove(flow_entry_t *flent)
990 {
991 	mod_hash_val_t	val;
992 
993 	rw_enter(&flow_tab_lock, RW_WRITER);
994 	VERIFY(mod_hash_remove(flow_hash,
995 	    (mod_hash_key_t)flent->fe_flow_name, &val) == 0);
996 
997 	/* Clear the mark that says inserted into the global flow hash table */
998 	FLOW_UNMARK(flent, FE_G_FLOW_HASH);
999 	rw_exit(&flow_tab_lock);
1000 }
1001 
1002 /*
1003  * Retrieve a flow entry from the global flow hash table.
1004  */
1005 int
1006 mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
1007 {
1008 	int		err;
1009 	flow_entry_t	*flent;
1010 
1011 	rw_enter(&flow_tab_lock, RW_READER);
1012 	err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
1013 	    (mod_hash_val_t *)&flent);
1014 	if (err != 0) {
1015 		rw_exit(&flow_tab_lock);
1016 		return (ENOENT);
1017 	}
1018 	ASSERT(flent != NULL);
1019 	FLOW_USER_REFHOLD(flent);
1020 	rw_exit(&flow_tab_lock);
1021 
1022 	*flentp = flent;
1023 	return (0);
1024 }
1025 
1026 /*
1027  * Initialize or release mac client flows by walking the subflow table.
1028  * These are typically invoked during plumb/unplumb of links.
1029  */
1030 
1031 static int
1032 mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
1033 {
1034 	mac_client_impl_t	*mcip = arg;
1035 
1036 	if (mac_link_flow_init(arg, flent) != 0) {
1037 		cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
1038 		    flent->fe_flow_name, mcip->mci_name);
1039 	} else {
1040 		FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
1041 	}
1042 	return (0);
1043 }
1044 
1045 void
1046 mac_link_init_flows(mac_client_handle_t mch)
1047 {
1048 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1049 
1050 	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1051 	    mac_link_init_flows_cb, mcip);
1052 	/*
1053 	 * If mac client had subflow(s) configured before plumb, change
1054 	 * function to mac_rx_srs_subflow_process and in case of hardware
1055 	 * classification, disable polling.
1056 	 */
1057 	mac_client_update_classifier(mcip, B_TRUE);
1058 
1059 }
1060 
1061 boolean_t
1062 mac_link_has_flows(mac_client_handle_t mch)
1063 {
1064 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1065 
1066 	if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
1067 		return (B_TRUE);
1068 
1069 	return (B_FALSE);
1070 }
1071 
1072 static int
1073 mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
1074 {
1075 	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
1076 	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1077 	mac_link_flow_clean(arg, flent);
1078 	return (0);
1079 }
1080 
1081 void
1082 mac_link_release_flows(mac_client_handle_t mch)
1083 {
1084 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1085 
1086 	/*
1087 	 * Change the mci_flent callback back to mac_rx_srs_process()
1088 	 * because flows are about to be deactivated.
1089 	 */
1090 	mac_client_update_classifier(mcip, B_FALSE);
1091 	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1092 	    mac_link_release_flows_cb, mcip);
1093 }
1094 
1095 void
1096 mac_rename_flow(flow_entry_t *fep, const char *new_name)
1097 {
1098 	mac_flow_set_name(fep, new_name);
1099 	if (fep->fe_ksp != NULL) {
1100 		flow_stat_destroy(fep);
1101 		flow_stat_create(fep);
1102 	}
1103 }
1104 
1105 /*
1106  * mac_link_flow_init()
1107  * Internal flow interface used for allocating SRSs and related
1108  * data structures. Not meant to be used by mac clients.
1109  */
1110 int
1111 mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
1112 {
1113 	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
1114 	mac_impl_t		*mip = mcip->mci_mip;
1115 	int			err;
1116 
1117 	ASSERT(mch != NULL);
1118 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1119 
1120 	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
1121 		return (err);
1122 
1123 	sub_flow->fe_mcip = mcip;
1124 
1125 	return (0);
1126 }
1127 
1128 /*
1129  * mac_link_flow_add()
1130  * Used by flowadm(1m) or kernel mac clients for creating flows.
1131  */
1132 int
1133 mac_link_flow_add(datalink_id_t linkid, char *flow_name,
1134     flow_desc_t *flow_desc, mac_resource_props_t *mrp)
1135 {
1136 	flow_entry_t		*flent = NULL;
1137 	int			err;
1138 	dls_dl_handle_t		dlh;
1139 	dls_link_t		*dlp;
1140 	boolean_t		link_held = B_FALSE;
1141 	boolean_t		hash_added = B_FALSE;
1142 	mac_perim_handle_t	mph;
1143 
1144 	err = mac_flow_lookup_byname(flow_name, &flent);
1145 	if (err == 0) {
1146 		FLOW_USER_REFRELE(flent);
1147 		return (EEXIST);
1148 	}
1149 
1150 	/*
1151 	 * First create a flow entry given the description provided
1152 	 * by the caller.
1153 	 */
1154 	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
1155 	    FLOW_USER | FLOW_OTHER, &flent);
1156 
1157 	if (err != 0)
1158 		return (err);
1159 
1160 	/*
1161 	 * We've got a local variable referencing this flow now, so we need
1162 	 * to hold it. We'll release this flow before returning.
1163 	 * All failures until we return will undo any action that may internally
1164 	 * held the flow, so the last REFRELE will assure a clean freeing
1165 	 * of resources.
1166 	 */
1167 	FLOW_REFHOLD(flent);
1168 
1169 	flent->fe_link_id = linkid;
1170 	FLOW_MARK(flent, FE_INCIPIENT);
1171 
1172 	err = mac_perim_enter_by_linkid(linkid, &mph);
1173 	if (err != 0) {
1174 		FLOW_FINAL_REFRELE(flent);
1175 		return (err);
1176 	}
1177 
1178 	/*
1179 	 * dls will eventually be merged with mac so it's ok
1180 	 * to call dls' internal functions.
1181 	 */
1182 	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1183 	if (err != 0)
1184 		goto bail;
1185 
1186 	link_held = B_TRUE;
1187 
1188 	/*
1189 	 * Add the flow to the global flow table, this table will be per
1190 	 * exclusive zone so each zone can have its own flow namespace.
1191 	 * RFE 6625651 will fix this.
1192 	 *
1193 	 */
1194 	if ((err = mac_flow_hash_add(flent)) != 0)
1195 		goto bail;
1196 
1197 	hash_added = B_TRUE;
1198 
1199 	/*
1200 	 * do not allow flows to be configured on an anchor VNIC
1201 	 */
1202 	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
1203 		err = ENOTSUP;
1204 		goto bail;
1205 	}
1206 
1207 	/*
1208 	 * Save the zoneid of the underlying link in the flow entry,
1209 	 * this is needed to prevent non-global zone from getting
1210 	 * statistics information of global zone.
1211 	 */
1212 	flent->fe_zoneid = dlp->dl_zid;
1213 
1214 	/*
1215 	 * Add the subflow to the subflow table. Also instantiate the flow
1216 	 * in the mac if there is an active user (we check if the MAC client's
1217 	 * datapath has been setup).
1218 	 */
1219 	err = mac_flow_add_subflow(dlp->dl_mch, flent,
1220 	    MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
1221 	if (err != 0)
1222 		goto bail;
1223 
1224 	FLOW_UNMARK(flent, FE_INCIPIENT);
1225 	dls_devnet_rele_link(dlh, dlp);
1226 	mac_perim_exit(mph);
1227 	return (0);
1228 
1229 bail:
1230 	if (hash_added)
1231 		mac_flow_hash_remove(flent);
1232 
1233 	if (link_held)
1234 		dls_devnet_rele_link(dlh, dlp);
1235 
1236 	/*
1237 	 * Wait for any transient global flow hash refs to clear
1238 	 * and then release the creation reference on the flow
1239 	 */
1240 	mac_flow_wait(flent, FLOW_USER_REF);
1241 	FLOW_FINAL_REFRELE(flent);
1242 	mac_perim_exit(mph);
1243 	return (err);
1244 }
1245 
1246 /*
1247  * mac_link_flow_clean()
1248  * Internal flow interface used for freeing SRSs and related
1249  * data structures. Not meant to be used by mac clients.
1250  */
1251 void
1252 mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
1253 {
1254 	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
1255 	mac_impl_t		*mip = mcip->mci_mip;
1256 	boolean_t		last_subflow;
1257 
1258 	ASSERT(mch != NULL);
1259 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1260 
1261 	/*
1262 	 * This sub flow entry may fail to be fully initialized by
1263 	 * mac_link_flow_init(). If so, simply return.
1264 	 */
1265 	if (sub_flow->fe_mcip == NULL)
1266 		return;
1267 
1268 	last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
1269 	/*
1270 	 * Tear down the data path
1271 	 */
1272 	mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
1273 	sub_flow->fe_mcip = NULL;
1274 
1275 	/*
1276 	 * Delete the SRSs associated with this subflow. If this is being
1277 	 * driven by flowadm(1M) then the subflow will be deleted by
1278 	 * dls_rem_flow. However if this is a result of the interface being
1279 	 * unplumbed then the subflow itself won't be deleted.
1280 	 */
1281 	mac_flow_cleanup(sub_flow);
1282 
1283 	/*
1284 	 * If all the subflows are gone, renable some of the stuff
1285 	 * we disabled when adding a subflow, polling etc.
1286 	 */
1287 	if (last_subflow) {
1288 		/*
1289 		 * The subflow table itself is not protected by any locks or
1290 		 * refcnts. Hence quiesce the client upfront before clearing
1291 		 * mci_subflow_tab.
1292 		 */
1293 		mac_client_quiesce(mcip);
1294 		mac_client_update_classifier(mcip, B_FALSE);
1295 		mac_flow_tab_destroy(mcip->mci_subflow_tab);
1296 		mcip->mci_subflow_tab = NULL;
1297 		mac_client_restart(mcip);
1298 	}
1299 }
1300 
1301 /*
1302  * mac_link_flow_remove()
1303  * Used by flowadm(1m) or kernel mac clients for removing flows.
1304  */
1305 int
1306 mac_link_flow_remove(char *flow_name)
1307 {
1308 	flow_entry_t		*flent;
1309 	mac_perim_handle_t	mph;
1310 	int			err;
1311 	datalink_id_t		linkid;
1312 
1313 	err = mac_flow_lookup_byname(flow_name, &flent);
1314 	if (err != 0)
1315 		return (err);
1316 
1317 	linkid = flent->fe_link_id;
1318 	FLOW_USER_REFRELE(flent);
1319 
1320 	/*
1321 	 * The perim must be acquired before acquiring any other references
1322 	 * to maintain the lock and perimeter hierarchy. Please note the
1323 	 * FLOW_REFRELE above.
1324 	 */
1325 	err = mac_perim_enter_by_linkid(linkid, &mph);
1326 	if (err != 0)
1327 		return (err);
1328 
1329 	/*
1330 	 * Note the second lookup of the flow, because a concurrent thread
1331 	 * may have removed it already while we were waiting to enter the
1332 	 * link's perimeter.
1333 	 */
1334 	err = mac_flow_lookup_byname(flow_name, &flent);
1335 	if (err != 0) {
1336 		mac_perim_exit(mph);
1337 		return (err);
1338 	}
1339 	FLOW_USER_REFRELE(flent);
1340 
1341 	/*
1342 	 * Remove the flow from the subflow table and deactivate the flow
1343 	 * by quiescing and removings its SRSs
1344 	 */
1345 	mac_flow_rem_subflow(flent);
1346 
1347 	/*
1348 	 * Finally, remove the flow from the global table.
1349 	 */
1350 	mac_flow_hash_remove(flent);
1351 
1352 	/*
1353 	 * Wait for any transient global flow hash refs to clear
1354 	 * and then release the creation reference on the flow
1355 	 */
1356 	mac_flow_wait(flent, FLOW_USER_REF);
1357 	FLOW_FINAL_REFRELE(flent);
1358 
1359 	mac_perim_exit(mph);
1360 
1361 	return (0);
1362 }
1363 
1364 /*
1365  * mac_link_flow_modify()
1366  * Modifies the properties of a flow identified by its name.
1367  */
1368 int
1369 mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
1370 {
1371 	flow_entry_t		*flent;
1372 	mac_client_impl_t 	*mcip;
1373 	int			err = 0;
1374 	mac_perim_handle_t	mph;
1375 	datalink_id_t		linkid;
1376 	flow_tab_t		*flow_tab;
1377 
1378 	err = mac_validate_props(mrp);
1379 	if (err != 0)
1380 		return (err);
1381 
1382 	err = mac_flow_lookup_byname(flow_name, &flent);
1383 	if (err != 0)
1384 		return (err);
1385 
1386 	linkid = flent->fe_link_id;
1387 	FLOW_USER_REFRELE(flent);
1388 
1389 	/*
1390 	 * The perim must be acquired before acquiring any other references
1391 	 * to maintain the lock and perimeter hierarchy. Please note the
1392 	 * FLOW_REFRELE above.
1393 	 */
1394 	err = mac_perim_enter_by_linkid(linkid, &mph);
1395 	if (err != 0)
1396 		return (err);
1397 
1398 	/*
1399 	 * Note the second lookup of the flow, because a concurrent thread
1400 	 * may have removed it already while we were waiting to enter the
1401 	 * link's perimeter.
1402 	 */
1403 	err = mac_flow_lookup_byname(flow_name, &flent);
1404 	if (err != 0) {
1405 		mac_perim_exit(mph);
1406 		return (err);
1407 	}
1408 	FLOW_USER_REFRELE(flent);
1409 
1410 	/*
1411 	 * If this flow is attached to a MAC client, then pass the request
1412 	 * along to the client.
1413 	 * Otherwise, just update the cached values.
1414 	 */
1415 	mcip = flent->fe_mcip;
1416 	mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
1417 	if (mcip != NULL) {
1418 		if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
1419 			err = ENOENT;
1420 		} else {
1421 			mac_flow_modify(flow_tab, flent, mrp);
1422 		}
1423 	} else {
1424 		(void) mac_flow_modify_props(flent, mrp);
1425 	}
1426 
1427 done:
1428 	mac_perim_exit(mph);
1429 	return (err);
1430 }
1431 
1432 
1433 /*
1434  * State structure and misc functions used by mac_link_flow_walk().
1435  */
1436 typedef struct {
1437 	int	(*ws_func)(mac_flowinfo_t *, void *);
1438 	void	*ws_arg;
1439 } flow_walk_state_t;
1440 
1441 static void
1442 mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent)
1443 {
1444 	(void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name,
1445 	    MAXFLOWNAMELEN);
1446 	finfop->fi_link_id = flent->fe_link_id;
1447 	finfop->fi_flow_desc = flent->fe_flow_desc;
1448 	finfop->fi_resource_props = flent->fe_resource_props;
1449 }
1450 
1451 static int
1452 mac_link_flow_walk_cb(flow_entry_t *flent, void *arg)
1453 {
1454 	flow_walk_state_t	*statep = arg;
1455 	mac_flowinfo_t		finfo;
1456 
1457 	mac_link_flowinfo_copy(&finfo, flent);
1458 	return (statep->ws_func(&finfo, statep->ws_arg));
1459 }
1460 
1461 /*
1462  * mac_link_flow_walk()
1463  * Invokes callback 'func' for all flows belonging to the specified link.
1464  */
1465 int
1466 mac_link_flow_walk(datalink_id_t linkid,
1467     int (*func)(mac_flowinfo_t *, void *), void *arg)
1468 {
1469 	mac_client_impl_t	*mcip;
1470 	mac_perim_handle_t	mph;
1471 	flow_walk_state_t	state;
1472 	dls_dl_handle_t		dlh;
1473 	dls_link_t		*dlp;
1474 	int			err;
1475 
1476 	err = mac_perim_enter_by_linkid(linkid, &mph);
1477 	if (err != 0)
1478 		return (err);
1479 
1480 	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1481 	if (err != 0) {
1482 		mac_perim_exit(mph);
1483 		return (err);
1484 	}
1485 
1486 	mcip = (mac_client_impl_t *)dlp->dl_mch;
1487 	state.ws_func = func;
1488 	state.ws_arg = arg;
1489 
1490 	err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
1491 	    mac_link_flow_walk_cb, &state);
1492 
1493 	dls_devnet_rele_link(dlh, dlp);
1494 	mac_perim_exit(mph);
1495 	return (err);
1496 }
1497 
1498 /*
1499  * mac_link_flow_info()
1500  * Retrieves information about a specific flow.
1501  */
1502 int
1503 mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
1504 {
1505 	flow_entry_t	*flent;
1506 	int		err;
1507 
1508 	err = mac_flow_lookup_byname(flow_name, &flent);
1509 	if (err != 0)
1510 		return (err);
1511 
1512 	mac_link_flowinfo_copy(finfo, flent);
1513 	FLOW_USER_REFRELE(flent);
1514 	return (0);
1515 }
1516 
/*
 * Hash of a MAC address and VLAN id: the sum of address bytes 3, 4 and 5
 * is XORed with the vid and reduced modulo the table size.
 */
#define	HASH_MAC_VID(addr, vid, size) \
	((((uint32_t)(addr)[3] + (addr)[4] + (addr)[5]) ^ (vid)) % (size))

/*
 * True when the write pointer of the current mblk lies before 'limit'.
 */
#define	PKT_TOO_SMALL(st, limit) ((st)->fs_mp->b_wptr < (limit))
1521 
/*
 * If 'start' sits exactly at the write pointer of the current mblk, step
 * to the next mblk in the b_cont chain and point 'start' at its read
 * pointer. When there is no next mblk the ENCLOSING FUNCTION returns
 * EINVAL. 'start' must be an lvalue; both arguments are evaluated more
 * than once.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (e.g. as the unbraced body of an if/else).
 */
#define	CHECK_AND_ADJUST_START_PTR(s, start) do {		\
	if ((s)->fs_mp->b_wptr == (start)) {			\
		mblk_t	*cas_next = (s)->fs_mp->b_cont;		\
		if (cas_next == NULL)				\
			return (EINVAL);			\
								\
		(s)->fs_mp = cas_next;				\
		(start) = cas_next->b_rptr;			\
	}							\
} while (0)
1532 
1533 /* ARGSUSED */
1534 static boolean_t
1535 flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1536 {
1537 	flow_l2info_t		*l2 = &s->fs_l2info;
1538 	flow_desc_t		*fd = &flent->fe_flow_desc;
1539 
1540 	return (l2->l2_vid == fd->fd_vid &&
1541 	    bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
1542 }
1543 
1544 /*
1545  * Layer 2 hash function.
1546  * Must be paired with flow_l2_accept() within a set of flow_ops
1547  * because it assumes the dest address is already extracted.
1548  */
1549 static uint32_t
1550 flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
1551 {
1552 	flow_l2info_t		*l2 = &s->fs_l2info;
1553 
1554 	return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
1555 }
1556 
1557 /*
1558  * This is the generic layer 2 accept function.
1559  * It makes use of mac_header_info() to extract the header length,
1560  * sap, vlan ID and destination address.
1561  */
1562 static int
1563 flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
1564 {
1565 	boolean_t		is_ether;
1566 	flow_l2info_t		*l2 = &s->fs_l2info;
1567 	mac_header_info_t	mhi;
1568 	int			err;
1569 
1570 	is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
1571 	if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
1572 	    s->fs_mp, &mhi)) != 0) {
1573 		if (err == EINVAL)
1574 			err = ENOBUFS;
1575 
1576 		return (err);
1577 	}
1578 
1579 	l2->l2_start = s->fs_mp->b_rptr;
1580 	l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;
1581 
1582 	if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
1583 	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1584 		struct ether_vlan_header	*evhp =
1585 		    (struct ether_vlan_header *)l2->l2_start;
1586 
1587 		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1588 			return (ENOBUFS);
1589 
1590 		l2->l2_sap = ntohs(evhp->ether_type);
1591 		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1592 		l2->l2_hdrsize = sizeof (*evhp);
1593 	} else {
1594 		l2->l2_sap = mhi.mhi_bindsap;
1595 		l2->l2_vid = 0;
1596 		l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
1597 	}
1598 	return (0);
1599 }
1600 
1601 /*
1602  * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
1603  * accept(). The notable difference is that dest address is now extracted
1604  * by hash() rather than by accept(). This saves a few memory references
1605  * for flow tables that do not care about mac addresses.
1606  */
1607 static uint32_t
1608 flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
1609 {
1610 	flow_l2info_t			*l2 = &s->fs_l2info;
1611 	struct ether_vlan_header	*evhp;
1612 
1613 	evhp = (struct ether_vlan_header *)l2->l2_start;
1614 	l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
1615 	return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
1616 }
1617 
1618 /* ARGSUSED */
1619 static int
1620 flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
1621 {
1622 	flow_l2info_t			*l2 = &s->fs_l2info;
1623 	struct ether_vlan_header	*evhp;
1624 	uint16_t			sap;
1625 
1626 	evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
1627 	l2->l2_start = (uchar_t *)evhp;
1628 
1629 	if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
1630 		return (ENOBUFS);
1631 
1632 	if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
1633 	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1634 		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1635 			return (ENOBUFS);
1636 
1637 		l2->l2_sap = ntohs(evhp->ether_type);
1638 		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1639 		l2->l2_hdrsize = sizeof (struct ether_vlan_header);
1640 	} else {
1641 		l2->l2_sap = sap;
1642 		l2->l2_vid = 0;
1643 		l2->l2_hdrsize = sizeof (struct ether_header);
1644 	}
1645 	return (0);
1646 }
1647 
1648 /*
1649  * Validates a layer 2 flow entry.
1650  */
1651 static int
1652 flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1653 {
1654 	int		i;
1655 	flow_desc_t	*fd = &flent->fe_flow_desc;
1656 
1657 	/*
1658 	 * Dest address is mandatory.
1659 	 */
1660 	if ((fd->fd_mask & FLOW_LINK_DST) == 0)
1661 		return (EINVAL);
1662 
1663 	for (i = 0; i < fd->fd_mac_len; i++) {
1664 		if (fd->fd_dst_mac[i] != 0)
1665 			break;
1666 	}
1667 	if (i == fd->fd_mac_len || fd->fd_mac_len < ETHERADDRL)
1668 		return (EINVAL);
1669 
1670 	if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
1671 		/*
1672 		 * VLAN flows are only supported over ethernet macs.
1673 		 */
1674 		if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
1675 			return (EINVAL);
1676 
1677 		if (fd->fd_vid == 0)
1678 			return (EINVAL);
1679 
1680 	}
1681 	flent->fe_match = flow_l2_match;
1682 	return (0);
1683 }
1684 
1685 /*
1686  * Calculates hash index of flow entry.
1687  */
1688 static uint32_t
1689 flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1690 {
1691 	flow_desc_t	*fd = &flent->fe_flow_desc;
1692 
1693 	ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
1694 	return (HASH_MAC_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
1695 }
1696 
1697 /*
1698  * This is used for duplicate flow checking.
1699  */
1700 /* ARGSUSED */
1701 static boolean_t
1702 flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
1703 {
1704 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
1705 
1706 	ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
1707 	return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
1708 	    fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
1709 }
1710 
1711 /*
1712  * Generic flow entry insertion function.
1713  * Used by flow tables that do not have ordering requirements.
1714  */
1715 /* ARGSUSED */
1716 static int
1717 flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
1718     flow_entry_t *flent)
1719 {
1720 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
1721 
1722 	if (*headp != NULL) {
1723 		ASSERT(flent->fe_next == NULL);
1724 		flent->fe_next = *headp;
1725 	}
1726 	*headp = flent;
1727 	return (0);
1728 }
1729 
1730 /*
1731  * IP version independent DSField matching function.
1732  */
1733 /* ARGSUSED */
1734 static boolean_t
1735 flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1736 {
1737 	flow_l3info_t	*l3info = &s->fs_l3info;
1738 	flow_desc_t	*fd = &flent->fe_flow_desc;
1739 
1740 	switch (l3info->l3_version) {
1741 	case IPV4_VERSION: {
1742 		ipha_t		*ipha = (ipha_t *)l3info->l3_start;
1743 
1744 		return ((ipha->ipha_type_of_service &
1745 		    fd->fd_dsfield_mask) == fd->fd_dsfield);
1746 	}
1747 	case IPV6_VERSION: {
1748 		ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
1749 
1750 		return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
1751 		    fd->fd_dsfield_mask) == fd->fd_dsfield);
1752 	}
1753 	default:
1754 		return (B_FALSE);
1755 	}
1756 }
1757 
1758 /*
1759  * IP v4 and v6 address matching.
1760  * The netmask only needs to be applied on the packet but not on the
1761  * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets.
1762  */
1763 
1764 /* ARGSUSED */
1765 static boolean_t
1766 flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1767 {
1768 	flow_l3info_t	*l3info = &s->fs_l3info;
1769 	flow_desc_t	*fd = &flent->fe_flow_desc;
1770 	ipha_t		*ipha = (ipha_t *)l3info->l3_start;
1771 	in_addr_t	addr;
1772 
1773 	addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
1774 	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1775 		return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
1776 		    V4_PART_OF_V6(fd->fd_local_addr));
1777 	}
1778 	return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
1779 	    V4_PART_OF_V6(fd->fd_remote_addr));
1780 }
1781 
1782 /* ARGSUSED */
1783 static boolean_t
1784 flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1785 {
1786 	flow_l3info_t	*l3info = &s->fs_l3info;
1787 	flow_desc_t	*fd = &flent->fe_flow_desc;
1788 	ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
1789 	in6_addr_t	*addrp;
1790 
1791 	addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
1792 	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1793 		return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
1794 		    fd->fd_local_addr));
1795 	}
1796 	return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
1797 }
1798 
1799 /* ARGSUSED */
1800 static boolean_t
1801 flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1802 {
1803 	flow_l3info_t	*l3info = &s->fs_l3info;
1804 	flow_desc_t	*fd = &flent->fe_flow_desc;
1805 
1806 	return (l3info->l3_protocol == fd->fd_protocol);
1807 }
1808 
1809 static uint32_t
1810 flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
1811 {
1812 	flow_l3info_t	*l3info = &s->fs_l3info;
1813 	flow_mask_t	mask = ft->ft_mask;
1814 
1815 	if ((mask & FLOW_IP_LOCAL) != 0) {
1816 		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
1817 	} else if ((mask & FLOW_IP_REMOTE) != 0) {
1818 		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
1819 	} else if ((mask & FLOW_IP_DSFIELD) != 0) {
1820 		/*
1821 		 * DSField flents are arranged as a single list.
1822 		 */
1823 		return (0);
1824 	}
1825 	/*
1826 	 * IP addr flents are hashed into two lists, v4 or v6.
1827 	 */
1828 	ASSERT(ft->ft_size >= 2);
1829 	return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
1830 }
1831 
1832 static uint32_t
1833 flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
1834 {
1835 	flow_l3info_t	*l3info = &s->fs_l3info;
1836 
1837 	return (l3info->l3_protocol % ft->ft_size);
1838 }
1839 
1840 /* ARGSUSED */
1841 static int
1842 flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
1843 {
1844 	flow_l2info_t	*l2info = &s->fs_l2info;
1845 	flow_l3info_t	*l3info = &s->fs_l3info;
1846 	uint16_t	sap = l2info->l2_sap;
1847 	uchar_t		*l3_start;
1848 
1849 	l3_start = l2info->l2_start + l2info->l2_hdrsize;
1850 
1851 	/*
1852 	 * Adjust start pointer if we're at the end of an mblk.
1853 	 */
1854 	CHECK_AND_ADJUST_START_PTR(s, l3_start);
1855 
1856 	l3info->l3_start = l3_start;
1857 	if (!OK_32PTR(l3_start))
1858 		return (EINVAL);
1859 
1860 	switch (sap) {
1861 	case ETHERTYPE_IP: {
1862 		ipha_t	*ipha = (ipha_t *)l3_start;
1863 
1864 		if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
1865 			return (ENOBUFS);
1866 
1867 		l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
1868 		l3info->l3_protocol = ipha->ipha_protocol;
1869 		l3info->l3_version = IPV4_VERSION;
1870 		l3info->l3_fragmented =
1871 		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
1872 		break;
1873 	}
1874 	case ETHERTYPE_IPV6: {
1875 		ip6_t   *ip6h = (ip6_t *)l3_start;
1876 		uint16_t ip6_hdrlen;
1877 		uint8_t	 nexthdr;
1878 
1879 		if (!mac_ip_hdr_length_v6(s->fs_mp, ip6h, &ip6_hdrlen,
1880 		    &nexthdr)) {
1881 			return (ENOBUFS);
1882 		}
1883 		l3info->l3_hdrsize = ip6_hdrlen;
1884 		l3info->l3_protocol = nexthdr;
1885 		l3info->l3_version = IPV6_VERSION;
1886 		l3info->l3_fragmented = B_FALSE;
1887 		break;
1888 	}
1889 	default:
1890 		return (EINVAL);
1891 	}
1892 	return (0);
1893 }
1894 
1895 /* ARGSUSED */
1896 static int
1897 flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1898 {
1899 	flow_desc_t	*fd = &flent->fe_flow_desc;
1900 
1901 	switch (fd->fd_protocol) {
1902 	case IPPROTO_TCP:
1903 	case IPPROTO_UDP:
1904 	case IPPROTO_SCTP:
1905 	case IPPROTO_ICMP:
1906 	case IPPROTO_ICMPV6:
1907 		flent->fe_match = flow_ip_proto_match;
1908 		return (0);
1909 	default:
1910 		return (EINVAL);
1911 	}
1912 }
1913 
1914 /* ARGSUSED */
1915 static int
1916 flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1917 {
1918 	flow_desc_t	*fd = &flent->fe_flow_desc;
1919 	flow_mask_t	mask;
1920 	uint8_t		version;
1921 	in6_addr_t	*addr, *netmask;
1922 
1923 	/*
1924 	 * DSField does not require a IP version.
1925 	 */
1926 	if (fd->fd_mask == FLOW_IP_DSFIELD) {
1927 		if (fd->fd_dsfield_mask == 0)
1928 			return (EINVAL);
1929 
1930 		flent->fe_match = flow_ip_dsfield_match;
1931 		return (0);
1932 	}
1933 
1934 	/*
1935 	 * IP addresses must come with a version to avoid ambiguity.
1936 	 */
1937 	if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
1938 		return (EINVAL);
1939 
1940 	version = fd->fd_ipversion;
1941 	if (version != IPV4_VERSION && version != IPV6_VERSION)
1942 		return (EINVAL);
1943 
1944 	mask = fd->fd_mask & ~FLOW_IP_VERSION;
1945 	switch (mask) {
1946 	case FLOW_IP_LOCAL:
1947 		addr = &fd->fd_local_addr;
1948 		netmask = &fd->fd_local_netmask;
1949 		break;
1950 	case FLOW_IP_REMOTE:
1951 		addr = &fd->fd_remote_addr;
1952 		netmask = &fd->fd_remote_netmask;
1953 		break;
1954 	default:
1955 		return (EINVAL);
1956 	}
1957 
1958 	/*
1959 	 * Apply netmask onto specified address.
1960 	 */
1961 	V6_MASK_COPY(*addr, *netmask, *addr);
1962 	if (version == IPV4_VERSION) {
1963 		ipaddr_t	v4addr = V4_PART_OF_V6((*addr));
1964 		ipaddr_t	v4mask = V4_PART_OF_V6((*netmask));
1965 
1966 		if (v4addr == 0 || v4mask == 0)
1967 			return (EINVAL);
1968 		flent->fe_match = flow_ip_v4_match;
1969 	} else {
1970 		if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
1971 		    IN6_IS_ADDR_UNSPECIFIED(netmask))
1972 			return (EINVAL);
1973 		flent->fe_match = flow_ip_v6_match;
1974 	}
1975 	return (0);
1976 }
1977 
1978 static uint32_t
1979 flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1980 {
1981 	flow_desc_t	*fd = &flent->fe_flow_desc;
1982 
1983 	return (fd->fd_protocol % ft->ft_size);
1984 }
1985 
1986 static uint32_t
1987 flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1988 {
1989 	flow_desc_t	*fd = &flent->fe_flow_desc;
1990 
1991 	/*
1992 	 * DSField flents are arranged as a single list.
1993 	 */
1994 	if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
1995 		return (0);
1996 
1997 	/*
1998 	 * IP addr flents are hashed into two lists, v4 or v6.
1999 	 */
2000 	ASSERT(ft->ft_size >= 2);
2001 	return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
2002 }
2003 
2004 /* ARGSUSED */
2005 static boolean_t
2006 flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2007 {
2008 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2009 
2010 	return (fd1->fd_protocol == fd2->fd_protocol);
2011 }
2012 
2013 /* ARGSUSED */
2014 static boolean_t
2015 flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2016 {
2017 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2018 	in6_addr_t	*a1, *m1, *a2, *m2;
2019 
2020 	ASSERT(fd1->fd_mask == fd2->fd_mask);
2021 	if (fd1->fd_mask == FLOW_IP_DSFIELD) {
2022 		return (fd1->fd_dsfield == fd2->fd_dsfield &&
2023 		    fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
2024 	}
2025 
2026 	/*
2027 	 * flow_ip_accept_fe() already validated the version.
2028 	 */
2029 	ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
2030 	if (fd1->fd_ipversion != fd2->fd_ipversion)
2031 		return (B_FALSE);
2032 
2033 	switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
2034 	case FLOW_IP_LOCAL:
2035 		a1 = &fd1->fd_local_addr;
2036 		m1 = &fd1->fd_local_netmask;
2037 		a2 = &fd2->fd_local_addr;
2038 		m2 = &fd2->fd_local_netmask;
2039 		break;
2040 	case FLOW_IP_REMOTE:
2041 		a1 = &fd1->fd_remote_addr;
2042 		m1 = &fd1->fd_remote_netmask;
2043 		a2 = &fd2->fd_remote_addr;
2044 		m2 = &fd2->fd_remote_netmask;
2045 		break;
2046 	default:
2047 		/*
2048 		 * This is unreachable given the checks in
2049 		 * flow_ip_accept_fe().
2050 		 */
2051 		return (B_FALSE);
2052 	}
2053 
2054 	if (fd1->fd_ipversion == IPV4_VERSION) {
2055 		return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
2056 		    V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));
2057 
2058 	} else {
2059 		return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
2060 		    IN6_ARE_ADDR_EQUAL(m1, m2));
2061 	}
2062 }
2063 
2064 static int
2065 flow_ip_mask2plen(in6_addr_t *v6mask)
2066 {
2067 	int		bits;
2068 	int		plen = IPV6_ABITS;
2069 	int		i;
2070 
2071 	for (i = 3; i >= 0; i--) {
2072 		if (v6mask->s6_addr32[i] == 0) {
2073 			plen -= 32;
2074 			continue;
2075 		}
2076 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
2077 		if (bits == 0)
2078 			break;
2079 		plen -= bits;
2080 	}
2081 	return (plen);
2082 }
2083 
2084 /* ARGSUSED */
2085 static int
2086 flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
2087     flow_entry_t *flent)
2088 {
2089 	flow_entry_t	**p = headp;
2090 	flow_desc_t	*fd0, *fd;
2091 	in6_addr_t	*m0, *m;
2092 	int		plen0, plen;
2093 
2094 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
2095 
2096 	/*
2097 	 * No special ordering needed for dsfield.
2098 	 */
2099 	fd0 = &flent->fe_flow_desc;
2100 	if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
2101 		if (*p != NULL) {
2102 			ASSERT(flent->fe_next == NULL);
2103 			flent->fe_next = *p;
2104 		}
2105 		*p = flent;
2106 		return (0);
2107 	}
2108 
2109 	/*
2110 	 * IP address flows are arranged in descending prefix length order.
2111 	 */
2112 	m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
2113 	    &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
2114 	plen0 = flow_ip_mask2plen(m0);
2115 	ASSERT(plen0 != 0);
2116 
2117 	for (; *p != NULL; p = &(*p)->fe_next) {
2118 		fd = &(*p)->fe_flow_desc;
2119 
2120 		/*
2121 		 * Normally a dsfield flent shouldn't end up on the same
2122 		 * list as an IP address because flow tables are (for now)
2123 		 * disjoint. If we decide to support both IP and dsfield
2124 		 * in the same table in the future, this check will allow
2125 		 * for that.
2126 		 */
2127 		if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2128 			continue;
2129 
2130 		/*
2131 		 * We also allow for the mixing of local and remote address
2132 		 * flents within one list.
2133 		 */
2134 		m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
2135 		    &fd->fd_local_netmask : &fd->fd_remote_netmask;
2136 		plen = flow_ip_mask2plen(m);
2137 
2138 		if (plen <= plen0)
2139 			break;
2140 	}
2141 	if (*p != NULL) {
2142 		ASSERT(flent->fe_next == NULL);
2143 		flent->fe_next = *p;
2144 	}
2145 	*p = flent;
2146 	return (0);
2147 }
2148 
2149 /*
2150  * Transport layer protocol and port matching functions.
2151  */
2152 
2153 /* ARGSUSED */
2154 static boolean_t
2155 flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2156 {
2157 	flow_l3info_t	*l3info = &s->fs_l3info;
2158 	flow_l4info_t	*l4info = &s->fs_l4info;
2159 	flow_desc_t	*fd = &flent->fe_flow_desc;
2160 
2161 	return (fd->fd_protocol == l3info->l3_protocol &&
2162 	    fd->fd_local_port == l4info->l4_hash_port);
2163 }
2164 
2165 /* ARGSUSED */
2166 static boolean_t
2167 flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2168 {
2169 	flow_l3info_t	*l3info = &s->fs_l3info;
2170 	flow_l4info_t	*l4info = &s->fs_l4info;
2171 	flow_desc_t	*fd = &flent->fe_flow_desc;
2172 
2173 	return (fd->fd_protocol == l3info->l3_protocol &&
2174 	    fd->fd_remote_port == l4info->l4_hash_port);
2175 }
2176 
2177 /*
2178  * Transport hash function.
2179  * Since we only support either local or remote port flows,
2180  * we only need to extract one of the ports to be used for
2181  * matching.
2182  */
2183 static uint32_t
2184 flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
2185 {
2186 	flow_l3info_t	*l3info = &s->fs_l3info;
2187 	flow_l4info_t	*l4info = &s->fs_l4info;
2188 	uint8_t		proto = l3info->l3_protocol;
2189 	boolean_t	dst_or_src;
2190 
2191 	if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
2192 		dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
2193 	} else {
2194 		dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
2195 	}
2196 
2197 	l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
2198 	    l4info->l4_src_port;
2199 
2200 	return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
2201 }
2202 
2203 /*
2204  * Unlike other accept() functions above, we do not need to get the header
2205  * size because this is our highest layer so far. If we want to do support
2206  * other higher layer protocols, we would need to save the l4_hdrsize
2207  * in the code below.
2208  */
2209 
2210 /* ARGSUSED */
2211 static int
2212 flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
2213 {
2214 	flow_l3info_t	*l3info = &s->fs_l3info;
2215 	flow_l4info_t	*l4info = &s->fs_l4info;
2216 	uint8_t		proto = l3info->l3_protocol;
2217 	uchar_t		*l4_start;
2218 
2219 	l4_start = l3info->l3_start + l3info->l3_hdrsize;
2220 
2221 	/*
2222 	 * Adjust start pointer if we're at the end of an mblk.
2223 	 */
2224 	CHECK_AND_ADJUST_START_PTR(s, l4_start);
2225 
2226 	l4info->l4_start = l4_start;
2227 	if (!OK_32PTR(l4_start))
2228 		return (EINVAL);
2229 
2230 	if (l3info->l3_fragmented == B_TRUE)
2231 		return (EINVAL);
2232 
2233 	switch (proto) {
2234 	case IPPROTO_TCP: {
2235 		struct tcphdr	*tcph = (struct tcphdr *)l4_start;
2236 
2237 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
2238 			return (ENOBUFS);
2239 
2240 		l4info->l4_src_port = tcph->th_sport;
2241 		l4info->l4_dst_port = tcph->th_dport;
2242 		break;
2243 	}
2244 	case IPPROTO_UDP: {
2245 		struct udphdr	*udph = (struct udphdr *)l4_start;
2246 
2247 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
2248 			return (ENOBUFS);
2249 
2250 		l4info->l4_src_port = udph->uh_sport;
2251 		l4info->l4_dst_port = udph->uh_dport;
2252 		break;
2253 	}
2254 	case IPPROTO_SCTP: {
2255 		sctp_hdr_t	*sctph = (sctp_hdr_t *)l4_start;
2256 
2257 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
2258 			return (ENOBUFS);
2259 
2260 		l4info->l4_src_port = sctph->sh_sport;
2261 		l4info->l4_dst_port = sctph->sh_dport;
2262 		break;
2263 	}
2264 	default:
2265 		return (EINVAL);
2266 	}
2267 
2268 	return (0);
2269 }
2270 
2271 /*
2272  * Validates transport flow entry.
2273  * The protocol field must be present.
2274  */
2275 
2276 /* ARGSUSED */
2277 static int
2278 flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2279 {
2280 	flow_desc_t	*fd = &flent->fe_flow_desc;
2281 	flow_mask_t	mask = fd->fd_mask;
2282 
2283 	if ((mask & FLOW_IP_PROTOCOL) == 0)
2284 		return (EINVAL);
2285 
2286 	switch (fd->fd_protocol) {
2287 	case IPPROTO_TCP:
2288 	case IPPROTO_UDP:
2289 	case IPPROTO_SCTP:
2290 		break;
2291 	default:
2292 		return (EINVAL);
2293 	}
2294 
2295 	switch (mask & ~FLOW_IP_PROTOCOL) {
2296 	case FLOW_ULP_PORT_LOCAL:
2297 		if (fd->fd_local_port == 0)
2298 			return (EINVAL);
2299 
2300 		flent->fe_match = flow_transport_lport_match;
2301 		break;
2302 	case FLOW_ULP_PORT_REMOTE:
2303 		if (fd->fd_remote_port == 0)
2304 			return (EINVAL);
2305 
2306 		flent->fe_match = flow_transport_rport_match;
2307 		break;
2308 	case 0:
2309 		/*
2310 		 * transport-only flows conflicts with our table type.
2311 		 */
2312 		return (EOPNOTSUPP);
2313 	default:
2314 		return (EINVAL);
2315 	}
2316 
2317 	return (0);
2318 }
2319 
2320 static uint32_t
2321 flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2322 {
2323 	flow_desc_t	*fd = &flent->fe_flow_desc;
2324 	uint16_t	port = 0;
2325 
2326 	port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
2327 	    fd->fd_local_port : fd->fd_remote_port;
2328 
2329 	return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
2330 }
2331 
2332 /* ARGSUSED */
2333 static boolean_t
2334 flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2335 {
2336 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2337 
2338 	if (fd1->fd_protocol != fd2->fd_protocol)
2339 		return (B_FALSE);
2340 
2341 	if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
2342 		return (fd1->fd_local_port == fd2->fd_local_port);
2343 
2344 	return (fd1->fd_remote_port == fd2->fd_remote_port);
2345 }
2346 
2347 static flow_ops_t flow_l2_ops = {
2348 	flow_l2_accept_fe,
2349 	flow_l2_hash_fe,
2350 	flow_l2_match_fe,
2351 	flow_generic_insert_fe,
2352 	flow_l2_hash,
2353 	{flow_l2_accept}
2354 };
2355 
2356 static flow_ops_t flow_ip_ops = {
2357 	flow_ip_accept_fe,
2358 	flow_ip_hash_fe,
2359 	flow_ip_match_fe,
2360 	flow_ip_insert_fe,
2361 	flow_ip_hash,
2362 	{flow_l2_accept, flow_ip_accept}
2363 };
2364 
2365 static flow_ops_t flow_ip_proto_ops = {
2366 	flow_ip_proto_accept_fe,
2367 	flow_ip_proto_hash_fe,
2368 	flow_ip_proto_match_fe,
2369 	flow_generic_insert_fe,
2370 	flow_ip_proto_hash,
2371 	{flow_l2_accept, flow_ip_accept}
2372 };
2373 
2374 static flow_ops_t flow_transport_ops = {
2375 	flow_transport_accept_fe,
2376 	flow_transport_hash_fe,
2377 	flow_transport_match_fe,
2378 	flow_generic_insert_fe,
2379 	flow_transport_hash,
2380 	{flow_l2_accept, flow_ip_accept, flow_transport_accept}
2381 };
2382 
2383 static flow_tab_info_t flow_tab_info_list[] = {
2384 	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
2385 	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
2386 	{&flow_ip_ops, FLOW_IP_DSFIELD, 1},
2387 	{&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
2388 	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024}
2389 };
2390 
2391 #define	FLOW_MAX_TAB_INFO \
2392 	((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t))
2393 
2394 static flow_tab_info_t *
2395 mac_flow_tab_info_get(flow_mask_t mask)
2396 {
2397 	int	i;
2398 
2399 	for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
2400 		if (mask == flow_tab_info_list[i].fti_mask)
2401 			return (&flow_tab_info_list[i]);
2402 	}
2403 	return (NULL);
2404 }
2405