xref: /titanic_51/usr/src/uts/common/io/mac/mac_flow.c (revision 37acf26adb79d43bb16f72774829c6f4655d0cc4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/strsun.h>
28 #include <sys/sdt.h>
29 #include <sys/mac.h>
30 #include <sys/mac_impl.h>
31 #include <sys/mac_client_impl.h>
32 #include <sys/dls.h>
33 #include <sys/dls_impl.h>
34 #include <sys/mac_soft_ring.h>
35 #include <sys/ethernet.h>
36 #include <sys/vlan.h>
37 #include <inet/ip.h>
38 #include <inet/ip6.h>
39 #include <netinet/tcp.h>
40 #include <netinet/udp.h>
41 #include <netinet/sctp.h>
42 
/* global flow table, will be a per exclusive-zone table later */
/* Name-keyed hash of flow entries; see mac_flow_hash_add(). */
static mod_hash_t	*flow_hash;
/* Held around every insert, remove and find on flow_hash in this file. */
static krwlock_t	flow_tab_lock;

/* kmem caches backing flow_entry_t and flow_tab_t allocations. */
static kmem_cache_t	*flow_cache;
static kmem_cache_t	*flow_tab_cache;
/* Ops vector handed to mac_flow_tab_create() by mac_flow_l2tab_create(). */
static flow_ops_t	flow_l2_ops;
50 
/*
 * One row per exported flow statistic: the kstat name and the byte
 * offset of the backing counter inside flow_stats_t.
 */
typedef struct {
	const char	*fs_name;
	uint_t		fs_offset;
} flow_stats_info_t;

/* Byte offset of member `f' within flow_stats_t. */
#define	FS_OFF(f)	(offsetof(flow_stats_t, f))
/*
 * Table driving flow_stat_init() and flow_stat_update(); the named
 * kstat array is laid out in this order.
 */
static flow_stats_info_t flow_stats_list[] = {
	{"rbytes",	FS_OFF(fs_rbytes)},
	{"ipackets",	FS_OFF(fs_ipackets)},
	{"ierrors",	FS_OFF(fs_ierrors)},
	{"obytes",	FS_OFF(fs_obytes)},
	{"opackets",	FS_OFF(fs_opackets)},
	{"oerrors",	FS_OFF(fs_oerrors)}
};
/* Number of rows in flow_stats_list[]. */
#define	FS_SIZE		(sizeof (flow_stats_list) / sizeof (flow_stats_info_t))
66 
/*
 * Checks whether a flow mask is legal.
 *
 * Forward declaration only; the definition is not in this part of the
 * file. mac_flow_add_subflow() treats a NULL return as an unsupported
 * mask and otherwise uses the returned fti_ops and fti_size to create
 * the subflow table.
 */
static flow_tab_info_t	*mac_flow_tab_info_get(flow_mask_t);
71 
72 static void
73 flow_stat_init(kstat_named_t *knp)
74 {
75 	int	i;
76 
77 	for (i = 0; i < FS_SIZE; i++, knp++) {
78 		kstat_named_init(knp, flow_stats_list[i].fs_name,
79 		    KSTAT_DATA_UINT64);
80 	}
81 }
82 
83 static int
84 flow_stat_update(kstat_t *ksp, int rw)
85 {
86 	flow_entry_t		*fep = ksp->ks_private;
87 	flow_stats_t 		*fsp = &fep->fe_flowstats;
88 	kstat_named_t		*knp = ksp->ks_data;
89 	uint64_t		*statp;
90 	zoneid_t		zid;
91 	int			i;
92 
93 	if (rw != KSTAT_READ)
94 		return (EACCES);
95 
96 	zid = getzoneid();
97 	if (zid != GLOBAL_ZONEID && zid != fep->fe_zoneid) {
98 		for (i = 0; i < FS_SIZE; i++, knp++)
99 			knp->value.ui64 = 0;
100 
101 		return (0);
102 	}
103 
104 	for (i = 0; i < FS_SIZE; i++, knp++) {
105 		statp = (uint64_t *)
106 		    ((uchar_t *)fsp + flow_stats_list[i].fs_offset);
107 
108 		knp->value.ui64 = *statp;
109 	}
110 	return (0);
111 }
112 
113 static void
114 flow_stat_create(flow_entry_t *fep)
115 {
116 	kstat_t		*ksp;
117 	kstat_named_t	*knp;
118 	uint_t		nstats = FS_SIZE;
119 
120 	ksp = kstat_create("unix", 0, (char *)fep->fe_flow_name, "flow",
121 	    KSTAT_TYPE_NAMED, nstats, 0);
122 	if (ksp == NULL)
123 		return;
124 
125 	ksp->ks_update = flow_stat_update;
126 	ksp->ks_private = fep;
127 	fep->fe_ksp = ksp;
128 
129 	knp = (kstat_named_t *)ksp->ks_data;
130 	flow_stat_init(knp);
131 	kstat_install(ksp);
132 }
133 
134 void
135 flow_stat_destroy(flow_entry_t *fep)
136 {
137 	if (fep->fe_ksp != NULL) {
138 		kstat_delete(fep->fe_ksp);
139 		fep->fe_ksp = NULL;
140 	}
141 }
142 
143 /*
144  * Initialize the flow table
145  */
146 void
147 mac_flow_init()
148 {
149 	flow_cache = kmem_cache_create("flow_entry_cache",
150 	    sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
151 	flow_tab_cache = kmem_cache_create("flow_tab_cache",
152 	    sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
153 	flow_hash = mod_hash_create_extended("flow_hash",
154 	    100, mod_hash_null_keydtor, mod_hash_null_valdtor,
155 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
156 	rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
157 }
158 
159 /*
160  * Cleanup and release the flow table
161  */
162 void
163 mac_flow_fini()
164 {
165 	kmem_cache_destroy(flow_cache);
166 	kmem_cache_destroy(flow_tab_cache);
167 	mod_hash_destroy_hash(flow_hash);
168 	rw_destroy(&flow_tab_lock);
169 }
170 
/*
 * mac_flow_create(): create a flow_entry_t, or finish configuring one.
 *
 * If *flentp is NULL a new entry is allocated from flow_cache and
 * initialized; otherwise the entry passed in is reused. In both cases
 * the flow name is (re)written from `name'. When fd is NULL the entry
 * is handed back with nothing else configured ("initial flow").
 *
 * If mrp is non-NULL it is validated up front. On the configured path
 * it is also MODIFIED IN PLACE: MRP_PRIORITY is forced on in mrp_mask
 * and mrp_priority is overwritten with the default for the flow type
 * before the structure is copied into fe_effective_props.
 *
 * Returns 0 on success (with *flentp pointing at the entry), or the
 * error from mac_validate_props(), in which case *flentp is untouched.
 */
int
mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
    void *client_cookie, uint_t type, flow_entry_t **flentp)
{
	flow_entry_t	*flent = *flentp;
	int		err = 0;

	/* Reject bad properties before allocating anything. */
	if (mrp != NULL) {
		err = mac_validate_props(mrp);
		if (err != 0)
			return (err);
	}

	if (flent == NULL) {
		flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
		bzero(flent, sizeof (*flent));
		mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);

		/* Initialize the receiver function to a safe routine */
		flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
		/* -1: not in any flow table yet (see mac_flow_add()). */
		flent->fe_index = -1;
	}
	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);

	/* This is an initial flow, will be configured later */
	if (fd == NULL) {
		*flentp = flent;
		return (0);
	}

	flent->fe_client_cookie = client_cookie;
	flent->fe_type = type;

	/*
	 * As flow creation is only allowed in global zone, this will
	 * always set fe_zoneid to GLOBAL_ZONEID, and dls_add_flow() will
	 * later set the right value.
	 */
	flent->fe_zoneid = getzoneid();

	/* Save flow desc */
	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));

	if (mrp != NULL) {
		/*
		 * We have already set fe_resource_props for a Link.
		 */
		if (type & FLOW_USER) {
			bcopy(mrp, &flent->fe_resource_props,
			    sizeof (mac_resource_props_t));
		}
		/*
		 * The effective resource list should reflect the priority
		 * that we set implicitly.
		 */
		/* Note: the writes below modify the caller's mrp. */
		if (!(mrp->mrp_mask & MRP_PRIORITY))
			mrp->mrp_mask |= MRP_PRIORITY;
		if (type & FLOW_USER)
			mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
		else
			mrp->mrp_priority = MPL_LINK_DEFAULT;
		bcopy(mrp, &flent->fe_effective_props,
		    sizeof (mac_resource_props_t));
	}
	flow_stat_create(flent);

	*flentp = flent;
	return (0);
}
244 
/*
 * Validate flow entry and add it to a flow table.
 *
 * The caller must hold the mac perimeter of ft->ft_mip. Returns:
 *   EOPNOTSUPP  the flow's mask shares no bit with the table's mask,
 *               or has a bit the table's mask lacks;
 *   EALREADY    an entry with the same mask that fo_match_fe() says
 *               matches is already in the target hash bucket;
 *   other       whatever fo_accept_fe() or fo_insert_fe() returned;
 *   0           inserted; fe_index and fe_flow_tab are set, the
 *               FE_FLOW_TAB flag is marked and ft_flow_count is bumped.
 */
int
mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_entry_t	**headp, **p;
	flow_ops_t	*ops = &ft->ft_ops;
	flow_mask_t	mask;
	uint32_t	index;
	int		err;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * Check for invalid bits in mask.
	 */
	mask = flent->fe_flow_desc.fd_mask;
	if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
		return (EOPNOTSUPP);

	/*
	 * Validate flent.
	 */
	if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
		DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
		    flow_entry_t *, flent, int, err);
		return (err);
	}

	/*
	 * Flent is valid. now calculate hash and insert it
	 * into hash table.
	 */
	index = ops->fo_hash_fe(ft, flent);

	/*
	 * We do not need a lock up until now because we were
	 * not accessing the flow table.
	 */
	rw_enter(&ft->ft_lock, RW_WRITER);
	headp = &ft->ft_table[index];

	/*
	 * Check for duplicate flow.
	 */
	for (p = headp; *p != NULL; p = &(*p)->fe_next) {
		if ((*p)->fe_flow_desc.fd_mask !=
		    flent->fe_flow_desc.fd_mask)
			continue;

		if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
			rw_exit(&ft->ft_lock);
			/*
			 * NOTE(review): err is still 0 here (left over
			 * from the successful accept call above), so
			 * this probe reports 0 rather than EALREADY.
			 */
			DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
			    flow_entry_t *, flent, int, err);
			return (EALREADY);
		}
	}

	/*
	 * Insert flow to hash list.
	 */
	err = ops->fo_insert_fe(ft, headp, flent);
	if (err != 0) {
		rw_exit(&ft->ft_lock);
		DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
		    flow_entry_t *, flent, int, err);
		return (err);
	}

	/*
	 * Save the hash index so it can be used by mac_flow_remove().
	 */
	flent->fe_index = (int)index;

	/*
	 * Save the flow tab back reference.
	 */
	flent->fe_flow_tab = ft;
	FLOW_MARK(flent, FE_FLOW_TAB);
	ft->ft_flow_count++;
	rw_exit(&ft->ft_lock);
	return (0);
}
329 
330 /*
331  * Remove a flow from a mac client's subflow table
332  */
333 void
334 mac_flow_rem_subflow(flow_entry_t *flent)
335 {
336 	flow_tab_t		*ft = flent->fe_flow_tab;
337 	mac_client_impl_t	*mcip = ft->ft_mcip;
338 	mac_handle_t		mh = (mac_handle_t)ft->ft_mip;
339 
340 	ASSERT(MAC_PERIM_HELD(mh));
341 
342 	mac_flow_remove(ft, flent, B_FALSE);
343 	if (flent->fe_mcip == NULL) {
344 		/*
345 		 * The interface is not yet plumbed and mac_client_flow_add
346 		 * was not done.
347 		 */
348 		if (FLOW_TAB_EMPTY(ft)) {
349 			mac_flow_tab_destroy(ft);
350 			mcip->mci_subflow_tab = NULL;
351 		}
352 	} else {
353 		mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
354 		mac_link_flow_clean((mac_client_handle_t)mcip, flent);
355 	}
356 	mac_fastpath_enable(mh);
357 }
358 
/*
 * Add a flow to a mac client's subflow table and instantiate the flow
 * in the mac by creating the associated SRSs etc.
 *
 * The caller must hold the mac perimeter. The fastpath is disabled
 * first and is re-enabled on every failure path; on success it stays
 * disabled (mac_flow_rem_subflow() re-enables it).
 *
 * If the client has no subflow table yet, one is created from the
 * table info matching the flow's mask; it is only published in
 * mci_subflow_tab once the flow has been added (and, when
 * instantiate_flow is set, initialized) successfully. A table created
 * here is destroyed again if a later step fails.
 *
 * When instantiate_flow is B_FALSE the flow is only marked
 * FE_UF_NO_DATAPATH.
 *
 * Returns 0 on success, EOPNOTSUPP if no table info exists for the
 * mask, or the error from mac_fastpath_disable(), mac_flow_add() or
 * mac_link_flow_init().
 */
int
mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
    boolean_t instantiate_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_handle_t		mh = (mac_handle_t)mcip->mci_mip;
	flow_tab_info_t		*ftinfo;
	flow_mask_t		mask;
	flow_tab_t		*ft;
	int			err;
	boolean_t		ft_created = B_FALSE;

	ASSERT(MAC_PERIM_HELD(mh));

	if ((err = mac_fastpath_disable(mh)) != 0)
		return (err);

	/*
	 * If the subflow table exists already just add the new subflow
	 * to the existing table, else we create a new subflow table below.
	 */
	ft = mcip->mci_subflow_tab;
	if (ft == NULL) {
		mask = flent->fe_flow_desc.fd_mask;
		/*
		 * Try to create a new table and then add the subflow to the
		 * newly created subflow table
		 */
		if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) {
			mac_fastpath_enable(mh);
			return (EOPNOTSUPP);
		}

		mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
		    mcip->mci_mip, &ft);
		ft_created = B_TRUE;
	}

	err = mac_flow_add(ft, flent);
	if (err != 0) {
		/* Only tear down a table this call created. */
		if (ft_created)
			mac_flow_tab_destroy(ft);
		mac_fastpath_enable(mh);
		return (err);
	}

	if (instantiate_flow) {
		/* Now activate the flow by creating its SRSs */
		ASSERT(MCIP_DATAPATH_SETUP(mcip));
		err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
		if (err != 0) {
			/* Undo the table insert, then the table itself. */
			mac_flow_remove(ft, flent, B_FALSE);
			if (ft_created)
				mac_flow_tab_destroy(ft);
			mac_fastpath_enable(mh);
			return (err);
		}
	} else {
		FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	}
	if (ft_created) {
		/* Publish the new table only now that nothing can fail. */
		ASSERT(mcip->mci_subflow_tab == NULL);
		ft->ft_mcip = mcip;
		mcip->mci_subflow_tab = ft;
		if (instantiate_flow)
			mac_client_update_classifier(mcip, B_TRUE);
	}
	return (0);
}
432 
433 /*
434  * Remove flow entry from flow table.
435  */
436 void
437 mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
438 {
439 	flow_entry_t	**fp;
440 
441 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
442 	if (!(flent->fe_flags & FE_FLOW_TAB))
443 		return;
444 
445 	rw_enter(&ft->ft_lock, RW_WRITER);
446 	/*
447 	 * If this is a permanent removal from the flow table, mark it
448 	 * CONDEMNED to prevent future references. If this is a temporary
449 	 * removal from the table, say to update the flow descriptor then
450 	 * we don't mark it CONDEMNED
451 	 */
452 	if (!temp)
453 		FLOW_MARK(flent, FE_CONDEMNED);
454 	/*
455 	 * Locate the specified flent.
456 	 */
457 	fp = &ft->ft_table[flent->fe_index];
458 	while (*fp != flent)
459 		fp = &(*fp)->fe_next;
460 
461 	/*
462 	 * The flent must exist. Otherwise it's a bug.
463 	 */
464 	ASSERT(fp != NULL);
465 	*fp = flent->fe_next;
466 	flent->fe_next = NULL;
467 
468 	/*
469 	 * Reset fe_index to -1 so any attempt to call mac_flow_remove()
470 	 * on a flent that is supposed to be in the table (FE_FLOW_TAB)
471 	 * will panic.
472 	 */
473 	flent->fe_index = -1;
474 	FLOW_UNMARK(flent, FE_FLOW_TAB);
475 	ft->ft_flow_count--;
476 	rw_exit(&ft->ft_lock);
477 }
478 
/*
 * This is the flow lookup routine used by the mac sw classifier engine.
 *
 * Runs the table's accept functions over the packet to build up a
 * flow_state_t, then hashes that state and searches the bucket under
 * ft_lock (reader) for the first entry whose fe_match() accepts it and
 * on which FLOW_TRY_REFHOLD succeeds.
 *
 * Returns:
 *   0        match found; *flentp is set and a reference was taken
 *            via FLOW_TRY_REFHOLD;
 *   ENOENT   no entry matched (or none could be held);
 *   EINVAL   an accept function asked for more data (ENOBUFS) but the
 *            message could not be pulled up;
 *   other    the error from the failing accept function.
 * *flentp is written only on success.
 */
int
mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
{
	flow_state_t	s;
	flow_entry_t	*flent;
	flow_ops_t	*ops = &ft->ft_ops;
	boolean_t	retried = B_FALSE;
	int		i, err;

	s.fs_flags = flags;
retry:
	/* Restart parsing from the head of the message on each pass. */
	s.fs_mp = mp;

	/*
	 * Walk the list of predeclared accept functions.
	 * Each of these would accumulate enough state to allow the next
	 * accept routine to make progress.
	 */
	for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
		if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
			mblk_t	*last;

			/*
			 * ENOBUFS indicates that the mp could be too short
			 * and may need a pullup.
			 */
			/* At most one pullup-and-retry per lookup. */
			if (err != ENOBUFS || retried)
				return (err);

			/*
			 * The pullup is done on the last processed mblk, not
			 * the starting one. pullup is not done if the mblk
			 * has references or if b_cont is NULL.
			 */
			last = s.fs_mp;
			if (DB_REF(last) > 1 || last->b_cont == NULL ||
			    pullupmsg(last, -1) == 0)
				return (EINVAL);

			retried = B_TRUE;
			DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
			    flow_state_t *, &s);
			goto retry;
		}
	}

	/*
	 * The packet is considered sane. We may now attempt to
	 * find the corresponding flent.
	 */
	rw_enter(&ft->ft_lock, RW_READER);
	flent = ft->ft_table[ops->fo_hash(ft, &s)];
	for (; flent != NULL; flent = flent->fe_next) {
		if (flent->fe_match(ft, flent, &s)) {
			/*
			 * A matching entry that cannot be held is skipped
			 * and the search continues down the chain.
			 */
			FLOW_TRY_REFHOLD(flent, err);
			if (err != 0)
				continue;
			*flentp = flent;
			rw_exit(&ft->ft_lock);
			return (0);
		}
	}
	rw_exit(&ft->ft_lock);
	return (ENOENT);
}
547 
548 /*
549  * Walk flow table.
550  * The caller is assumed to have proper perimeter protection.
551  */
552 int
553 mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
554     void *arg)
555 {
556 	int		err, i, cnt = 0;
557 	flow_entry_t	*flent;
558 
559 	if (ft == NULL)
560 		return (0);
561 
562 	for (i = 0; i < ft->ft_size; i++) {
563 		for (flent = ft->ft_table[i]; flent != NULL;
564 		    flent = flent->fe_next) {
565 			cnt++;
566 			err = (*fn)(flent, arg);
567 			if (err != 0)
568 				return (err);
569 		}
570 	}
571 	VERIFY(cnt == ft->ft_flow_count);
572 	return (0);
573 }
574 
575 /*
576  * Same as the above except a mutex is used for protection here.
577  */
578 int
579 mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
580     void *arg)
581 {
582 	int		err;
583 
584 	if (ft == NULL)
585 		return (0);
586 
587 	rw_enter(&ft->ft_lock, RW_WRITER);
588 	err = mac_flow_walk_nolock(ft, fn, arg);
589 	rw_exit(&ft->ft_lock);
590 	return (err);
591 }
592 
593 static boolean_t	mac_flow_clean(flow_entry_t *);
594 
595 /*
596  * Destroy a flow entry. Called when the last reference on a flow is released.
597  */
598 void
599 mac_flow_destroy(flow_entry_t *flent)
600 {
601 	ASSERT(flent->fe_refcnt == 0);
602 
603 	if ((flent->fe_type & FLOW_USER) != 0) {
604 		ASSERT(mac_flow_clean(flent));
605 	} else {
606 		mac_flow_cleanup(flent);
607 	}
608 
609 	mutex_destroy(&flent->fe_lock);
610 	cv_destroy(&flent->fe_cv);
611 	flow_stat_destroy(flent);
612 	kmem_cache_free(flow_cache, flent);
613 }
614 
/*
 * XXX eric
 * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
 * mac_link_flow_modify() should really be moved/reworked into the
 * two functions below. This would consolidate all the mac property
 * checking in one place. I'm leaving this alone for now since it's
 * out of scope of the new flows work.
 */
/*
 * Fold the properties selected by mrp->mrp_mask (max bandwidth,
 * priority, CPU list) into the flow's effective properties and return
 * a mask of the MRP_* properties whose effective value changed.
 *
 * No locking is done here; mac_flow_modify() calls this with ft_lock
 * held as writer.
 */
/* ARGSUSED */
uint32_t
mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
{
	uint32_t		changed_mask = 0;
	mac_resource_props_t	*fmrp = &flent->fe_effective_props;
	int			i;

	/* Bandwidth: only touched when the requested limit differs. */
	if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
	    (fmrp->mrp_maxbw != mrp->mrp_maxbw)) {
		changed_mask |= MRP_MAXBW;
		fmrp->mrp_maxbw = mrp->mrp_maxbw;
		/* The reset value clears the "limit is set" bit. */
		if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
			fmrp->mrp_mask &= ~MRP_MAXBW;
		} else {
			fmrp->mrp_mask |= MRP_MAXBW;
		}
	}

	if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
		if (fmrp->mrp_priority != mrp->mrp_priority)
			changed_mask |= MRP_PRIORITY;
		/* MPL_RESET falls back to the subflow default priority. */
		if (mrp->mrp_priority == MPL_RESET) {
			fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
			fmrp->mrp_mask &= ~MRP_PRIORITY;
		} else {
			fmrp->mrp_priority = mrp->mrp_priority;
			fmrp->mrp_mask |= MRP_PRIORITY;
		}
	}

	/* modify fanout */
	if ((mrp->mrp_mask & MRP_CPUS) != 0) {
		if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
		    (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
			for (i = 0; i < mrp->mrp_ncpus; i++) {
				if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
					break;
			}
			if (i == mrp->mrp_ncpus) {
				/*
				 * The new set of cpus passed is exactly
				 * the same as the existing set.
				 */
				return (changed_mask);
			}
		}
		changed_mask |= MRP_CPUS;
		MAC_COPY_CPUS(mrp, fmrp);
	}
	return (changed_mask);
}
675 
676 void
677 mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
678 {
679 	uint32_t changed_mask;
680 	mac_client_impl_t *mcip = flent->fe_mcip;
681 	mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
682 
683 	ASSERT(flent != NULL);
684 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
685 
686 	rw_enter(&ft->ft_lock, RW_WRITER);
687 
688 	/* Update the cached values inside the subflow entry */
689 	changed_mask = mac_flow_modify_props(flent, mrp);
690 	rw_exit(&ft->ft_lock);
691 	/*
692 	 * Push the changed parameters to the scheduling code in the
693 	 * SRS's, to take effect right away.
694 	 */
695 	if (changed_mask & MRP_MAXBW) {
696 		mac_srs_update_bwlimit(flent, mrp);
697 		/*
698 		 * If bandwidth is changed, we may have to change
699 		 * the number of soft ring to be used for fanout.
700 		 * Call mac_flow_update_fanout() if MAC_BIND_CPU
701 		 * is not set and there is no user supplied cpu
702 		 * info. This applies only to link at this time.
703 		 */
704 		if (!(flent->fe_type & FLOW_USER) &&
705 		    !(changed_mask & MRP_CPUS) &&
706 		    !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
707 			mac_fanout_setup(mcip, flent, mcip_mrp,
708 			    mac_rx_deliver, mcip, NULL);
709 		}
710 	}
711 	if (mrp->mrp_mask & MRP_PRIORITY)
712 		mac_flow_update_priority(mcip, flent);
713 
714 	if (changed_mask & MRP_CPUS)
715 		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL);
716 }
717 
/*
 * This function waits for a certain condition to be met and is generally
 * used before a destructive or quiescing operation.
 *
 * FE_WAITER is set in fe_flags for the duration of the wait, and the
 * wait itself is a cv_wait() on fe_cv under fe_lock, so whoever
 * changes the awaited counter is expected to signal fe_cv (that code
 * is not in this part of the file).
 *
 *   FLOW_DRIVER_UPCALL  block until fe_refcnt has dropped to 1;
 *   FLOW_USER_REF       block until fe_user_refcnt has dropped to 0.
 */
void
mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
{
	mutex_enter(&flent->fe_lock);
	flent->fe_flags |= FE_WAITER;

	switch (event) {
	case FLOW_DRIVER_UPCALL:
		/*
		 * We want to make sure the driver upcalls have finished before
		 * we signal the Rx SRS worker to quit.
		 */
		while (flent->fe_refcnt != 1)
			cv_wait(&flent->fe_cv, &flent->fe_lock);
		break;

	case FLOW_USER_REF:
		/*
		 * Wait for the fe_user_refcnt to drop to 0. The flow has
		 * been removed from the global flow hash.
		 */
		ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH));
		while (flent->fe_user_refcnt != 0)
			cv_wait(&flent->fe_cv, &flent->fe_lock);
		break;

	default:
		/* Unknown event: a caller bug. */
		ASSERT(0);
	}

	flent->fe_flags &= ~FE_WAITER;
	mutex_exit(&flent->fe_lock);
}
755 
/*
 * Debug-only sanity check that a flow entry no longer owns anything:
 * it is not linked into a hash chain and has no Tx SRS, no Rx SRS and
 * no broadcast group. Always returns B_TRUE so that it can be wrapped
 * in ASSERT() by mac_flow_destroy().
 */
static boolean_t
mac_flow_clean(flow_entry_t *flent)
{
	ASSERT(flent->fe_next == NULL);
	ASSERT(flent->fe_tx_srs == NULL);
	ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL);
	ASSERT(flent->fe_mbg == NULL);

	return (B_TRUE);
}
766 
/*
 * Release the resources hanging off a flow entry: its broadcast or
 * multicast group, its Tx SRS, and its (single) Rx SRS. Each pointer
 * is cleared after the resource is freed. The entry itself is not
 * freed here.
 *
 * Expected state on entry (asserted): a non-user flow has exactly one
 * of fe_mbg / fe_mcip set and a reference count of 0; a user flow has
 * a reference count of 1.
 */
void
mac_flow_cleanup(flow_entry_t *flent)
{
	if ((flent->fe_type & FLOW_USER) == 0) {
		ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) ||
		    (flent->fe_mbg != NULL && flent->fe_mcip == NULL));
		ASSERT(flent->fe_refcnt == 0);
	} else {
		ASSERT(flent->fe_refcnt == 1);
	}

	if (flent->fe_mbg != NULL) {
		ASSERT(flent->fe_tx_srs == NULL);
		/* This is a multicast or broadcast flow entry */
		mac_bcast_grp_free(flent->fe_mbg);
		flent->fe_mbg = NULL;
	}

	if (flent->fe_tx_srs != NULL) {
		ASSERT(flent->fe_mbg == NULL);
		mac_srs_free(flent->fe_tx_srs);
		flent->fe_tx_srs = NULL;
	}

	/*
	 * In the normal case fe_rx_srs_cnt is 1. However in the error case
	 * when mac_unicast_add fails we may not have set up any SRS
	 * in which case fe_rx_srs_cnt will be zero.
	 */
	if (flent->fe_rx_srs_cnt != 0) {
		ASSERT(flent->fe_rx_srs_cnt == 1);
		mac_srs_free(flent->fe_rx_srs[0]);
		flent->fe_rx_srs[0] = NULL;
		flent->fe_rx_srs_cnt = 0;
	}
	ASSERT(flent->fe_rx_srs[0] == NULL);
}
804 
805 void
806 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
807 {
808 	/*
809 	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
810 	 * Updates to the fe_flow_desc happen under the fe_lock
811 	 * after removing the flent from the flow table
812 	 */
813 	mutex_enter(&flent->fe_lock);
814 	bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
815 	mutex_exit(&flent->fe_lock);
816 }
817 
818 /*
819  * Update a field of a flow entry. The mac perimeter ensures that
820  * this is the only thread doing a modify operation on this mac end point.
821  * So the flow table can't change or disappear. The ft_lock protects access
822  * to the flow entry, and holding the lock ensures that there isn't any thread
823  * accessing the flow entry or attempting a flow table lookup. However
824  * data threads that are using the flow entry based on the old descriptor
825  * will continue to use the flow entry. If strong coherence is required
826  * then the flow will have to be quiesced before the descriptor can be
827  * changed.
828  */
829 void
830 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
831 {
832 	flow_tab_t	*ft = flent->fe_flow_tab;
833 	flow_desc_t	old_desc;
834 	int		err;
835 
836 	if (ft == NULL) {
837 		/*
838 		 * The flow hasn't yet been inserted into the table,
839 		 * so only the caller knows about this flow, however for
840 		 * uniformity we grab the fe_lock here.
841 		 */
842 		mutex_enter(&flent->fe_lock);
843 		bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
844 		mutex_exit(&flent->fe_lock);
845 	}
846 
847 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
848 
849 	/*
850 	 * Need to remove the flow entry from the table and reinsert it,
851 	 * into a potentially diference hash line. The hash depends on
852 	 * the new descriptor fields. However access to fe_desc itself
853 	 * is always under the fe_lock. This helps log and stat functions
854 	 * see a self-consistent fe_flow_desc.
855 	 */
856 	mac_flow_remove(ft, flent, B_TRUE);
857 	old_desc = flent->fe_flow_desc;
858 
859 	mutex_enter(&flent->fe_lock);
860 	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
861 	mutex_exit(&flent->fe_lock);
862 
863 	if (mac_flow_add(ft, flent) != 0) {
864 		/*
865 		 * The add failed say due to an invalid flow descriptor.
866 		 * Undo the update
867 		 */
868 		flent->fe_flow_desc = old_desc;
869 		err = mac_flow_add(ft, flent);
870 		ASSERT(err == 0);
871 	}
872 }
873 
874 void
875 mac_flow_set_name(flow_entry_t *flent, const char *name)
876 {
877 	flow_tab_t	*ft = flent->fe_flow_tab;
878 
879 	if (ft == NULL) {
880 		/*
881 		 *  The flow hasn't yet been inserted into the table,
882 		 * so only the caller knows about this flow
883 		 */
884 		(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
885 	} else {
886 		ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
887 	}
888 
889 	mutex_enter(&flent->fe_lock);
890 	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
891 	mutex_exit(&flent->fe_lock);
892 }
893 
894 /*
895  * Return the client-private cookie that was associated with
896  * the flow when it was created.
897  */
898 void *
899 mac_flow_get_client_cookie(flow_entry_t *flent)
900 {
901 	return (flent->fe_client_cookie);
902 }
903 
/*
 * Forward declarations.
 *
 * mac_flow_tab_create() compares a table's fo_hash and fo_accept[0]
 * against the generic L2 routines and swaps in the Ethernet ones when
 * the native media is DL_ETHER.
 */
static uint32_t	flow_l2_hash(flow_tab_t *, flow_state_t *);
static int	flow_l2_accept(flow_tab_t *, flow_state_t *);
static uint32_t	flow_ether_hash(flow_tab_t *, flow_state_t *);
static int	flow_ether_accept(flow_tab_t *, flow_state_t *);
911 
912 /*
913  * Create flow table.
914  */
915 void
916 mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
917     mac_impl_t *mip, flow_tab_t **ftp)
918 {
919 	flow_tab_t	*ft;
920 	flow_ops_t	*new_ops;
921 
922 	ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
923 	bzero(ft, sizeof (*ft));
924 
925 	ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);
926 
927 	/*
928 	 * We make a copy of the ops vector instead of just pointing to it
929 	 * because we might want to customize the ops vector on a per table
930 	 * basis (e.g. for optimization).
931 	 */
932 	new_ops = &ft->ft_ops;
933 	bcopy(ops, new_ops, sizeof (*ops));
934 	ft->ft_mask = mask;
935 	ft->ft_size = size;
936 	ft->ft_mip = mip;
937 
938 	/*
939 	 * Optimization for DL_ETHER media.
940 	 */
941 	if (mip->mi_info.mi_nativemedia == DL_ETHER) {
942 		if (new_ops->fo_hash == flow_l2_hash)
943 			new_ops->fo_hash = flow_ether_hash;
944 
945 		if (new_ops->fo_accept[0] == flow_l2_accept)
946 			new_ops->fo_accept[0] = flow_ether_accept;
947 
948 	}
949 	*ftp = ft;
950 }
951 
952 void
953 mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
954 {
955 	mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
956 	    1024, mip, ftp);
957 }
958 
959 /*
960  * Destroy flow table.
961  */
962 void
963 mac_flow_tab_destroy(flow_tab_t *ft)
964 {
965 	if (ft == NULL)
966 		return;
967 
968 	ASSERT(ft->ft_flow_count == 0);
969 	kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
970 	bzero(ft, sizeof (*ft));
971 	kmem_cache_free(flow_tab_cache, ft);
972 }
973 
974 /*
975  * Add a new flow entry to the global flow hash table
976  */
977 int
978 mac_flow_hash_add(flow_entry_t *flent)
979 {
980 	int	err;
981 
982 	rw_enter(&flow_tab_lock, RW_WRITER);
983 	err = mod_hash_insert(flow_hash,
984 	    (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
985 	if (err != 0) {
986 		rw_exit(&flow_tab_lock);
987 		return (EEXIST);
988 	}
989 	/* Mark as inserted into the global flow hash table */
990 	FLOW_MARK(flent, FE_G_FLOW_HASH);
991 	rw_exit(&flow_tab_lock);
992 	return (err);
993 }
994 
/*
 * Remove a flow entry from the global flow hash table
 *
 * The entry is looked up by its current fe_flow_name and must be
 * present: a failed removal panics through VERIFY. The FE_G_FLOW_HASH
 * mark is cleared while flow_tab_lock is still held as writer.
 */
void
mac_flow_hash_remove(flow_entry_t *flent)
{
	/* Receives the removed value; not otherwise used. */
	mod_hash_val_t	val;

	rw_enter(&flow_tab_lock, RW_WRITER);
	VERIFY(mod_hash_remove(flow_hash,
	    (mod_hash_key_t)flent->fe_flow_name, &val) == 0);

	/* Clear the mark that says inserted into the global flow hash table */
	FLOW_UNMARK(flent, FE_G_FLOW_HASH);
	rw_exit(&flow_tab_lock);
}
1011 
1012 /*
1013  * Retrieve a flow entry from the global flow hash table.
1014  */
1015 int
1016 mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
1017 {
1018 	int		err;
1019 	flow_entry_t	*flent;
1020 
1021 	rw_enter(&flow_tab_lock, RW_READER);
1022 	err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
1023 	    (mod_hash_val_t *)&flent);
1024 	if (err != 0) {
1025 		rw_exit(&flow_tab_lock);
1026 		return (ENOENT);
1027 	}
1028 	ASSERT(flent != NULL);
1029 	FLOW_USER_REFHOLD(flent);
1030 	rw_exit(&flow_tab_lock);
1031 
1032 	*flentp = flent;
1033 	return (0);
1034 }
1035 
1036 /*
1037  * Initialize or release mac client flows by walking the subflow table.
1038  * These are typically invoked during plumb/unplumb of links.
1039  */
1040 
1041 static int
1042 mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
1043 {
1044 	mac_client_impl_t	*mcip = arg;
1045 
1046 	if (mac_link_flow_init(arg, flent) != 0) {
1047 		cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
1048 		    flent->fe_flow_name, mcip->mci_name);
1049 	} else {
1050 		FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
1051 	}
1052 	return (0);
1053 }
1054 
1055 void
1056 mac_link_init_flows(mac_client_handle_t mch)
1057 {
1058 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1059 
1060 	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1061 	    mac_link_init_flows_cb, mcip);
1062 	/*
1063 	 * If mac client had subflow(s) configured before plumb, change
1064 	 * function to mac_rx_srs_subflow_process and in case of hardware
1065 	 * classification, disable polling.
1066 	 */
1067 	mac_client_update_classifier(mcip, B_TRUE);
1068 
1069 }
1070 
1071 boolean_t
1072 mac_link_has_flows(mac_client_handle_t mch)
1073 {
1074 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1075 
1076 	if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
1077 		return (B_TRUE);
1078 
1079 	return (B_FALSE);
1080 }
1081 
/*
 * Walker callback for mac_link_release_flows(): `arg' is the mac
 * client handle. Marks the flow as having no datapath, waits for
 * outstanding driver upcalls on it to finish, then cleans it up.
 * Always returns 0 so that the walk visits every flow.
 */
static int
mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
{
	/* Mark first, then wait, then clean: the order matters. */
	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
	mac_link_flow_clean(arg, flent);
	return (0);
}
1090 
/*
 * Deactivate every subflow of a mac client: the classifier is switched
 * off first, then each flow in the subflow table is released through
 * mac_link_release_flows_cb(). No table lock is taken for the walk.
 */
void
mac_link_release_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	/*
	 * Change the mci_flent callback back to mac_rx_srs_process()
	 * because flows are about to be deactivated.
	 */
	mac_client_update_classifier(mcip, B_FALSE);
	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_release_flows_cb, mcip);
}
1104 
1105 void
1106 mac_rename_flow(flow_entry_t *fep, const char *new_name)
1107 {
1108 	mac_flow_set_name(fep, new_name);
1109 	if (fep->fe_ksp != NULL) {
1110 		flow_stat_destroy(fep);
1111 		flow_stat_create(fep);
1112 	}
1113 }
1114 
1115 /*
1116  * mac_link_flow_init()
1117  * Internal flow interface used for allocating SRSs and related
1118  * data structures. Not meant to be used by mac clients.
1119  */
1120 int
1121 mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
1122 {
1123 	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
1124 	mac_impl_t		*mip = mcip->mci_mip;
1125 	int			err;
1126 
1127 	ASSERT(mch != NULL);
1128 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1129 
1130 	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
1131 		return (err);
1132 
1133 	sub_flow->fe_mcip = mcip;
1134 
1135 	return (0);
1136 }
1137 
1138 /*
1139  * mac_link_flow_add()
1140  * Used by flowadm(1m) or kernel mac clients for creating flows.
1141  */
1142 int
1143 mac_link_flow_add(datalink_id_t linkid, char *flow_name,
1144     flow_desc_t *flow_desc, mac_resource_props_t *mrp)
1145 {
1146 	flow_entry_t		*flent = NULL;
1147 	int			err;
1148 	dls_dl_handle_t		dlh;
1149 	dls_link_t		*dlp;
1150 	boolean_t		link_held = B_FALSE;
1151 	boolean_t		hash_added = B_FALSE;
1152 	mac_perim_handle_t	mph;
1153 
1154 	err = mac_flow_lookup_byname(flow_name, &flent);
1155 	if (err == 0) {
1156 		FLOW_USER_REFRELE(flent);
1157 		return (EEXIST);
1158 	}
1159 
1160 	/*
1161 	 * First create a flow entry given the description provided
1162 	 * by the caller.
1163 	 */
1164 	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
1165 	    FLOW_USER | FLOW_OTHER, &flent);
1166 
1167 	if (err != 0)
1168 		return (err);
1169 
1170 	/*
1171 	 * We've got a local variable referencing this flow now, so we need
1172 	 * to hold it. We'll release this flow before returning.
1173 	 * All failures until we return will undo any action that may internally
1174 	 * held the flow, so the last REFRELE will assure a clean freeing
1175 	 * of resources.
1176 	 */
1177 	FLOW_REFHOLD(flent);
1178 
1179 	flent->fe_link_id = linkid;
1180 	FLOW_MARK(flent, FE_INCIPIENT);
1181 
1182 	err = mac_perim_enter_by_linkid(linkid, &mph);
1183 	if (err != 0) {
1184 		FLOW_FINAL_REFRELE(flent);
1185 		return (err);
1186 	}
1187 
1188 	/*
1189 	 * dls will eventually be merged with mac so it's ok
1190 	 * to call dls' internal functions.
1191 	 */
1192 	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1193 	if (err != 0)
1194 		goto bail;
1195 
1196 	link_held = B_TRUE;
1197 
1198 	/*
1199 	 * Add the flow to the global flow table, this table will be per
1200 	 * exclusive zone so each zone can have its own flow namespace.
1201 	 * RFE 6625651 will fix this.
1202 	 *
1203 	 */
1204 	if ((err = mac_flow_hash_add(flent)) != 0)
1205 		goto bail;
1206 
1207 	hash_added = B_TRUE;
1208 
1209 	/*
1210 	 * do not allow flows to be configured on an anchor VNIC
1211 	 */
1212 	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
1213 		err = ENOTSUP;
1214 		goto bail;
1215 	}
1216 
1217 	/*
1218 	 * Save the zoneid of the underlying link in the flow entry,
1219 	 * this is needed to prevent non-global zone from getting
1220 	 * statistics information of global zone.
1221 	 */
1222 	flent->fe_zoneid = dlp->dl_zid;
1223 
1224 	/*
1225 	 * Add the subflow to the subflow table. Also instantiate the flow
1226 	 * in the mac if there is an active user (we check if the MAC client's
1227 	 * datapath has been setup).
1228 	 */
1229 	err = mac_flow_add_subflow(dlp->dl_mch, flent,
1230 	    MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
1231 	if (err != 0)
1232 		goto bail;
1233 
1234 	FLOW_UNMARK(flent, FE_INCIPIENT);
1235 	dls_devnet_rele_link(dlh, dlp);
1236 	mac_perim_exit(mph);
1237 	return (0);
1238 
1239 bail:
1240 	if (hash_added)
1241 		mac_flow_hash_remove(flent);
1242 
1243 	if (link_held)
1244 		dls_devnet_rele_link(dlh, dlp);
1245 
1246 	/*
1247 	 * Wait for any transient global flow hash refs to clear
1248 	 * and then release the creation reference on the flow
1249 	 */
1250 	mac_flow_wait(flent, FLOW_USER_REF);
1251 	FLOW_FINAL_REFRELE(flent);
1252 	mac_perim_exit(mph);
1253 	return (err);
1254 }
1255 
1256 /*
1257  * mac_link_flow_clean()
1258  * Internal flow interface used for freeing SRSs and related
1259  * data structures. Not meant to be used by mac clients.
1260  */
1261 void
1262 mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
1263 {
1264 	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
1265 	mac_impl_t		*mip = mcip->mci_mip;
1266 	boolean_t		last_subflow;
1267 
1268 	ASSERT(mch != NULL);
1269 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1270 
1271 	/*
1272 	 * This sub flow entry may fail to be fully initialized by
1273 	 * mac_link_flow_init(). If so, simply return.
1274 	 */
1275 	if (sub_flow->fe_mcip == NULL)
1276 		return;
1277 
1278 	last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
1279 	/*
1280 	 * Tear down the data path
1281 	 */
1282 	mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
1283 	sub_flow->fe_mcip = NULL;
1284 
1285 	/*
1286 	 * Delete the SRSs associated with this subflow. If this is being
1287 	 * driven by flowadm(1M) then the subflow will be deleted by
1288 	 * dls_rem_flow. However if this is a result of the interface being
1289 	 * unplumbed then the subflow itself won't be deleted.
1290 	 */
1291 	mac_flow_cleanup(sub_flow);
1292 
1293 	/*
1294 	 * If all the subflows are gone, renable some of the stuff
1295 	 * we disabled when adding a subflow, polling etc.
1296 	 */
1297 	if (last_subflow) {
1298 		/*
1299 		 * The subflow table itself is not protected by any locks or
1300 		 * refcnts. Hence quiesce the client upfront before clearing
1301 		 * mci_subflow_tab.
1302 		 */
1303 		mac_client_quiesce(mcip);
1304 		mac_client_update_classifier(mcip, B_FALSE);
1305 		mac_flow_tab_destroy(mcip->mci_subflow_tab);
1306 		mcip->mci_subflow_tab = NULL;
1307 		mac_client_restart(mcip);
1308 	}
1309 }
1310 
1311 /*
1312  * mac_link_flow_remove()
1313  * Used by flowadm(1m) or kernel mac clients for removing flows.
1314  */
1315 int
1316 mac_link_flow_remove(char *flow_name)
1317 {
1318 	flow_entry_t		*flent;
1319 	mac_perim_handle_t	mph;
1320 	int			err;
1321 	datalink_id_t		linkid;
1322 
1323 	err = mac_flow_lookup_byname(flow_name, &flent);
1324 	if (err != 0)
1325 		return (err);
1326 
1327 	linkid = flent->fe_link_id;
1328 	FLOW_USER_REFRELE(flent);
1329 
1330 	/*
1331 	 * The perim must be acquired before acquiring any other references
1332 	 * to maintain the lock and perimeter hierarchy. Please note the
1333 	 * FLOW_REFRELE above.
1334 	 */
1335 	err = mac_perim_enter_by_linkid(linkid, &mph);
1336 	if (err != 0)
1337 		return (err);
1338 
1339 	/*
1340 	 * Note the second lookup of the flow, because a concurrent thread
1341 	 * may have removed it already while we were waiting to enter the
1342 	 * link's perimeter.
1343 	 */
1344 	err = mac_flow_lookup_byname(flow_name, &flent);
1345 	if (err != 0) {
1346 		mac_perim_exit(mph);
1347 		return (err);
1348 	}
1349 	FLOW_USER_REFRELE(flent);
1350 
1351 	/*
1352 	 * Remove the flow from the subflow table and deactivate the flow
1353 	 * by quiescing and removings its SRSs
1354 	 */
1355 	mac_flow_rem_subflow(flent);
1356 
1357 	/*
1358 	 * Finally, remove the flow from the global table.
1359 	 */
1360 	mac_flow_hash_remove(flent);
1361 
1362 	/*
1363 	 * Wait for any transient global flow hash refs to clear
1364 	 * and then release the creation reference on the flow
1365 	 */
1366 	mac_flow_wait(flent, FLOW_USER_REF);
1367 	FLOW_FINAL_REFRELE(flent);
1368 
1369 	mac_perim_exit(mph);
1370 
1371 	return (0);
1372 }
1373 
1374 /*
1375  * mac_link_flow_modify()
1376  * Modifies the properties of a flow identified by its name.
1377  */
1378 int
1379 mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
1380 {
1381 	flow_entry_t		*flent;
1382 	mac_client_impl_t 	*mcip;
1383 	int			err = 0;
1384 	mac_perim_handle_t	mph;
1385 	datalink_id_t		linkid;
1386 	flow_tab_t		*flow_tab;
1387 
1388 	err = mac_validate_props(mrp);
1389 	if (err != 0)
1390 		return (err);
1391 
1392 	err = mac_flow_lookup_byname(flow_name, &flent);
1393 	if (err != 0)
1394 		return (err);
1395 
1396 	linkid = flent->fe_link_id;
1397 	FLOW_USER_REFRELE(flent);
1398 
1399 	/*
1400 	 * The perim must be acquired before acquiring any other references
1401 	 * to maintain the lock and perimeter hierarchy. Please note the
1402 	 * FLOW_REFRELE above.
1403 	 */
1404 	err = mac_perim_enter_by_linkid(linkid, &mph);
1405 	if (err != 0)
1406 		return (err);
1407 
1408 	/*
1409 	 * Note the second lookup of the flow, because a concurrent thread
1410 	 * may have removed it already while we were waiting to enter the
1411 	 * link's perimeter.
1412 	 */
1413 	err = mac_flow_lookup_byname(flow_name, &flent);
1414 	if (err != 0) {
1415 		mac_perim_exit(mph);
1416 		return (err);
1417 	}
1418 	FLOW_USER_REFRELE(flent);
1419 
1420 	/*
1421 	 * If this flow is attached to a MAC client, then pass the request
1422 	 * along to the client.
1423 	 * Otherwise, just update the cached values.
1424 	 */
1425 	mcip = flent->fe_mcip;
1426 	mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
1427 	if (mcip != NULL) {
1428 		if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
1429 			err = ENOENT;
1430 		} else {
1431 			mac_flow_modify(flow_tab, flent, mrp);
1432 		}
1433 	} else {
1434 		(void) mac_flow_modify_props(flent, mrp);
1435 	}
1436 
1437 done:
1438 	mac_perim_exit(mph);
1439 	return (err);
1440 }
1441 
1442 
1443 /*
1444  * State structure and misc functions used by mac_link_flow_walk().
1445  */
1446 typedef struct {
1447 	int	(*ws_func)(mac_flowinfo_t *, void *);
1448 	void	*ws_arg;
1449 } flow_walk_state_t;
1450 
1451 static void
1452 mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent)
1453 {
1454 	(void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name,
1455 	    MAXFLOWNAMELEN);
1456 	finfop->fi_link_id = flent->fe_link_id;
1457 	finfop->fi_flow_desc = flent->fe_flow_desc;
1458 	finfop->fi_resource_props = flent->fe_resource_props;
1459 }
1460 
1461 static int
1462 mac_link_flow_walk_cb(flow_entry_t *flent, void *arg)
1463 {
1464 	flow_walk_state_t	*statep = arg;
1465 	mac_flowinfo_t		finfo;
1466 
1467 	mac_link_flowinfo_copy(&finfo, flent);
1468 	return (statep->ws_func(&finfo, statep->ws_arg));
1469 }
1470 
1471 /*
1472  * mac_link_flow_walk()
1473  * Invokes callback 'func' for all flows belonging to the specified link.
1474  */
1475 int
1476 mac_link_flow_walk(datalink_id_t linkid,
1477     int (*func)(mac_flowinfo_t *, void *), void *arg)
1478 {
1479 	mac_client_impl_t	*mcip;
1480 	mac_perim_handle_t	mph;
1481 	flow_walk_state_t	state;
1482 	dls_dl_handle_t		dlh;
1483 	dls_link_t		*dlp;
1484 	int			err;
1485 
1486 	err = mac_perim_enter_by_linkid(linkid, &mph);
1487 	if (err != 0)
1488 		return (err);
1489 
1490 	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1491 	if (err != 0) {
1492 		mac_perim_exit(mph);
1493 		return (err);
1494 	}
1495 
1496 	mcip = (mac_client_impl_t *)dlp->dl_mch;
1497 	state.ws_func = func;
1498 	state.ws_arg = arg;
1499 
1500 	err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
1501 	    mac_link_flow_walk_cb, &state);
1502 
1503 	dls_devnet_rele_link(dlh, dlp);
1504 	mac_perim_exit(mph);
1505 	return (err);
1506 }
1507 
1508 /*
1509  * mac_link_flow_info()
1510  * Retrieves information about a specific flow.
1511  */
1512 int
1513 mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
1514 {
1515 	flow_entry_t	*flent;
1516 	int		err;
1517 
1518 	err = mac_flow_lookup_byname(flow_name, &flent);
1519 	if (err != 0)
1520 		return (err);
1521 
1522 	mac_link_flowinfo_copy(finfo, flent);
1523 	FLOW_USER_REFRELE(flent);
1524 	return (0);
1525 }
1526 
/*
 * Hashes a MAC address and a VLAN id into a bucket index: the address
 * bytes at offsets 3, 4 and 5 are summed, XORed with the VLAN id and
 * reduced modulo the table size.
 */
#define	HASH_MAC_VID(addr, vid, size) \
	((((uint32_t)(addr)[3] + (addr)[4] + (addr)[5]) ^ (vid)) % (size))
1529 
/* True when the write pointer of the state's current mblk is below 'endp'. */
#define	PKT_TOO_SMALL(state, endp) ((state)->fs_mp->b_wptr < (endp))
1531 
/*
 * If 'start' sits exactly at the write pointer of the state's current
 * mblk, step to the continuation mblk: (s)->fs_mp becomes b_cont and
 * 'start' becomes that mblk's b_rptr. When there is no continuation
 * mblk, the *enclosing function* returns EINVAL.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a
 * single statement: "CHECK_AND_ADJUST_START_PTR(s, p);" can then be used
 * safely as the body of an unbraced if/else.
 */
#define	CHECK_AND_ADJUST_START_PTR(s, start) do {	\
	if ((s)->fs_mp->b_wptr == (start)) {		\
		mblk_t	*next = (s)->fs_mp->b_cont;	\
		if (next == NULL)			\
			return (EINVAL);		\
							\
		(s)->fs_mp = next;			\
		(start) = next->b_rptr;			\
	}						\
} while (0)
1542 
1543 /* ARGSUSED */
1544 static boolean_t
1545 flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1546 {
1547 	flow_l2info_t		*l2 = &s->fs_l2info;
1548 	flow_desc_t		*fd = &flent->fe_flow_desc;
1549 
1550 	return (l2->l2_vid == fd->fd_vid &&
1551 	    bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
1552 }
1553 
1554 /*
1555  * Layer 2 hash function.
1556  * Must be paired with flow_l2_accept() within a set of flow_ops
1557  * because it assumes the dest address is already extracted.
1558  */
1559 static uint32_t
1560 flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
1561 {
1562 	flow_l2info_t		*l2 = &s->fs_l2info;
1563 
1564 	return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
1565 }
1566 
1567 /*
1568  * This is the generic layer 2 accept function.
1569  * It makes use of mac_header_info() to extract the header length,
1570  * sap, vlan ID and destination address.
1571  */
1572 static int
1573 flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
1574 {
1575 	boolean_t		is_ether;
1576 	flow_l2info_t		*l2 = &s->fs_l2info;
1577 	mac_header_info_t	mhi;
1578 	int			err;
1579 
1580 	is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
1581 	if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
1582 	    s->fs_mp, &mhi)) != 0) {
1583 		if (err == EINVAL)
1584 			err = ENOBUFS;
1585 
1586 		return (err);
1587 	}
1588 
1589 	l2->l2_start = s->fs_mp->b_rptr;
1590 	l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;
1591 
1592 	if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
1593 	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1594 		struct ether_vlan_header	*evhp =
1595 		    (struct ether_vlan_header *)l2->l2_start;
1596 
1597 		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1598 			return (ENOBUFS);
1599 
1600 		l2->l2_sap = ntohs(evhp->ether_type);
1601 		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1602 		l2->l2_hdrsize = sizeof (*evhp);
1603 	} else {
1604 		l2->l2_sap = mhi.mhi_bindsap;
1605 		l2->l2_vid = 0;
1606 		l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
1607 	}
1608 	return (0);
1609 }
1610 
1611 /*
1612  * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
1613  * accept(). The notable difference is that dest address is now extracted
1614  * by hash() rather than by accept(). This saves a few memory references
1615  * for flow tables that do not care about mac addresses.
1616  */
1617 static uint32_t
1618 flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
1619 {
1620 	flow_l2info_t			*l2 = &s->fs_l2info;
1621 	struct ether_vlan_header	*evhp;
1622 
1623 	evhp = (struct ether_vlan_header *)l2->l2_start;
1624 	l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
1625 	return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
1626 }
1627 
1628 /* ARGSUSED */
1629 static int
1630 flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
1631 {
1632 	flow_l2info_t			*l2 = &s->fs_l2info;
1633 	struct ether_vlan_header	*evhp;
1634 	uint16_t			sap;
1635 
1636 	evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
1637 	l2->l2_start = (uchar_t *)evhp;
1638 
1639 	if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
1640 		return (ENOBUFS);
1641 
1642 	if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
1643 	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1644 		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1645 			return (ENOBUFS);
1646 
1647 		l2->l2_sap = ntohs(evhp->ether_type);
1648 		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1649 		l2->l2_hdrsize = sizeof (struct ether_vlan_header);
1650 	} else {
1651 		l2->l2_sap = sap;
1652 		l2->l2_vid = 0;
1653 		l2->l2_hdrsize = sizeof (struct ether_header);
1654 	}
1655 	return (0);
1656 }
1657 
1658 /*
1659  * Validates a layer 2 flow entry.
1660  */
1661 static int
1662 flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1663 {
1664 	int		i;
1665 	flow_desc_t	*fd = &flent->fe_flow_desc;
1666 
1667 	/*
1668 	 * Dest address is mandatory.
1669 	 */
1670 	if ((fd->fd_mask & FLOW_LINK_DST) == 0)
1671 		return (EINVAL);
1672 
1673 	for (i = 0; i < fd->fd_mac_len; i++) {
1674 		if (fd->fd_dst_mac[i] != 0)
1675 			break;
1676 	}
1677 	if (i == fd->fd_mac_len || fd->fd_mac_len < ETHERADDRL)
1678 		return (EINVAL);
1679 
1680 	if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
1681 		/*
1682 		 * VLAN flows are only supported over ethernet macs.
1683 		 */
1684 		if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
1685 			return (EINVAL);
1686 
1687 		if (fd->fd_vid == 0)
1688 			return (EINVAL);
1689 
1690 	}
1691 	flent->fe_match = flow_l2_match;
1692 	return (0);
1693 }
1694 
1695 /*
1696  * Calculates hash index of flow entry.
1697  */
1698 static uint32_t
1699 flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1700 {
1701 	flow_desc_t	*fd = &flent->fe_flow_desc;
1702 
1703 	ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
1704 	return (HASH_MAC_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
1705 }
1706 
1707 /*
1708  * This is used for duplicate flow checking.
1709  */
1710 /* ARGSUSED */
1711 static boolean_t
1712 flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
1713 {
1714 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
1715 
1716 	ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
1717 	return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
1718 	    fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
1719 }
1720 
1721 /*
1722  * Generic flow entry insertion function.
1723  * Used by flow tables that do not have ordering requirements.
1724  */
1725 /* ARGSUSED */
1726 static int
1727 flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
1728     flow_entry_t *flent)
1729 {
1730 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
1731 
1732 	if (*headp != NULL) {
1733 		ASSERT(flent->fe_next == NULL);
1734 		flent->fe_next = *headp;
1735 	}
1736 	*headp = flent;
1737 	return (0);
1738 }
1739 
1740 /*
1741  * IP version independent DSField matching function.
1742  */
1743 /* ARGSUSED */
1744 static boolean_t
1745 flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1746 {
1747 	flow_l3info_t	*l3info = &s->fs_l3info;
1748 	flow_desc_t	*fd = &flent->fe_flow_desc;
1749 
1750 	switch (l3info->l3_version) {
1751 	case IPV4_VERSION: {
1752 		ipha_t		*ipha = (ipha_t *)l3info->l3_start;
1753 
1754 		return ((ipha->ipha_type_of_service &
1755 		    fd->fd_dsfield_mask) == fd->fd_dsfield);
1756 	}
1757 	case IPV6_VERSION: {
1758 		ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
1759 
1760 		return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
1761 		    fd->fd_dsfield_mask) == fd->fd_dsfield);
1762 	}
1763 	default:
1764 		return (B_FALSE);
1765 	}
1766 }
1767 
1768 /*
1769  * IP v4 and v6 address matching.
1770  * The netmask only needs to be applied on the packet but not on the
1771  * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets.
1772  */
1773 
1774 /* ARGSUSED */
1775 static boolean_t
1776 flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1777 {
1778 	flow_l3info_t	*l3info = &s->fs_l3info;
1779 	flow_desc_t	*fd = &flent->fe_flow_desc;
1780 	ipha_t		*ipha = (ipha_t *)l3info->l3_start;
1781 	in_addr_t	addr;
1782 
1783 	addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
1784 	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1785 		return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
1786 		    V4_PART_OF_V6(fd->fd_local_addr));
1787 	}
1788 	return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
1789 	    V4_PART_OF_V6(fd->fd_remote_addr));
1790 }
1791 
1792 /* ARGSUSED */
1793 static boolean_t
1794 flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1795 {
1796 	flow_l3info_t	*l3info = &s->fs_l3info;
1797 	flow_desc_t	*fd = &flent->fe_flow_desc;
1798 	ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
1799 	in6_addr_t	*addrp;
1800 
1801 	addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
1802 	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1803 		return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
1804 		    fd->fd_local_addr));
1805 	}
1806 	return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
1807 }
1808 
1809 /* ARGSUSED */
1810 static boolean_t
1811 flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1812 {
1813 	flow_l3info_t	*l3info = &s->fs_l3info;
1814 	flow_desc_t	*fd = &flent->fe_flow_desc;
1815 
1816 	return (l3info->l3_protocol == fd->fd_protocol);
1817 }
1818 
1819 static uint32_t
1820 flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
1821 {
1822 	flow_l3info_t	*l3info = &s->fs_l3info;
1823 	flow_mask_t	mask = ft->ft_mask;
1824 
1825 	if ((mask & FLOW_IP_LOCAL) != 0) {
1826 		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
1827 	} else if ((mask & FLOW_IP_REMOTE) != 0) {
1828 		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
1829 	} else if ((mask & FLOW_IP_DSFIELD) != 0) {
1830 		/*
1831 		 * DSField flents are arranged as a single list.
1832 		 */
1833 		return (0);
1834 	}
1835 	/*
1836 	 * IP addr flents are hashed into two lists, v4 or v6.
1837 	 */
1838 	ASSERT(ft->ft_size >= 2);
1839 	return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
1840 }
1841 
1842 static uint32_t
1843 flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
1844 {
1845 	flow_l3info_t	*l3info = &s->fs_l3info;
1846 
1847 	return (l3info->l3_protocol % ft->ft_size);
1848 }
1849 
1850 /* ARGSUSED */
1851 static int
1852 flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
1853 {
1854 	flow_l2info_t	*l2info = &s->fs_l2info;
1855 	flow_l3info_t	*l3info = &s->fs_l3info;
1856 	uint16_t	sap = l2info->l2_sap;
1857 	uchar_t		*l3_start;
1858 
1859 	l3_start = l2info->l2_start + l2info->l2_hdrsize;
1860 
1861 	/*
1862 	 * Adjust start pointer if we're at the end of an mblk.
1863 	 */
1864 	CHECK_AND_ADJUST_START_PTR(s, l3_start);
1865 
1866 	l3info->l3_start = l3_start;
1867 	if (!OK_32PTR(l3_start))
1868 		return (EINVAL);
1869 
1870 	switch (sap) {
1871 	case ETHERTYPE_IP: {
1872 		ipha_t	*ipha = (ipha_t *)l3_start;
1873 
1874 		if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
1875 			return (ENOBUFS);
1876 
1877 		l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
1878 		l3info->l3_protocol = ipha->ipha_protocol;
1879 		l3info->l3_version = IPV4_VERSION;
1880 		l3info->l3_fragmented =
1881 		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
1882 		break;
1883 	}
1884 	case ETHERTYPE_IPV6: {
1885 		ip6_t   *ip6h = (ip6_t *)l3_start;
1886 		uint16_t ip6_hdrlen;
1887 		uint8_t	 nexthdr;
1888 
1889 		if (!mac_ip_hdr_length_v6(s->fs_mp, ip6h, &ip6_hdrlen,
1890 		    &nexthdr)) {
1891 			return (ENOBUFS);
1892 		}
1893 		l3info->l3_hdrsize = ip6_hdrlen;
1894 		l3info->l3_protocol = nexthdr;
1895 		l3info->l3_version = IPV6_VERSION;
1896 		l3info->l3_fragmented = B_FALSE;
1897 		break;
1898 	}
1899 	default:
1900 		return (EINVAL);
1901 	}
1902 	return (0);
1903 }
1904 
1905 /* ARGSUSED */
1906 static int
1907 flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1908 {
1909 	flow_desc_t	*fd = &flent->fe_flow_desc;
1910 
1911 	switch (fd->fd_protocol) {
1912 	case IPPROTO_TCP:
1913 	case IPPROTO_UDP:
1914 	case IPPROTO_SCTP:
1915 	case IPPROTO_ICMP:
1916 	case IPPROTO_ICMPV6:
1917 		flent->fe_match = flow_ip_proto_match;
1918 		return (0);
1919 	default:
1920 		return (EINVAL);
1921 	}
1922 }
1923 
1924 /* ARGSUSED */
1925 static int
1926 flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1927 {
1928 	flow_desc_t	*fd = &flent->fe_flow_desc;
1929 	flow_mask_t	mask;
1930 	uint8_t		version;
1931 	in6_addr_t	*addr, *netmask;
1932 
1933 	/*
1934 	 * DSField does not require a IP version.
1935 	 */
1936 	if (fd->fd_mask == FLOW_IP_DSFIELD) {
1937 		if (fd->fd_dsfield_mask == 0)
1938 			return (EINVAL);
1939 
1940 		flent->fe_match = flow_ip_dsfield_match;
1941 		return (0);
1942 	}
1943 
1944 	/*
1945 	 * IP addresses must come with a version to avoid ambiguity.
1946 	 */
1947 	if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
1948 		return (EINVAL);
1949 
1950 	version = fd->fd_ipversion;
1951 	if (version != IPV4_VERSION && version != IPV6_VERSION)
1952 		return (EINVAL);
1953 
1954 	mask = fd->fd_mask & ~FLOW_IP_VERSION;
1955 	switch (mask) {
1956 	case FLOW_IP_LOCAL:
1957 		addr = &fd->fd_local_addr;
1958 		netmask = &fd->fd_local_netmask;
1959 		break;
1960 	case FLOW_IP_REMOTE:
1961 		addr = &fd->fd_remote_addr;
1962 		netmask = &fd->fd_remote_netmask;
1963 		break;
1964 	default:
1965 		return (EINVAL);
1966 	}
1967 
1968 	/*
1969 	 * Apply netmask onto specified address.
1970 	 */
1971 	V6_MASK_COPY(*addr, *netmask, *addr);
1972 	if (version == IPV4_VERSION) {
1973 		ipaddr_t	v4addr = V4_PART_OF_V6((*addr));
1974 		ipaddr_t	v4mask = V4_PART_OF_V6((*netmask));
1975 
1976 		if (v4addr == 0 || v4mask == 0)
1977 			return (EINVAL);
1978 		flent->fe_match = flow_ip_v4_match;
1979 	} else {
1980 		if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
1981 		    IN6_IS_ADDR_UNSPECIFIED(netmask))
1982 			return (EINVAL);
1983 		flent->fe_match = flow_ip_v6_match;
1984 	}
1985 	return (0);
1986 }
1987 
1988 static uint32_t
1989 flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1990 {
1991 	flow_desc_t	*fd = &flent->fe_flow_desc;
1992 
1993 	return (fd->fd_protocol % ft->ft_size);
1994 }
1995 
1996 static uint32_t
1997 flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1998 {
1999 	flow_desc_t	*fd = &flent->fe_flow_desc;
2000 
2001 	/*
2002 	 * DSField flents are arranged as a single list.
2003 	 */
2004 	if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2005 		return (0);
2006 
2007 	/*
2008 	 * IP addr flents are hashed into two lists, v4 or v6.
2009 	 */
2010 	ASSERT(ft->ft_size >= 2);
2011 	return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
2012 }
2013 
2014 /* ARGSUSED */
2015 static boolean_t
2016 flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2017 {
2018 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2019 
2020 	return (fd1->fd_protocol == fd2->fd_protocol);
2021 }
2022 
2023 /* ARGSUSED */
2024 static boolean_t
2025 flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2026 {
2027 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2028 	in6_addr_t	*a1, *m1, *a2, *m2;
2029 
2030 	ASSERT(fd1->fd_mask == fd2->fd_mask);
2031 	if (fd1->fd_mask == FLOW_IP_DSFIELD) {
2032 		return (fd1->fd_dsfield == fd2->fd_dsfield &&
2033 		    fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
2034 	}
2035 
2036 	/*
2037 	 * flow_ip_accept_fe() already validated the version.
2038 	 */
2039 	ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
2040 	if (fd1->fd_ipversion != fd2->fd_ipversion)
2041 		return (B_FALSE);
2042 
2043 	switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
2044 	case FLOW_IP_LOCAL:
2045 		a1 = &fd1->fd_local_addr;
2046 		m1 = &fd1->fd_local_netmask;
2047 		a2 = &fd2->fd_local_addr;
2048 		m2 = &fd2->fd_local_netmask;
2049 		break;
2050 	case FLOW_IP_REMOTE:
2051 		a1 = &fd1->fd_remote_addr;
2052 		m1 = &fd1->fd_remote_netmask;
2053 		a2 = &fd2->fd_remote_addr;
2054 		m2 = &fd2->fd_remote_netmask;
2055 		break;
2056 	default:
2057 		/*
2058 		 * This is unreachable given the checks in
2059 		 * flow_ip_accept_fe().
2060 		 */
2061 		return (B_FALSE);
2062 	}
2063 
2064 	if (fd1->fd_ipversion == IPV4_VERSION) {
2065 		return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
2066 		    V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));
2067 
2068 	} else {
2069 		return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
2070 		    IN6_ARE_ADDR_EQUAL(m1, m2));
2071 	}
2072 }
2073 
2074 static int
2075 flow_ip_mask2plen(in6_addr_t *v6mask)
2076 {
2077 	int		bits;
2078 	int		plen = IPV6_ABITS;
2079 	int		i;
2080 
2081 	for (i = 3; i >= 0; i--) {
2082 		if (v6mask->s6_addr32[i] == 0) {
2083 			plen -= 32;
2084 			continue;
2085 		}
2086 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
2087 		if (bits == 0)
2088 			break;
2089 		plen -= bits;
2090 	}
2091 	return (plen);
2092 }
2093 
2094 /* ARGSUSED */
2095 static int
2096 flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
2097     flow_entry_t *flent)
2098 {
2099 	flow_entry_t	**p = headp;
2100 	flow_desc_t	*fd0, *fd;
2101 	in6_addr_t	*m0, *m;
2102 	int		plen0, plen;
2103 
2104 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
2105 
2106 	/*
2107 	 * No special ordering needed for dsfield.
2108 	 */
2109 	fd0 = &flent->fe_flow_desc;
2110 	if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
2111 		if (*p != NULL) {
2112 			ASSERT(flent->fe_next == NULL);
2113 			flent->fe_next = *p;
2114 		}
2115 		*p = flent;
2116 		return (0);
2117 	}
2118 
2119 	/*
2120 	 * IP address flows are arranged in descending prefix length order.
2121 	 */
2122 	m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
2123 	    &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
2124 	plen0 = flow_ip_mask2plen(m0);
2125 	ASSERT(plen0 != 0);
2126 
2127 	for (; *p != NULL; p = &(*p)->fe_next) {
2128 		fd = &(*p)->fe_flow_desc;
2129 
2130 		/*
2131 		 * Normally a dsfield flent shouldn't end up on the same
2132 		 * list as an IP address because flow tables are (for now)
2133 		 * disjoint. If we decide to support both IP and dsfield
2134 		 * in the same table in the future, this check will allow
2135 		 * for that.
2136 		 */
2137 		if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2138 			continue;
2139 
2140 		/*
2141 		 * We also allow for the mixing of local and remote address
2142 		 * flents within one list.
2143 		 */
2144 		m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
2145 		    &fd->fd_local_netmask : &fd->fd_remote_netmask;
2146 		plen = flow_ip_mask2plen(m);
2147 
2148 		if (plen <= plen0)
2149 			break;
2150 	}
2151 	if (*p != NULL) {
2152 		ASSERT(flent->fe_next == NULL);
2153 		flent->fe_next = *p;
2154 	}
2155 	*p = flent;
2156 	return (0);
2157 }
2158 
2159 /*
2160  * Transport layer protocol and port matching functions.
2161  */
2162 
2163 /* ARGSUSED */
2164 static boolean_t
2165 flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2166 {
2167 	flow_l3info_t	*l3info = &s->fs_l3info;
2168 	flow_l4info_t	*l4info = &s->fs_l4info;
2169 	flow_desc_t	*fd = &flent->fe_flow_desc;
2170 
2171 	return (fd->fd_protocol == l3info->l3_protocol &&
2172 	    fd->fd_local_port == l4info->l4_hash_port);
2173 }
2174 
2175 /* ARGSUSED */
2176 static boolean_t
2177 flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2178 {
2179 	flow_l3info_t	*l3info = &s->fs_l3info;
2180 	flow_l4info_t	*l4info = &s->fs_l4info;
2181 	flow_desc_t	*fd = &flent->fe_flow_desc;
2182 
2183 	return (fd->fd_protocol == l3info->l3_protocol &&
2184 	    fd->fd_remote_port == l4info->l4_hash_port);
2185 }
2186 
2187 /*
2188  * Transport hash function.
2189  * Since we only support either local or remote port flows,
2190  * we only need to extract one of the ports to be used for
2191  * matching.
2192  */
2193 static uint32_t
2194 flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
2195 {
2196 	flow_l3info_t	*l3info = &s->fs_l3info;
2197 	flow_l4info_t	*l4info = &s->fs_l4info;
2198 	uint8_t		proto = l3info->l3_protocol;
2199 	boolean_t	dst_or_src;
2200 
2201 	if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
2202 		dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
2203 	} else {
2204 		dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
2205 	}
2206 
2207 	l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
2208 	    l4info->l4_src_port;
2209 
2210 	return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
2211 }
2212 
2213 /*
2214  * Unlike other accept() functions above, we do not need to get the header
2215  * size because this is our highest layer so far. If we want to do support
2216  * other higher layer protocols, we would need to save the l4_hdrsize
2217  * in the code below.
2218  */
2219 
2220 /* ARGSUSED */
2221 static int
2222 flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
2223 {
2224 	flow_l3info_t	*l3info = &s->fs_l3info;
2225 	flow_l4info_t	*l4info = &s->fs_l4info;
2226 	uint8_t		proto = l3info->l3_protocol;
2227 	uchar_t		*l4_start;
2228 
2229 	l4_start = l3info->l3_start + l3info->l3_hdrsize;
2230 
2231 	/*
2232 	 * Adjust start pointer if we're at the end of an mblk.
2233 	 */
2234 	CHECK_AND_ADJUST_START_PTR(s, l4_start);
2235 
2236 	l4info->l4_start = l4_start;
2237 	if (!OK_32PTR(l4_start))
2238 		return (EINVAL);
2239 
2240 	if (l3info->l3_fragmented == B_TRUE)
2241 		return (EINVAL);
2242 
2243 	switch (proto) {
2244 	case IPPROTO_TCP: {
2245 		struct tcphdr	*tcph = (struct tcphdr *)l4_start;
2246 
2247 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
2248 			return (ENOBUFS);
2249 
2250 		l4info->l4_src_port = tcph->th_sport;
2251 		l4info->l4_dst_port = tcph->th_dport;
2252 		break;
2253 	}
2254 	case IPPROTO_UDP: {
2255 		struct udphdr	*udph = (struct udphdr *)l4_start;
2256 
2257 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
2258 			return (ENOBUFS);
2259 
2260 		l4info->l4_src_port = udph->uh_sport;
2261 		l4info->l4_dst_port = udph->uh_dport;
2262 		break;
2263 	}
2264 	case IPPROTO_SCTP: {
2265 		sctp_hdr_t	*sctph = (sctp_hdr_t *)l4_start;
2266 
2267 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
2268 			return (ENOBUFS);
2269 
2270 		l4info->l4_src_port = sctph->sh_sport;
2271 		l4info->l4_dst_port = sctph->sh_dport;
2272 		break;
2273 	}
2274 	default:
2275 		return (EINVAL);
2276 	}
2277 
2278 	return (0);
2279 }
2280 
2281 /*
2282  * Validates transport flow entry.
2283  * The protocol field must be present.
2284  */
2285 
2286 /* ARGSUSED */
2287 static int
2288 flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2289 {
2290 	flow_desc_t	*fd = &flent->fe_flow_desc;
2291 	flow_mask_t	mask = fd->fd_mask;
2292 
2293 	if ((mask & FLOW_IP_PROTOCOL) == 0)
2294 		return (EINVAL);
2295 
2296 	switch (fd->fd_protocol) {
2297 	case IPPROTO_TCP:
2298 	case IPPROTO_UDP:
2299 	case IPPROTO_SCTP:
2300 		break;
2301 	default:
2302 		return (EINVAL);
2303 	}
2304 
2305 	switch (mask & ~FLOW_IP_PROTOCOL) {
2306 	case FLOW_ULP_PORT_LOCAL:
2307 		if (fd->fd_local_port == 0)
2308 			return (EINVAL);
2309 
2310 		flent->fe_match = flow_transport_lport_match;
2311 		break;
2312 	case FLOW_ULP_PORT_REMOTE:
2313 		if (fd->fd_remote_port == 0)
2314 			return (EINVAL);
2315 
2316 		flent->fe_match = flow_transport_rport_match;
2317 		break;
2318 	case 0:
2319 		/*
2320 		 * transport-only flows conflicts with our table type.
2321 		 */
2322 		return (EOPNOTSUPP);
2323 	default:
2324 		return (EINVAL);
2325 	}
2326 
2327 	return (0);
2328 }
2329 
2330 static uint32_t
2331 flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2332 {
2333 	flow_desc_t	*fd = &flent->fe_flow_desc;
2334 	uint16_t	port = 0;
2335 
2336 	port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
2337 	    fd->fd_local_port : fd->fd_remote_port;
2338 
2339 	return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
2340 }
2341 
2342 /* ARGSUSED */
2343 static boolean_t
2344 flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2345 {
2346 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2347 
2348 	if (fd1->fd_protocol != fd2->fd_protocol)
2349 		return (B_FALSE);
2350 
2351 	if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
2352 		return (fd1->fd_local_port == fd2->fd_local_port);
2353 
2354 	return (fd1->fd_remote_port == fd2->fd_remote_port);
2355 }
2356 
/*
 * Operations vector built from the layer 2 (flow_l2_*) routines.
 * The initializer is positional: the accept, hash, match and insert
 * routines for flow entries (the *_fe functions), then the hash
 * routine without the _fe suffix, then the list of accept routines,
 * which here holds only flow_l2_accept.
 * NOTE(review): the list is presumably run in order on each packet;
 * confirm against the flow_ops_t definition and its users.
 */
static flow_ops_t flow_l2_ops = {
	flow_l2_accept_fe,
	flow_l2_hash_fe,
	flow_l2_match_fe,
	flow_generic_insert_fe,
	flow_l2_hash,
	{flow_l2_accept}
};
2365 
/*
 * Operations vector built from the IP (flow_ip_*) routines. This is
 * the only vector here with its own insert routine, flow_ip_insert_fe,
 * in place of flow_generic_insert_fe. Its accept list names the
 * layer 2 routine followed by the IP one.
 */
static flow_ops_t flow_ip_ops = {
	flow_ip_accept_fe,
	flow_ip_hash_fe,
	flow_ip_match_fe,
	flow_ip_insert_fe,
	flow_ip_hash,
	{flow_l2_accept, flow_ip_accept}
};
2374 
/*
 * Operations vector built from the IP protocol (flow_ip_proto_*)
 * routines. Entries are inserted with the generic insert routine, and
 * the accept list names the layer 2 and IP accept routines.
 */
static flow_ops_t flow_ip_proto_ops = {
	flow_ip_proto_accept_fe,
	flow_ip_proto_hash_fe,
	flow_ip_proto_match_fe,
	flow_generic_insert_fe,
	flow_ip_proto_hash,
	{flow_l2_accept, flow_ip_accept}
};
2383 
/*
 * Operations vector built from the transport (flow_transport_*)
 * routines. Entries are inserted with the generic insert routine, and
 * the accept list names the layer 2, IP and transport accept routines,
 * in that order.
 */
static flow_ops_t flow_transport_ops = {
	flow_transport_accept_fe,
	flow_transport_hash_fe,
	flow_transport_match_fe,
	flow_generic_insert_fe,
	flow_transport_hash,
	{flow_l2_accept, flow_ip_accept, flow_transport_accept}
};
2392 
/*
 * List of the flow table types known to this file. Each element names
 * an operations vector and the flow mask (fti_mask) that
 * mac_flow_tab_info_get() compares against when looking up a type.
 * NOTE(review): the third value is presumably the size of the hash
 * table created for that type; confirm against flow_tab_info_t.
 */
static flow_tab_info_t flow_tab_info_list[] = {
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
	{&flow_ip_ops, FLOW_IP_DSFIELD, 1},
	{&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024}
};

/* Number of elements in flow_tab_info_list. */
#define	FLOW_MAX_TAB_INFO \
	((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t))
2403 
2404 static flow_tab_info_t *
2405 mac_flow_tab_info_get(flow_mask_t mask)
2406 {
2407 	int	i;
2408 
2409 	for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
2410 		if (mask == flow_tab_info_list[i].fti_mask)
2411 			return (&flow_tab_info_list[i]);
2412 	}
2413 	return (NULL);
2414 }
2415