xref: /titanic_41/usr/src/uts/common/io/mac/mac_flow.c (revision c8f74a56af6974058d11efe681daeb7f4cdb78d1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/strsun.h>
28 #include <sys/sdt.h>
29 #include <sys/mac.h>
30 #include <sys/mac_impl.h>
31 #include <sys/mac_client_impl.h>
32 #include <sys/mac_stat.h>
33 #include <sys/dls.h>
34 #include <sys/dls_impl.h>
35 #include <sys/mac_soft_ring.h>
36 #include <sys/ethernet.h>
37 #include <sys/cpupart.h>
38 #include <sys/pool.h>
39 #include <sys/pool_pset.h>
40 #include <sys/vlan.h>
41 #include <inet/ip.h>
42 #include <inet/ip6.h>
43 #include <netinet/tcp.h>
44 #include <netinet/udp.h>
45 #include <netinet/sctp.h>
46 
/*
 * Per-flow traffic totals. flow_stat_update() accumulates one of these on
 * its stack and then exports each field by offset through flow_stats_list[].
 */
typedef struct flow_stats_s {
	uint64_t	fs_obytes;	/* bytes transmitted */
	uint64_t	fs_opackets;	/* packets transmitted */
	uint64_t	fs_oerrors;	/* transmit errors */
	uint64_t	fs_ibytes;	/* bytes received */
	uint64_t	fs_ipackets;	/* packets received */
	uint64_t	fs_ierrors;	/* receive errors */
} flow_stats_t;
55 
56 
/* global flow table, will be a per exclusive-zone table later */
static mod_hash_t	*flow_hash;	/* flow entries keyed by flow name */
static krwlock_t	flow_tab_lock;	/* held around flow_hash accesses */

static kmem_cache_t	*flow_cache;	/* backs flow_entry_t allocations */
static kmem_cache_t	*flow_tab_cache; /* backs flow_tab_t allocations */
static flow_ops_t	flow_l2_ops;	/* ops used by mac_flow_l2tab_create */
64 
/*
 * Describes one exported flow kstat: the name it is published under and
 * the byte offset of its value inside a flow_stats_t.
 */
typedef struct {
	const char	*fs_name;	/* kstat entry name */
	uint_t		fs_offset;	/* offset within flow_stats_t */
} flow_stats_info_t;
69 
/* Byte offset of field f within flow_stats_t. */
#define	FS_OFF(f)	(offsetof(flow_stats_t, f))
/*
 * Table of the named kstats published for every flow. The order here is
 * the order of the kstat_named_t entries set up by flow_stat_init() and
 * filled in by flow_stat_update().
 */
static flow_stats_info_t flow_stats_list[] = {
	{"rbytes",	FS_OFF(fs_ibytes)},
	{"ipackets",	FS_OFF(fs_ipackets)},
	{"ierrors",	FS_OFF(fs_ierrors)},
	{"obytes",	FS_OFF(fs_obytes)},
	{"opackets",	FS_OFF(fs_opackets)},
	{"oerrors",	FS_OFF(fs_oerrors)}
};
/* Number of entries in flow_stats_list[]. */
#define	FS_SIZE		(sizeof (flow_stats_list) / sizeof (flow_stats_info_t))
80 
/*
 * Checks whether a flow mask is legal. Forward declaration; the definition
 * is further down in this file. mac_flow_add_subflow() treats a NULL return
 * as "mask not supported".
 */
static flow_tab_info_t	*mac_flow_tab_info_get(flow_mask_t);
85 
86 static void
87 flow_stat_init(kstat_named_t *knp)
88 {
89 	int	i;
90 
91 	for (i = 0; i < FS_SIZE; i++, knp++) {
92 		kstat_named_init(knp, flow_stats_list[i].fs_name,
93 		    KSTAT_DATA_UINT64);
94 	}
95 }
96 
97 static int
98 flow_stat_update(kstat_t *ksp, int rw)
99 {
100 	flow_entry_t		*fep = ksp->ks_private;
101 	kstat_named_t		*knp = ksp->ks_data;
102 	uint64_t		*statp;
103 	int			i;
104 	mac_rx_stats_t		*mac_rx_stat;
105 	mac_tx_stats_t		*mac_tx_stat;
106 	flow_stats_t		flow_stats;
107 	mac_soft_ring_set_t	*mac_srs;
108 
109 	if (rw != KSTAT_READ)
110 		return (EACCES);
111 
112 	bzero(&flow_stats, sizeof (flow_stats_t));
113 
114 	for (i = 0; i < fep->fe_rx_srs_cnt; i++) {
115 		mac_srs = (mac_soft_ring_set_t *)fep->fe_rx_srs[i];
116 		if (mac_srs == NULL) 		/* Multicast flow */
117 			break;
118 		mac_rx_stat = &mac_srs->srs_rx.sr_stat;
119 
120 		flow_stats.fs_ibytes += mac_rx_stat->mrs_intrbytes +
121 		    mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes;
122 
123 		flow_stats.fs_ipackets += mac_rx_stat->mrs_intrcnt +
124 		    mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
125 
126 		flow_stats.fs_ierrors += mac_rx_stat->mrs_ierrors;
127 	}
128 
129 	mac_srs = (mac_soft_ring_set_t *)fep->fe_tx_srs;
130 	if (mac_srs == NULL) 		/* Multicast flow */
131 		goto done;
132 	mac_tx_stat = &mac_srs->srs_tx.st_stat;
133 
134 	flow_stats.fs_obytes = mac_tx_stat->mts_obytes;
135 	flow_stats.fs_opackets = mac_tx_stat->mts_opackets;
136 	flow_stats.fs_oerrors = mac_tx_stat->mts_oerrors;
137 
138 done:
139 	for (i = 0; i < FS_SIZE; i++, knp++) {
140 		statp = (uint64_t *)
141 		    ((uchar_t *)&flow_stats + flow_stats_list[i].fs_offset);
142 		knp->value.ui64 = *statp;
143 	}
144 	return (0);
145 }
146 
147 static void
148 flow_stat_create(flow_entry_t *fep)
149 {
150 	kstat_t		*ksp;
151 	kstat_named_t	*knp;
152 	uint_t		nstats = FS_SIZE;
153 
154 	/*
155 	 * Fow now, flow entries are only manipulated and visible from the
156 	 * global zone.
157 	 */
158 	ksp = kstat_create_zone("unix", 0, (char *)fep->fe_flow_name, "flow",
159 	    KSTAT_TYPE_NAMED, nstats, 0, GLOBAL_ZONEID);
160 	if (ksp == NULL)
161 		return;
162 
163 	ksp->ks_update = flow_stat_update;
164 	ksp->ks_private = fep;
165 	fep->fe_ksp = ksp;
166 
167 	knp = (kstat_named_t *)ksp->ks_data;
168 	flow_stat_init(knp);
169 	kstat_install(ksp);
170 }
171 
172 void
173 flow_stat_destroy(flow_entry_t *fep)
174 {
175 	if (fep->fe_ksp != NULL) {
176 		kstat_delete(fep->fe_ksp);
177 		fep->fe_ksp = NULL;
178 	}
179 }
180 
181 /*
182  * Initialize the flow table
183  */
184 void
185 mac_flow_init()
186 {
187 	flow_cache = kmem_cache_create("flow_entry_cache",
188 	    sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
189 	flow_tab_cache = kmem_cache_create("flow_tab_cache",
190 	    sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
191 	flow_hash = mod_hash_create_extended("flow_hash",
192 	    100, mod_hash_null_keydtor, mod_hash_null_valdtor,
193 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
194 	rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
195 }
196 
197 /*
198  * Cleanup and release the flow table
199  */
200 void
201 mac_flow_fini()
202 {
203 	kmem_cache_destroy(flow_cache);
204 	kmem_cache_destroy(flow_tab_cache);
205 	mod_hash_destroy_hash(flow_hash);
206 	rw_destroy(&flow_tab_lock);
207 }
208 
209 /*
210  * mac_create_flow(): create a flow_entry_t.
211  */
212 int
213 mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
214     void *client_cookie, uint_t type, flow_entry_t **flentp)
215 {
216 	flow_entry_t		*flent = *flentp;
217 	int			err = 0;
218 
219 	if (mrp != NULL) {
220 		err = mac_validate_props(NULL, mrp);
221 		if (err != 0)
222 			return (err);
223 	}
224 
225 	if (flent == NULL) {
226 		flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
227 		bzero(flent, sizeof (*flent));
228 		mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
229 		cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);
230 
231 		/* Initialize the receiver function to a safe routine */
232 		flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
233 		flent->fe_index = -1;
234 	}
235 	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
236 
237 	/* This is an initial flow, will be configured later */
238 	if (fd == NULL) {
239 		*flentp = flent;
240 		return (0);
241 	}
242 
243 	flent->fe_client_cookie = client_cookie;
244 	flent->fe_type = type;
245 
246 	/* Save flow desc */
247 	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
248 
249 	if (mrp != NULL) {
250 		/*
251 		 * We have already set fe_resource_props for a Link.
252 		 */
253 		if (type & FLOW_USER) {
254 			bcopy(mrp, &flent->fe_resource_props,
255 			    sizeof (mac_resource_props_t));
256 		}
257 		/*
258 		 * The effective resource list should reflect the priority
259 		 * that we set implicitly.
260 		 */
261 		if (!(mrp->mrp_mask & MRP_PRIORITY))
262 			mrp->mrp_mask |= MRP_PRIORITY;
263 		if (type & FLOW_USER)
264 			mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
265 		else
266 			mrp->mrp_priority = MPL_LINK_DEFAULT;
267 		bzero(mrp->mrp_pool, MAXPATHLEN);
268 		bzero(&mrp->mrp_cpus, sizeof (mac_cpus_t));
269 		bcopy(mrp, &flent->fe_effective_props,
270 		    sizeof (mac_resource_props_t));
271 	}
272 	flow_stat_create(flent);
273 
274 	*flentp = flent;
275 	return (0);
276 }
277 
278 /*
279  * Validate flow entry and add it to a flow table.
280  */
281 int
282 mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
283 {
284 	flow_entry_t	**headp, **p;
285 	flow_ops_t	*ops = &ft->ft_ops;
286 	flow_mask_t	mask;
287 	uint32_t	index;
288 	int		err;
289 
290 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
291 
292 	/*
293 	 * Check for invalid bits in mask.
294 	 */
295 	mask = flent->fe_flow_desc.fd_mask;
296 	if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
297 		return (EOPNOTSUPP);
298 
299 	/*
300 	 * Validate flent.
301 	 */
302 	if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
303 		DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
304 		    flow_entry_t *, flent, int, err);
305 		return (err);
306 	}
307 
308 	/*
309 	 * Flent is valid. now calculate hash and insert it
310 	 * into hash table.
311 	 */
312 	index = ops->fo_hash_fe(ft, flent);
313 
314 	/*
315 	 * We do not need a lock up until now because we were
316 	 * not accessing the flow table.
317 	 */
318 	rw_enter(&ft->ft_lock, RW_WRITER);
319 	headp = &ft->ft_table[index];
320 
321 	/*
322 	 * Check for duplicate flow.
323 	 */
324 	for (p = headp; *p != NULL; p = &(*p)->fe_next) {
325 		if ((*p)->fe_flow_desc.fd_mask !=
326 		    flent->fe_flow_desc.fd_mask)
327 			continue;
328 
329 		if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
330 			rw_exit(&ft->ft_lock);
331 			DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
332 			    flow_entry_t *, flent, int, err);
333 			return (EALREADY);
334 		}
335 	}
336 
337 	/*
338 	 * Insert flow to hash list.
339 	 */
340 	err = ops->fo_insert_fe(ft, headp, flent);
341 	if (err != 0) {
342 		rw_exit(&ft->ft_lock);
343 		DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
344 		    flow_entry_t *, flent, int, err);
345 		return (err);
346 	}
347 
348 	/*
349 	 * Save the hash index so it can be used by mac_flow_remove().
350 	 */
351 	flent->fe_index = (int)index;
352 
353 	/*
354 	 * Save the flow tab back reference.
355 	 */
356 	flent->fe_flow_tab = ft;
357 	FLOW_MARK(flent, FE_FLOW_TAB);
358 	ft->ft_flow_count++;
359 	rw_exit(&ft->ft_lock);
360 	return (0);
361 }
362 
363 /*
364  * Remove a flow from a mac client's subflow table
365  */
366 void
367 mac_flow_rem_subflow(flow_entry_t *flent)
368 {
369 	flow_tab_t		*ft = flent->fe_flow_tab;
370 	mac_client_impl_t	*mcip = ft->ft_mcip;
371 	mac_handle_t		mh = (mac_handle_t)ft->ft_mip;
372 
373 	ASSERT(MAC_PERIM_HELD(mh));
374 
375 	mac_flow_remove(ft, flent, B_FALSE);
376 	if (flent->fe_mcip == NULL) {
377 		/*
378 		 * The interface is not yet plumbed and mac_client_flow_add
379 		 * was not done.
380 		 */
381 		if (FLOW_TAB_EMPTY(ft)) {
382 			mac_flow_tab_destroy(ft);
383 			mcip->mci_subflow_tab = NULL;
384 		}
385 	} else {
386 		mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
387 		mac_link_flow_clean((mac_client_handle_t)mcip, flent);
388 	}
389 	mac_fastpath_enable(mh);
390 }
391 
392 /*
393  * Add a flow to a mac client's subflow table and instantiate the flow
394  * in the mac by creating the associated SRSs etc.
395  */
396 int
397 mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
398     boolean_t instantiate_flow)
399 {
400 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
401 	mac_handle_t		mh = (mac_handle_t)mcip->mci_mip;
402 	flow_tab_info_t		*ftinfo;
403 	flow_mask_t		mask;
404 	flow_tab_t		*ft;
405 	int			err;
406 	boolean_t		ft_created = B_FALSE;
407 
408 	ASSERT(MAC_PERIM_HELD(mh));
409 
410 	if ((err = mac_fastpath_disable(mh)) != 0)
411 		return (err);
412 
413 	/*
414 	 * If the subflow table exists already just add the new subflow
415 	 * to the existing table, else we create a new subflow table below.
416 	 */
417 	ft = mcip->mci_subflow_tab;
418 	if (ft == NULL) {
419 		mask = flent->fe_flow_desc.fd_mask;
420 		/*
421 		 * Try to create a new table and then add the subflow to the
422 		 * newly created subflow table
423 		 */
424 		if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) {
425 			mac_fastpath_enable(mh);
426 			return (EOPNOTSUPP);
427 		}
428 
429 		mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
430 		    mcip->mci_mip, &ft);
431 		ft_created = B_TRUE;
432 	}
433 
434 	err = mac_flow_add(ft, flent);
435 	if (err != 0) {
436 		if (ft_created)
437 			mac_flow_tab_destroy(ft);
438 		mac_fastpath_enable(mh);
439 		return (err);
440 	}
441 
442 	if (instantiate_flow) {
443 		/* Now activate the flow by creating its SRSs */
444 		ASSERT(MCIP_DATAPATH_SETUP(mcip));
445 		err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
446 		if (err != 0) {
447 			mac_flow_remove(ft, flent, B_FALSE);
448 			if (ft_created)
449 				mac_flow_tab_destroy(ft);
450 			mac_fastpath_enable(mh);
451 			return (err);
452 		}
453 	} else {
454 		FLOW_MARK(flent, FE_UF_NO_DATAPATH);
455 	}
456 	if (ft_created) {
457 		ASSERT(mcip->mci_subflow_tab == NULL);
458 		ft->ft_mcip = mcip;
459 		mcip->mci_subflow_tab = ft;
460 		if (instantiate_flow)
461 			mac_client_update_classifier(mcip, B_TRUE);
462 	}
463 	return (0);
464 }
465 
466 /*
467  * Remove flow entry from flow table.
468  */
469 void
470 mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
471 {
472 	flow_entry_t	**fp;
473 
474 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
475 	if (!(flent->fe_flags & FE_FLOW_TAB))
476 		return;
477 
478 	rw_enter(&ft->ft_lock, RW_WRITER);
479 	/*
480 	 * If this is a permanent removal from the flow table, mark it
481 	 * CONDEMNED to prevent future references. If this is a temporary
482 	 * removal from the table, say to update the flow descriptor then
483 	 * we don't mark it CONDEMNED
484 	 */
485 	if (!temp)
486 		FLOW_MARK(flent, FE_CONDEMNED);
487 	/*
488 	 * Locate the specified flent.
489 	 */
490 	fp = &ft->ft_table[flent->fe_index];
491 	while (*fp != flent)
492 		fp = &(*fp)->fe_next;
493 
494 	/*
495 	 * The flent must exist. Otherwise it's a bug.
496 	 */
497 	ASSERT(fp != NULL);
498 	*fp = flent->fe_next;
499 	flent->fe_next = NULL;
500 
501 	/*
502 	 * Reset fe_index to -1 so any attempt to call mac_flow_remove()
503 	 * on a flent that is supposed to be in the table (FE_FLOW_TAB)
504 	 * will panic.
505 	 */
506 	flent->fe_index = -1;
507 	FLOW_UNMARK(flent, FE_FLOW_TAB);
508 	ft->ft_flow_count--;
509 	rw_exit(&ft->ft_lock);
510 }
511 
512 /*
513  * This is the flow lookup routine used by the mac sw classifier engine.
514  */
515 int
516 mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
517 {
518 	flow_state_t	s;
519 	flow_entry_t	*flent;
520 	flow_ops_t	*ops = &ft->ft_ops;
521 	boolean_t	retried = B_FALSE;
522 	int		i, err;
523 
524 	s.fs_flags = flags;
525 retry:
526 	s.fs_mp = mp;
527 
528 	/*
529 	 * Walk the list of predeclared accept functions.
530 	 * Each of these would accumulate enough state to allow the next
531 	 * accept routine to make progress.
532 	 */
533 	for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
534 		if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
535 			mblk_t	*last;
536 
537 			/*
538 			 * ENOBUFS indicates that the mp could be too short
539 			 * and may need a pullup.
540 			 */
541 			if (err != ENOBUFS || retried)
542 				return (err);
543 
544 			/*
545 			 * The pullup is done on the last processed mblk, not
546 			 * the starting one. pullup is not done if the mblk
547 			 * has references or if b_cont is NULL.
548 			 */
549 			last = s.fs_mp;
550 			if (DB_REF(last) > 1 || last->b_cont == NULL ||
551 			    pullupmsg(last, -1) == 0)
552 				return (EINVAL);
553 
554 			retried = B_TRUE;
555 			DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
556 			    flow_state_t *, &s);
557 			goto retry;
558 		}
559 	}
560 
561 	/*
562 	 * The packet is considered sane. We may now attempt to
563 	 * find the corresponding flent.
564 	 */
565 	rw_enter(&ft->ft_lock, RW_READER);
566 	flent = ft->ft_table[ops->fo_hash(ft, &s)];
567 	for (; flent != NULL; flent = flent->fe_next) {
568 		if (flent->fe_match(ft, flent, &s)) {
569 			FLOW_TRY_REFHOLD(flent, err);
570 			if (err != 0)
571 				continue;
572 			*flentp = flent;
573 			rw_exit(&ft->ft_lock);
574 			return (0);
575 		}
576 	}
577 	rw_exit(&ft->ft_lock);
578 	return (ENOENT);
579 }
580 
581 /*
582  * Walk flow table.
583  * The caller is assumed to have proper perimeter protection.
584  */
585 int
586 mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
587     void *arg)
588 {
589 	int		err, i, cnt = 0;
590 	flow_entry_t	*flent;
591 
592 	if (ft == NULL)
593 		return (0);
594 
595 	for (i = 0; i < ft->ft_size; i++) {
596 		for (flent = ft->ft_table[i]; flent != NULL;
597 		    flent = flent->fe_next) {
598 			cnt++;
599 			err = (*fn)(flent, arg);
600 			if (err != 0)
601 				return (err);
602 		}
603 	}
604 	VERIFY(cnt == ft->ft_flow_count);
605 	return (0);
606 }
607 
608 /*
609  * Same as the above except a mutex is used for protection here.
610  */
611 int
612 mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
613     void *arg)
614 {
615 	int		err;
616 
617 	if (ft == NULL)
618 		return (0);
619 
620 	rw_enter(&ft->ft_lock, RW_WRITER);
621 	err = mac_flow_walk_nolock(ft, fn, arg);
622 	rw_exit(&ft->ft_lock);
623 	return (err);
624 }
625 
626 static boolean_t	mac_flow_clean(flow_entry_t *);
627 
628 /*
629  * Destroy a flow entry. Called when the last reference on a flow is released.
630  */
631 void
632 mac_flow_destroy(flow_entry_t *flent)
633 {
634 	ASSERT(flent->fe_refcnt == 0);
635 
636 	if ((flent->fe_type & FLOW_USER) != 0) {
637 		ASSERT(mac_flow_clean(flent));
638 	} else {
639 		mac_flow_cleanup(flent);
640 	}
641 	mac_misc_stat_delete(flent);
642 	mutex_destroy(&flent->fe_lock);
643 	cv_destroy(&flent->fe_cv);
644 	flow_stat_destroy(flent);
645 	kmem_cache_free(flow_cache, flent);
646 }
647 
648 /*
649  * XXX eric
650  * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
651  * mac_link_flow_modify() should really be moved/reworked into the
652  * two functions below. This would consolidate all the mac property
653  * checking in one place. I'm leaving this alone for now since it's
654  * out of scope of the new flows work.
655  */
656 /* ARGSUSED */
657 uint32_t
658 mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
659 {
660 	uint32_t		changed_mask = 0;
661 	mac_resource_props_t	*fmrp = &flent->fe_effective_props;
662 	int			i;
663 
664 	if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
665 	    (!(fmrp->mrp_mask & MRP_MAXBW) ||
666 	    (fmrp->mrp_maxbw != mrp->mrp_maxbw))) {
667 		changed_mask |= MRP_MAXBW;
668 		if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
669 			fmrp->mrp_mask &= ~MRP_MAXBW;
670 			fmrp->mrp_maxbw = 0;
671 		} else {
672 			fmrp->mrp_mask |= MRP_MAXBW;
673 			fmrp->mrp_maxbw = mrp->mrp_maxbw;
674 		}
675 	}
676 
677 	if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
678 		if (fmrp->mrp_priority != mrp->mrp_priority)
679 			changed_mask |= MRP_PRIORITY;
680 		if (mrp->mrp_priority == MPL_RESET) {
681 			fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
682 			fmrp->mrp_mask &= ~MRP_PRIORITY;
683 		} else {
684 			fmrp->mrp_priority = mrp->mrp_priority;
685 			fmrp->mrp_mask |= MRP_PRIORITY;
686 		}
687 	}
688 
689 	/* modify fanout */
690 	if ((mrp->mrp_mask & MRP_CPUS) != 0) {
691 		if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
692 		    (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
693 			for (i = 0; i < mrp->mrp_ncpus; i++) {
694 				if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
695 					break;
696 			}
697 			if (i == mrp->mrp_ncpus) {
698 				/*
699 				 * The new set of cpus passed is exactly
700 				 * the same as the existing set.
701 				 */
702 				return (changed_mask);
703 			}
704 		}
705 		changed_mask |= MRP_CPUS;
706 		MAC_COPY_CPUS(mrp, fmrp);
707 	}
708 
709 	/*
710 	 * Modify the rings property.
711 	 */
712 	if (mrp->mrp_mask & MRP_RX_RINGS || mrp->mrp_mask & MRP_TX_RINGS)
713 		mac_set_rings_effective(flent->fe_mcip);
714 
715 	if ((mrp->mrp_mask & MRP_POOL) != 0) {
716 		if (strcmp(fmrp->mrp_pool, mrp->mrp_pool) != 0)
717 			changed_mask |= MRP_POOL;
718 		if (strlen(mrp->mrp_pool) == 0)
719 			fmrp->mrp_mask &= ~MRP_POOL;
720 		else
721 			fmrp->mrp_mask |= MRP_POOL;
722 		(void) strncpy(fmrp->mrp_pool, mrp->mrp_pool, MAXPATHLEN);
723 	}
724 	return (changed_mask);
725 }
726 
727 void
728 mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
729 {
730 	uint32_t changed_mask;
731 	mac_client_impl_t *mcip = flent->fe_mcip;
732 	mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
733 	mac_resource_props_t *emrp = MCIP_EFFECTIVE_PROPS(mcip);
734 	cpupart_t *cpupart = NULL;
735 	boolean_t use_default = B_FALSE;
736 
737 	ASSERT(flent != NULL);
738 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
739 
740 	rw_enter(&ft->ft_lock, RW_WRITER);
741 
742 	/* Update the cached values inside the subflow entry */
743 	changed_mask = mac_flow_modify_props(flent, mrp);
744 	rw_exit(&ft->ft_lock);
745 	/*
746 	 * Push the changed parameters to the scheduling code in the
747 	 * SRS's, to take effect right away.
748 	 */
749 	if (changed_mask & MRP_MAXBW) {
750 		mac_srs_update_bwlimit(flent, mrp);
751 		/*
752 		 * If bandwidth is changed, we may have to change
753 		 * the number of soft ring to be used for fanout.
754 		 * Call mac_flow_update_fanout() if MAC_BIND_CPU
755 		 * is not set and there is no user supplied cpu
756 		 * info. This applies only to link at this time.
757 		 */
758 		if (!(flent->fe_type & FLOW_USER) &&
759 		    !(changed_mask & MRP_CPUS) &&
760 		    !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
761 			mac_fanout_setup(mcip, flent, mcip_mrp,
762 			    mac_rx_deliver, mcip, NULL, NULL);
763 		}
764 	}
765 	if (mrp->mrp_mask & MRP_PRIORITY)
766 		mac_flow_update_priority(mcip, flent);
767 
768 	if (changed_mask & MRP_CPUS)
769 		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
770 		    NULL);
771 
772 	if (mrp->mrp_mask & MRP_POOL) {
773 		pool_lock();
774 		cpupart = mac_pset_find(mrp, &use_default);
775 		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
776 		    cpupart);
777 		mac_set_pool_effective(use_default, cpupart, mrp, emrp);
778 		pool_unlock();
779 	}
780 }
781 
782 /*
783  * This function waits for a certain condition to be met and is generally
784  * used before a destructive or quiescing operation.
785  */
786 void
787 mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
788 {
789 	mutex_enter(&flent->fe_lock);
790 	flent->fe_flags |= FE_WAITER;
791 
792 	switch (event) {
793 	case FLOW_DRIVER_UPCALL:
794 		/*
795 		 * We want to make sure the driver upcalls have finished before
796 		 * we signal the Rx SRS worker to quit.
797 		 */
798 		while (flent->fe_refcnt != 1)
799 			cv_wait(&flent->fe_cv, &flent->fe_lock);
800 		break;
801 
802 	case FLOW_USER_REF:
803 		/*
804 		 * Wait for the fe_user_refcnt to drop to 0. The flow has
805 		 * been removed from the global flow hash.
806 		 */
807 		ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH));
808 		while (flent->fe_user_refcnt != 0)
809 			cv_wait(&flent->fe_cv, &flent->fe_lock);
810 		break;
811 
812 	default:
813 		ASSERT(0);
814 	}
815 
816 	flent->fe_flags &= ~FE_WAITER;
817 	mutex_exit(&flent->fe_lock);
818 }
819 
/*
 * Debug-only sanity check used by mac_flow_destroy(): asserts that the
 * flow is unlinked and no longer holds any SRS or broadcast group.
 * Always returns B_TRUE so that the call can itself sit inside an ASSERT.
 */
static boolean_t
mac_flow_clean(flow_entry_t *flent)
{
	ASSERT(flent->fe_next == NULL);
	ASSERT(flent->fe_tx_srs == NULL);
	ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL);
	ASSERT(flent->fe_mbg == NULL);

	return (B_TRUE);
}
830 
831 void
832 mac_flow_cleanup(flow_entry_t *flent)
833 {
834 	if ((flent->fe_type & FLOW_USER) == 0) {
835 		ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) ||
836 		    (flent->fe_mbg != NULL && flent->fe_mcip == NULL));
837 		ASSERT(flent->fe_refcnt == 0);
838 	} else {
839 		ASSERT(flent->fe_refcnt == 1);
840 	}
841 
842 	if (flent->fe_mbg != NULL) {
843 		ASSERT(flent->fe_tx_srs == NULL);
844 		/* This is a multicast or broadcast flow entry */
845 		mac_bcast_grp_free(flent->fe_mbg);
846 		flent->fe_mbg = NULL;
847 	}
848 
849 	if (flent->fe_tx_srs != NULL) {
850 		ASSERT(flent->fe_mbg == NULL);
851 		mac_srs_free(flent->fe_tx_srs);
852 		flent->fe_tx_srs = NULL;
853 	}
854 
855 	/*
856 	 * In the normal case fe_rx_srs_cnt is 1. However in the error case
857 	 * when mac_unicast_add fails we may not have set up any SRS
858 	 * in which case fe_rx_srs_cnt will be zero.
859 	 */
860 	if (flent->fe_rx_srs_cnt != 0) {
861 		ASSERT(flent->fe_rx_srs_cnt == 1);
862 		mac_srs_free(flent->fe_rx_srs[0]);
863 		flent->fe_rx_srs[0] = NULL;
864 		flent->fe_rx_srs_cnt = 0;
865 	}
866 	ASSERT(flent->fe_rx_srs[0] == NULL);
867 }
868 
869 void
870 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
871 {
872 	/*
873 	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
874 	 * Updates to the fe_flow_desc happen under the fe_lock
875 	 * after removing the flent from the flow table
876 	 */
877 	mutex_enter(&flent->fe_lock);
878 	bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
879 	mutex_exit(&flent->fe_lock);
880 }
881 
882 /*
883  * Update a field of a flow entry. The mac perimeter ensures that
884  * this is the only thread doing a modify operation on this mac end point.
885  * So the flow table can't change or disappear. The ft_lock protects access
886  * to the flow entry, and holding the lock ensures that there isn't any thread
887  * accessing the flow entry or attempting a flow table lookup. However
888  * data threads that are using the flow entry based on the old descriptor
889  * will continue to use the flow entry. If strong coherence is required
890  * then the flow will have to be quiesced before the descriptor can be
891  * changed.
892  */
893 void
894 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
895 {
896 	flow_tab_t	*ft = flent->fe_flow_tab;
897 	flow_desc_t	old_desc;
898 	int		err;
899 
900 	if (ft == NULL) {
901 		/*
902 		 * The flow hasn't yet been inserted into the table,
903 		 * so only the caller knows about this flow, however for
904 		 * uniformity we grab the fe_lock here.
905 		 */
906 		mutex_enter(&flent->fe_lock);
907 		bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
908 		mutex_exit(&flent->fe_lock);
909 	}
910 
911 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
912 
913 	/*
914 	 * Need to remove the flow entry from the table and reinsert it,
915 	 * into a potentially diference hash line. The hash depends on
916 	 * the new descriptor fields. However access to fe_desc itself
917 	 * is always under the fe_lock. This helps log and stat functions
918 	 * see a self-consistent fe_flow_desc.
919 	 */
920 	mac_flow_remove(ft, flent, B_TRUE);
921 	old_desc = flent->fe_flow_desc;
922 
923 	mutex_enter(&flent->fe_lock);
924 	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
925 	mutex_exit(&flent->fe_lock);
926 
927 	if (mac_flow_add(ft, flent) != 0) {
928 		/*
929 		 * The add failed say due to an invalid flow descriptor.
930 		 * Undo the update
931 		 */
932 		flent->fe_flow_desc = old_desc;
933 		err = mac_flow_add(ft, flent);
934 		ASSERT(err == 0);
935 	}
936 }
937 
938 void
939 mac_flow_set_name(flow_entry_t *flent, const char *name)
940 {
941 	flow_tab_t	*ft = flent->fe_flow_tab;
942 
943 	if (ft == NULL) {
944 		/*
945 		 *  The flow hasn't yet been inserted into the table,
946 		 * so only the caller knows about this flow
947 		 */
948 		(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
949 	} else {
950 		ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
951 	}
952 
953 	mutex_enter(&flent->fe_lock);
954 	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
955 	mutex_exit(&flent->fe_lock);
956 }
957 
/*
 * Return the client-private cookie that was associated with
 * the flow when it was created (see mac_flow_create()). No lock is taken.
 */
void *
mac_flow_get_client_cookie(flow_entry_t *flent)
{
	return (flent->fe_client_cookie);
}
967 
/*
 * Forward declarations.
 *
 * mac_flow_tab_create() compares a table's ops against the generic
 * flow_l2_* routines and substitutes the flow_ether_* ones on DL_ETHER
 * media, so both sets must be visible before it.
 */
static uint32_t	flow_l2_hash(flow_tab_t *, flow_state_t *);
static uint32_t	flow_l2_hash_fe(flow_tab_t *, flow_entry_t *);
static int	flow_l2_accept(flow_tab_t *, flow_state_t *);
static uint32_t	flow_ether_hash(flow_tab_t *, flow_state_t *);
static uint32_t	flow_ether_hash_fe(flow_tab_t *, flow_entry_t *);
static int	flow_ether_accept(flow_tab_t *, flow_state_t *);
977 
978 /*
979  * Create flow table.
980  */
981 void
982 mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
983     mac_impl_t *mip, flow_tab_t **ftp)
984 {
985 	flow_tab_t	*ft;
986 	flow_ops_t	*new_ops;
987 
988 	ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
989 	bzero(ft, sizeof (*ft));
990 
991 	ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);
992 
993 	/*
994 	 * We make a copy of the ops vector instead of just pointing to it
995 	 * because we might want to customize the ops vector on a per table
996 	 * basis (e.g. for optimization).
997 	 */
998 	new_ops = &ft->ft_ops;
999 	bcopy(ops, new_ops, sizeof (*ops));
1000 	ft->ft_mask = mask;
1001 	ft->ft_size = size;
1002 	ft->ft_mip = mip;
1003 
1004 	/*
1005 	 * Optimizations for DL_ETHER media.
1006 	 */
1007 	if (mip->mi_info.mi_nativemedia == DL_ETHER) {
1008 		if (new_ops->fo_hash == flow_l2_hash)
1009 			new_ops->fo_hash = flow_ether_hash;
1010 		if (new_ops->fo_hash_fe == flow_l2_hash_fe)
1011 			new_ops->fo_hash_fe = flow_ether_hash_fe;
1012 		if (new_ops->fo_accept[0] == flow_l2_accept)
1013 			new_ops->fo_accept[0] = flow_ether_accept;
1014 	}
1015 	*ftp = ft;
1016 }
1017 
/*
 * Create a 1024-bucket flow table for mip that uses the L2 ops and
 * accepts flows described by link destination address and/or VLAN id.
 * The new table is returned through ftp.
 */
void
mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
{
	mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
	    1024, mip, ftp);
}
1024 
1025 /*
1026  * Destroy flow table.
1027  */
1028 void
1029 mac_flow_tab_destroy(flow_tab_t *ft)
1030 {
1031 	if (ft == NULL)
1032 		return;
1033 
1034 	ASSERT(ft->ft_flow_count == 0);
1035 	kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
1036 	bzero(ft, sizeof (*ft));
1037 	kmem_cache_free(flow_tab_cache, ft);
1038 }
1039 
1040 /*
1041  * Add a new flow entry to the global flow hash table
1042  */
1043 int
1044 mac_flow_hash_add(flow_entry_t *flent)
1045 {
1046 	int	err;
1047 
1048 	rw_enter(&flow_tab_lock, RW_WRITER);
1049 	err = mod_hash_insert(flow_hash,
1050 	    (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
1051 	if (err != 0) {
1052 		rw_exit(&flow_tab_lock);
1053 		return (EEXIST);
1054 	}
1055 	/* Mark as inserted into the global flow hash table */
1056 	FLOW_MARK(flent, FE_G_FLOW_HASH);
1057 	rw_exit(&flow_tab_lock);
1058 	return (err);
1059 }
1060 
/*
 * Remove a flow entry from the global flow hash table. The entry is
 * looked up by its current name and must be present; anything else is
 * treated as fatal (VERIFY).
 */
void
mac_flow_hash_remove(flow_entry_t *flent)
{
	mod_hash_val_t	val;	/* receives the removed value; not used */

	rw_enter(&flow_tab_lock, RW_WRITER);
	VERIFY(mod_hash_remove(flow_hash,
	    (mod_hash_key_t)flent->fe_flow_name, &val) == 0);

	/* Clear the mark that says inserted into the global flow hash table */
	FLOW_UNMARK(flent, FE_G_FLOW_HASH);
	rw_exit(&flow_tab_lock);
}
1077 
1078 /*
1079  * Retrieve a flow entry from the global flow hash table.
1080  */
1081 int
1082 mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
1083 {
1084 	int		err;
1085 	flow_entry_t	*flent;
1086 
1087 	rw_enter(&flow_tab_lock, RW_READER);
1088 	err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
1089 	    (mod_hash_val_t *)&flent);
1090 	if (err != 0) {
1091 		rw_exit(&flow_tab_lock);
1092 		return (ENOENT);
1093 	}
1094 	ASSERT(flent != NULL);
1095 	FLOW_USER_REFHOLD(flent);
1096 	rw_exit(&flow_tab_lock);
1097 
1098 	*flentp = flent;
1099 	return (0);
1100 }
1101 
1102 /*
1103  * Initialize or release mac client flows by walking the subflow table.
1104  * These are typically invoked during plumb/unplumb of links.
1105  */
1106 
1107 static int
1108 mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
1109 {
1110 	mac_client_impl_t	*mcip = arg;
1111 
1112 	if (mac_link_flow_init(arg, flent) != 0) {
1113 		cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
1114 		    flent->fe_flow_name, mcip->mci_name);
1115 	} else {
1116 		FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
1117 	}
1118 	return (0);
1119 }
1120 
1121 void
1122 mac_link_init_flows(mac_client_handle_t mch)
1123 {
1124 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1125 
1126 	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1127 	    mac_link_init_flows_cb, mcip);
1128 	/*
1129 	 * If mac client had subflow(s) configured before plumb, change
1130 	 * function to mac_rx_srs_subflow_process and in case of hardware
1131 	 * classification, disable polling.
1132 	 */
1133 	mac_client_update_classifier(mcip, B_TRUE);
1134 
1135 }
1136 
1137 boolean_t
1138 mac_link_has_flows(mac_client_handle_t mch)
1139 {
1140 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1141 
1142 	if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
1143 		return (B_TRUE);
1144 
1145 	return (B_FALSE);
1146 }
1147 
1148 static int
1149 mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
1150 {
1151 	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
1152 	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1153 	mac_link_flow_clean(arg, flent);
1154 	return (0);
1155 }
1156 
1157 void
1158 mac_link_release_flows(mac_client_handle_t mch)
1159 {
1160 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1161 
1162 	/*
1163 	 * Change the mci_flent callback back to mac_rx_srs_process()
1164 	 * because flows are about to be deactivated.
1165 	 */
1166 	mac_client_update_classifier(mcip, B_FALSE);
1167 	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1168 	    mac_link_release_flows_cb, mcip);
1169 }
1170 
1171 void
1172 mac_rename_flow(flow_entry_t *fep, const char *new_name)
1173 {
1174 	mac_flow_set_name(fep, new_name);
1175 	if (fep->fe_ksp != NULL) {
1176 		flow_stat_destroy(fep);
1177 		flow_stat_create(fep);
1178 	}
1179 }
1180 
1181 /*
1182  * mac_link_flow_init()
1183  * Internal flow interface used for allocating SRSs and related
1184  * data structures. Not meant to be used by mac clients.
1185  */
1186 int
1187 mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
1188 {
1189 	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
1190 	mac_impl_t		*mip = mcip->mci_mip;
1191 	int			err;
1192 
1193 	ASSERT(mch != NULL);
1194 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1195 
1196 	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
1197 		return (err);
1198 
1199 	sub_flow->fe_mcip = mcip;
1200 
1201 	return (0);
1202 }
1203 
1204 /*
1205  * mac_link_flow_add()
1206  * Used by flowadm(1m) or kernel mac clients for creating flows.
1207  */
1208 int
1209 mac_link_flow_add(datalink_id_t linkid, char *flow_name,
1210     flow_desc_t *flow_desc, mac_resource_props_t *mrp)
1211 {
1212 	flow_entry_t		*flent = NULL;
1213 	int			err;
1214 	dls_dl_handle_t		dlh;
1215 	dls_link_t		*dlp;
1216 	boolean_t		link_held = B_FALSE;
1217 	boolean_t		hash_added = B_FALSE;
1218 	mac_perim_handle_t	mph;
1219 
1220 	err = mac_flow_lookup_byname(flow_name, &flent);
1221 	if (err == 0) {
1222 		FLOW_USER_REFRELE(flent);
1223 		return (EEXIST);
1224 	}
1225 
1226 	/*
1227 	 * First create a flow entry given the description provided
1228 	 * by the caller.
1229 	 */
1230 	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
1231 	    FLOW_USER | FLOW_OTHER, &flent);
1232 
1233 	if (err != 0)
1234 		return (err);
1235 
1236 	/*
1237 	 * We've got a local variable referencing this flow now, so we need
1238 	 * to hold it. We'll release this flow before returning.
1239 	 * All failures until we return will undo any action that may internally
1240 	 * held the flow, so the last REFRELE will assure a clean freeing
1241 	 * of resources.
1242 	 */
1243 	FLOW_REFHOLD(flent);
1244 
1245 	flent->fe_link_id = linkid;
1246 	FLOW_MARK(flent, FE_INCIPIENT);
1247 
1248 	err = mac_perim_enter_by_linkid(linkid, &mph);
1249 	if (err != 0) {
1250 		FLOW_FINAL_REFRELE(flent);
1251 		return (err);
1252 	}
1253 
1254 	/*
1255 	 * dls will eventually be merged with mac so it's ok
1256 	 * to call dls' internal functions.
1257 	 */
1258 	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1259 	if (err != 0)
1260 		goto bail;
1261 
1262 	link_held = B_TRUE;
1263 
1264 	/*
1265 	 * Add the flow to the global flow table, this table will be per
1266 	 * exclusive zone so each zone can have its own flow namespace.
1267 	 * RFE 6625651 will fix this.
1268 	 *
1269 	 */
1270 	if ((err = mac_flow_hash_add(flent)) != 0)
1271 		goto bail;
1272 
1273 	hash_added = B_TRUE;
1274 
1275 	/*
1276 	 * do not allow flows to be configured on an anchor VNIC
1277 	 */
1278 	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
1279 		err = ENOTSUP;
1280 		goto bail;
1281 	}
1282 
1283 	/*
1284 	 * Add the subflow to the subflow table. Also instantiate the flow
1285 	 * in the mac if there is an active user (we check if the MAC client's
1286 	 * datapath has been setup).
1287 	 */
1288 	err = mac_flow_add_subflow(dlp->dl_mch, flent,
1289 	    MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
1290 	if (err != 0)
1291 		goto bail;
1292 
1293 	FLOW_UNMARK(flent, FE_INCIPIENT);
1294 	dls_devnet_rele_link(dlh, dlp);
1295 	mac_perim_exit(mph);
1296 	return (0);
1297 
1298 bail:
1299 	if (hash_added)
1300 		mac_flow_hash_remove(flent);
1301 
1302 	if (link_held)
1303 		dls_devnet_rele_link(dlh, dlp);
1304 
1305 	/*
1306 	 * Wait for any transient global flow hash refs to clear
1307 	 * and then release the creation reference on the flow
1308 	 */
1309 	mac_flow_wait(flent, FLOW_USER_REF);
1310 	FLOW_FINAL_REFRELE(flent);
1311 	mac_perim_exit(mph);
1312 	return (err);
1313 }
1314 
1315 /*
1316  * mac_link_flow_clean()
1317  * Internal flow interface used for freeing SRSs and related
1318  * data structures. Not meant to be used by mac clients.
1319  */
1320 void
1321 mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
1322 {
1323 	mac_client_impl_t 	*mcip = (mac_client_impl_t *)mch;
1324 	mac_impl_t		*mip = mcip->mci_mip;
1325 	boolean_t		last_subflow;
1326 
1327 	ASSERT(mch != NULL);
1328 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1329 
1330 	/*
1331 	 * This sub flow entry may fail to be fully initialized by
1332 	 * mac_link_flow_init(). If so, simply return.
1333 	 */
1334 	if (sub_flow->fe_mcip == NULL)
1335 		return;
1336 
1337 	last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
1338 	/*
1339 	 * Tear down the data path
1340 	 */
1341 	mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
1342 	sub_flow->fe_mcip = NULL;
1343 
1344 	/*
1345 	 * Delete the SRSs associated with this subflow. If this is being
1346 	 * driven by flowadm(1M) then the subflow will be deleted by
1347 	 * dls_rem_flow. However if this is a result of the interface being
1348 	 * unplumbed then the subflow itself won't be deleted.
1349 	 */
1350 	mac_flow_cleanup(sub_flow);
1351 
1352 	/*
1353 	 * If all the subflows are gone, renable some of the stuff
1354 	 * we disabled when adding a subflow, polling etc.
1355 	 */
1356 	if (last_subflow) {
1357 		/*
1358 		 * The subflow table itself is not protected by any locks or
1359 		 * refcnts. Hence quiesce the client upfront before clearing
1360 		 * mci_subflow_tab.
1361 		 */
1362 		mac_client_quiesce(mcip);
1363 		mac_client_update_classifier(mcip, B_FALSE);
1364 		mac_flow_tab_destroy(mcip->mci_subflow_tab);
1365 		mcip->mci_subflow_tab = NULL;
1366 		mac_client_restart(mcip);
1367 	}
1368 }
1369 
1370 /*
1371  * mac_link_flow_remove()
1372  * Used by flowadm(1m) or kernel mac clients for removing flows.
1373  */
1374 int
1375 mac_link_flow_remove(char *flow_name)
1376 {
1377 	flow_entry_t		*flent;
1378 	mac_perim_handle_t	mph;
1379 	int			err;
1380 	datalink_id_t		linkid;
1381 
1382 	err = mac_flow_lookup_byname(flow_name, &flent);
1383 	if (err != 0)
1384 		return (err);
1385 
1386 	linkid = flent->fe_link_id;
1387 	FLOW_USER_REFRELE(flent);
1388 
1389 	/*
1390 	 * The perim must be acquired before acquiring any other references
1391 	 * to maintain the lock and perimeter hierarchy. Please note the
1392 	 * FLOW_REFRELE above.
1393 	 */
1394 	err = mac_perim_enter_by_linkid(linkid, &mph);
1395 	if (err != 0)
1396 		return (err);
1397 
1398 	/*
1399 	 * Note the second lookup of the flow, because a concurrent thread
1400 	 * may have removed it already while we were waiting to enter the
1401 	 * link's perimeter.
1402 	 */
1403 	err = mac_flow_lookup_byname(flow_name, &flent);
1404 	if (err != 0) {
1405 		mac_perim_exit(mph);
1406 		return (err);
1407 	}
1408 	FLOW_USER_REFRELE(flent);
1409 
1410 	/*
1411 	 * Remove the flow from the subflow table and deactivate the flow
1412 	 * by quiescing and removings its SRSs
1413 	 */
1414 	mac_flow_rem_subflow(flent);
1415 
1416 	/*
1417 	 * Finally, remove the flow from the global table.
1418 	 */
1419 	mac_flow_hash_remove(flent);
1420 
1421 	/*
1422 	 * Wait for any transient global flow hash refs to clear
1423 	 * and then release the creation reference on the flow
1424 	 */
1425 	mac_flow_wait(flent, FLOW_USER_REF);
1426 	FLOW_FINAL_REFRELE(flent);
1427 
1428 	mac_perim_exit(mph);
1429 
1430 	return (0);
1431 }
1432 
1433 /*
1434  * mac_link_flow_modify()
1435  * Modifies the properties of a flow identified by its name.
1436  */
1437 int
1438 mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
1439 {
1440 	flow_entry_t		*flent;
1441 	mac_client_impl_t 	*mcip;
1442 	int			err = 0;
1443 	mac_perim_handle_t	mph;
1444 	datalink_id_t		linkid;
1445 	flow_tab_t		*flow_tab;
1446 
1447 	err = mac_validate_props(NULL, mrp);
1448 	if (err != 0)
1449 		return (err);
1450 
1451 	err = mac_flow_lookup_byname(flow_name, &flent);
1452 	if (err != 0)
1453 		return (err);
1454 
1455 	linkid = flent->fe_link_id;
1456 	FLOW_USER_REFRELE(flent);
1457 
1458 	/*
1459 	 * The perim must be acquired before acquiring any other references
1460 	 * to maintain the lock and perimeter hierarchy. Please note the
1461 	 * FLOW_REFRELE above.
1462 	 */
1463 	err = mac_perim_enter_by_linkid(linkid, &mph);
1464 	if (err != 0)
1465 		return (err);
1466 
1467 	/*
1468 	 * Note the second lookup of the flow, because a concurrent thread
1469 	 * may have removed it already while we were waiting to enter the
1470 	 * link's perimeter.
1471 	 */
1472 	err = mac_flow_lookup_byname(flow_name, &flent);
1473 	if (err != 0) {
1474 		mac_perim_exit(mph);
1475 		return (err);
1476 	}
1477 	FLOW_USER_REFRELE(flent);
1478 
1479 	/*
1480 	 * If this flow is attached to a MAC client, then pass the request
1481 	 * along to the client.
1482 	 * Otherwise, just update the cached values.
1483 	 */
1484 	mcip = flent->fe_mcip;
1485 	mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
1486 	if (mcip != NULL) {
1487 		if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
1488 			err = ENOENT;
1489 		} else {
1490 			mac_flow_modify(flow_tab, flent, mrp);
1491 		}
1492 	} else {
1493 		(void) mac_flow_modify_props(flent, mrp);
1494 	}
1495 
1496 done:
1497 	mac_perim_exit(mph);
1498 	return (err);
1499 }
1500 
1501 
/*
 * State structure and misc functions used by mac_link_flow_walk().
 */
typedef struct {
	/* caller-supplied per-flow callback, invoked by mac_link_flow_walk_cb() */
	int	(*ws_func)(mac_flowinfo_t *, void *);
	/* opaque argument handed to ws_func as its second parameter */
	void	*ws_arg;
} flow_walk_state_t;
1509 
1510 static void
1511 mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent)
1512 {
1513 	(void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name,
1514 	    MAXFLOWNAMELEN);
1515 	finfop->fi_link_id = flent->fe_link_id;
1516 	finfop->fi_flow_desc = flent->fe_flow_desc;
1517 	finfop->fi_resource_props = flent->fe_resource_props;
1518 }
1519 
1520 static int
1521 mac_link_flow_walk_cb(flow_entry_t *flent, void *arg)
1522 {
1523 	flow_walk_state_t	*statep = arg;
1524 	mac_flowinfo_t		*finfo;
1525 	int			err;
1526 
1527 	finfo = kmem_zalloc(sizeof (*finfo), KM_SLEEP);
1528 	mac_link_flowinfo_copy(finfo, flent);
1529 	err = statep->ws_func(finfo, statep->ws_arg);
1530 	kmem_free(finfo, sizeof (*finfo));
1531 	return (err);
1532 }
1533 
1534 /*
1535  * mac_link_flow_walk()
1536  * Invokes callback 'func' for all flows belonging to the specified link.
1537  */
1538 int
1539 mac_link_flow_walk(datalink_id_t linkid,
1540     int (*func)(mac_flowinfo_t *, void *), void *arg)
1541 {
1542 	mac_client_impl_t	*mcip;
1543 	mac_perim_handle_t	mph;
1544 	flow_walk_state_t	state;
1545 	dls_dl_handle_t		dlh;
1546 	dls_link_t		*dlp;
1547 	int			err;
1548 
1549 	err = mac_perim_enter_by_linkid(linkid, &mph);
1550 	if (err != 0)
1551 		return (err);
1552 
1553 	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1554 	if (err != 0) {
1555 		mac_perim_exit(mph);
1556 		return (err);
1557 	}
1558 
1559 	mcip = (mac_client_impl_t *)dlp->dl_mch;
1560 	state.ws_func = func;
1561 	state.ws_arg = arg;
1562 
1563 	err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
1564 	    mac_link_flow_walk_cb, &state);
1565 
1566 	dls_devnet_rele_link(dlh, dlp);
1567 	mac_perim_exit(mph);
1568 	return (err);
1569 }
1570 
1571 /*
1572  * mac_link_flow_info()
1573  * Retrieves information about a specific flow.
1574  */
1575 int
1576 mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
1577 {
1578 	flow_entry_t	*flent;
1579 	int		err;
1580 
1581 	err = mac_flow_lookup_byname(flow_name, &flent);
1582 	if (err != 0)
1583 		return (err);
1584 
1585 	mac_link_flowinfo_copy(finfo, flent);
1586 	FLOW_USER_REFRELE(flent);
1587 	return (0);
1588 }
1589 
/*
 * Hash macro for an Ethernet address plus VLAN id: the sum of address
 * bytes 3, 4 and 5 is XORed with 'vid' and reduced modulo 'size'.
 */
#define	HASH_ETHER_VID(addr, vid, size)					\
	((((uint32_t)(addr)[3] + (uint32_t)(addr)[4] +			\
	    (uint32_t)(addr)[5]) ^ (vid)) % (size))
1595 
/*
 * Generic layer-2 address hash (the DJB hash): starting from 5381, each of
 * the 'addrlen' bytes at 'addr' is folded in as acc = acc * 33 + byte using
 * 32-bit unsigned arithmetic, and the result is reduced modulo 'htsize'.
 * 'htsize' must be non-zero.
 */
static uint32_t
flow_l2_addrhash(uint8_t *addr, size_t addrlen, size_t htsize)
{
	const uint8_t	*cur = addr;
	const uint8_t	*end = addr + addrlen;
	uint32_t	acc = 5381;

	while (cur != end)
		acc = acc * 33 + *cur++;

	return (acc % htsize);
}
1610 
/*
 * True when the write pointer of the current mblk of flow state 's' lies
 * before 'end', i.e. the data that should end at 'end' is not fully
 * contained in that mblk.
 */
#define	PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end))

/*
 * If 'start' sits exactly at the write pointer of the current mblk of flow
 * state 's', move on to the continuation mblk and restart from its read
 * pointer. When there is no continuation mblk, the ENCLOSING FUNCTION
 * returns EINVAL.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed
 * by a semicolon forms exactly one statement and can be used safely in an
 * unbraced if/else.
 */
#define	CHECK_AND_ADJUST_START_PTR(s, start) do {	\
	if ((s)->fs_mp->b_wptr == (start)) {		\
		mblk_t	*next = (s)->fs_mp->b_cont;	\
		if (next == NULL)			\
			return (EINVAL);		\
							\
		(s)->fs_mp = next;			\
		(start) = next->b_rptr;			\
	}						\
} while (0)
1623 
1624 /* ARGSUSED */
1625 static boolean_t
1626 flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1627 {
1628 	flow_l2info_t		*l2 = &s->fs_l2info;
1629 	flow_desc_t		*fd = &flent->fe_flow_desc;
1630 
1631 	return (l2->l2_vid == fd->fd_vid &&
1632 	    bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
1633 }
1634 
1635 /*
1636  * Layer 2 hash function.
1637  * Must be paired with flow_l2_accept() within a set of flow_ops
1638  * because it assumes the dest address is already extracted.
1639  */
1640 static uint32_t
1641 flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
1642 {
1643 	return (flow_l2_addrhash(s->fs_l2info.l2_daddr,
1644 	    ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
1645 }
1646 
1647 /*
1648  * This is the generic layer 2 accept function.
1649  * It makes use of mac_header_info() to extract the header length,
1650  * sap, vlan ID and destination address.
1651  */
1652 static int
1653 flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
1654 {
1655 	boolean_t		is_ether;
1656 	flow_l2info_t		*l2 = &s->fs_l2info;
1657 	mac_header_info_t	mhi;
1658 	int			err;
1659 
1660 	is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
1661 	if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
1662 	    s->fs_mp, &mhi)) != 0) {
1663 		if (err == EINVAL)
1664 			err = ENOBUFS;
1665 
1666 		return (err);
1667 	}
1668 
1669 	l2->l2_start = s->fs_mp->b_rptr;
1670 	l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;
1671 
1672 	if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
1673 	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1674 		struct ether_vlan_header	*evhp =
1675 		    (struct ether_vlan_header *)l2->l2_start;
1676 
1677 		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1678 			return (ENOBUFS);
1679 
1680 		l2->l2_sap = ntohs(evhp->ether_type);
1681 		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1682 		l2->l2_hdrsize = sizeof (*evhp);
1683 	} else {
1684 		l2->l2_sap = mhi.mhi_bindsap;
1685 		l2->l2_vid = 0;
1686 		l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
1687 	}
1688 	return (0);
1689 }
1690 
1691 /*
1692  * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
1693  * accept(). The notable difference is that dest address is now extracted
1694  * by hash() rather than by accept(). This saves a few memory references
1695  * for flow tables that do not care about mac addresses.
1696  */
1697 static uint32_t
1698 flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
1699 {
1700 	flow_l2info_t			*l2 = &s->fs_l2info;
1701 	struct ether_vlan_header	*evhp;
1702 
1703 	evhp = (struct ether_vlan_header *)l2->l2_start;
1704 	l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
1705 	return (HASH_ETHER_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
1706 }
1707 
1708 static uint32_t
1709 flow_ether_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1710 {
1711 	flow_desc_t	*fd = &flent->fe_flow_desc;
1712 
1713 	ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
1714 	return (HASH_ETHER_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
1715 }
1716 
1717 /* ARGSUSED */
1718 static int
1719 flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
1720 {
1721 	flow_l2info_t			*l2 = &s->fs_l2info;
1722 	struct ether_vlan_header	*evhp;
1723 	uint16_t			sap;
1724 
1725 	evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
1726 	l2->l2_start = (uchar_t *)evhp;
1727 
1728 	if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
1729 		return (ENOBUFS);
1730 
1731 	if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
1732 	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1733 		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1734 			return (ENOBUFS);
1735 
1736 		l2->l2_sap = ntohs(evhp->ether_type);
1737 		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1738 		l2->l2_hdrsize = sizeof (struct ether_vlan_header);
1739 	} else {
1740 		l2->l2_sap = sap;
1741 		l2->l2_vid = 0;
1742 		l2->l2_hdrsize = sizeof (struct ether_header);
1743 	}
1744 	return (0);
1745 }
1746 
1747 /*
1748  * Validates a layer 2 flow entry.
1749  */
1750 static int
1751 flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1752 {
1753 	flow_desc_t	*fd = &flent->fe_flow_desc;
1754 
1755 	/*
1756 	 * Dest address is mandatory, and 0 length addresses are not yet
1757 	 * supported.
1758 	 */
1759 	if ((fd->fd_mask & FLOW_LINK_DST) == 0 || fd->fd_mac_len == 0)
1760 		return (EINVAL);
1761 
1762 	if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
1763 		/*
1764 		 * VLAN flows are only supported over ethernet macs.
1765 		 */
1766 		if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
1767 			return (EINVAL);
1768 
1769 		if (fd->fd_vid == 0)
1770 			return (EINVAL);
1771 
1772 	}
1773 	flent->fe_match = flow_l2_match;
1774 	return (0);
1775 }
1776 
1777 /*
1778  * Calculates hash index of flow entry.
1779  */
1780 static uint32_t
1781 flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1782 {
1783 	flow_desc_t	*fd = &flent->fe_flow_desc;
1784 
1785 	ASSERT((fd->fd_mask & FLOW_LINK_VID) == 0 && fd->fd_vid == 0);
1786 	return (flow_l2_addrhash(fd->fd_dst_mac,
1787 	    ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
1788 }
1789 
1790 /*
1791  * This is used for duplicate flow checking.
1792  */
1793 /* ARGSUSED */
1794 static boolean_t
1795 flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
1796 {
1797 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
1798 
1799 	ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
1800 	return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
1801 	    fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
1802 }
1803 
1804 /*
1805  * Generic flow entry insertion function.
1806  * Used by flow tables that do not have ordering requirements.
1807  */
1808 /* ARGSUSED */
1809 static int
1810 flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
1811     flow_entry_t *flent)
1812 {
1813 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
1814 
1815 	if (*headp != NULL) {
1816 		ASSERT(flent->fe_next == NULL);
1817 		flent->fe_next = *headp;
1818 	}
1819 	*headp = flent;
1820 	return (0);
1821 }
1822 
1823 /*
1824  * IP version independent DSField matching function.
1825  */
1826 /* ARGSUSED */
1827 static boolean_t
1828 flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1829 {
1830 	flow_l3info_t	*l3info = &s->fs_l3info;
1831 	flow_desc_t	*fd = &flent->fe_flow_desc;
1832 
1833 	switch (l3info->l3_version) {
1834 	case IPV4_VERSION: {
1835 		ipha_t		*ipha = (ipha_t *)l3info->l3_start;
1836 
1837 		return ((ipha->ipha_type_of_service &
1838 		    fd->fd_dsfield_mask) == fd->fd_dsfield);
1839 	}
1840 	case IPV6_VERSION: {
1841 		ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
1842 
1843 		return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
1844 		    fd->fd_dsfield_mask) == fd->fd_dsfield);
1845 	}
1846 	default:
1847 		return (B_FALSE);
1848 	}
1849 }
1850 
1851 /*
1852  * IP v4 and v6 address matching.
1853  * The netmask only needs to be applied on the packet but not on the
1854  * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets.
1855  */
1856 
1857 /* ARGSUSED */
1858 static boolean_t
1859 flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1860 {
1861 	flow_l3info_t	*l3info = &s->fs_l3info;
1862 	flow_desc_t	*fd = &flent->fe_flow_desc;
1863 	ipha_t		*ipha = (ipha_t *)l3info->l3_start;
1864 	in_addr_t	addr;
1865 
1866 	addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
1867 	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1868 		return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
1869 		    V4_PART_OF_V6(fd->fd_local_addr));
1870 	}
1871 	return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
1872 	    V4_PART_OF_V6(fd->fd_remote_addr));
1873 }
1874 
1875 /* ARGSUSED */
1876 static boolean_t
1877 flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1878 {
1879 	flow_l3info_t	*l3info = &s->fs_l3info;
1880 	flow_desc_t	*fd = &flent->fe_flow_desc;
1881 	ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
1882 	in6_addr_t	*addrp;
1883 
1884 	addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
1885 	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1886 		return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
1887 		    fd->fd_local_addr));
1888 	}
1889 	return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
1890 }
1891 
1892 /* ARGSUSED */
1893 static boolean_t
1894 flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1895 {
1896 	flow_l3info_t	*l3info = &s->fs_l3info;
1897 	flow_desc_t	*fd = &flent->fe_flow_desc;
1898 
1899 	return (l3info->l3_protocol == fd->fd_protocol);
1900 }
1901 
1902 static uint32_t
1903 flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
1904 {
1905 	flow_l3info_t	*l3info = &s->fs_l3info;
1906 	flow_mask_t	mask = ft->ft_mask;
1907 
1908 	if ((mask & FLOW_IP_LOCAL) != 0) {
1909 		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
1910 	} else if ((mask & FLOW_IP_REMOTE) != 0) {
1911 		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
1912 	} else if ((mask & FLOW_IP_DSFIELD) != 0) {
1913 		/*
1914 		 * DSField flents are arranged as a single list.
1915 		 */
1916 		return (0);
1917 	}
1918 	/*
1919 	 * IP addr flents are hashed into two lists, v4 or v6.
1920 	 */
1921 	ASSERT(ft->ft_size >= 2);
1922 	return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
1923 }
1924 
1925 static uint32_t
1926 flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
1927 {
1928 	flow_l3info_t	*l3info = &s->fs_l3info;
1929 
1930 	return (l3info->l3_protocol % ft->ft_size);
1931 }
1932 
1933 /* ARGSUSED */
1934 static int
1935 flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
1936 {
1937 	flow_l2info_t	*l2info = &s->fs_l2info;
1938 	flow_l3info_t	*l3info = &s->fs_l3info;
1939 	uint16_t	sap = l2info->l2_sap;
1940 	uchar_t		*l3_start;
1941 
1942 	l3_start = l2info->l2_start + l2info->l2_hdrsize;
1943 
1944 	/*
1945 	 * Adjust start pointer if we're at the end of an mblk.
1946 	 */
1947 	CHECK_AND_ADJUST_START_PTR(s, l3_start);
1948 
1949 	l3info->l3_start = l3_start;
1950 	if (!OK_32PTR(l3_start))
1951 		return (EINVAL);
1952 
1953 	switch (sap) {
1954 	case ETHERTYPE_IP: {
1955 		ipha_t	*ipha = (ipha_t *)l3_start;
1956 
1957 		if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
1958 			return (ENOBUFS);
1959 
1960 		l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
1961 		l3info->l3_protocol = ipha->ipha_protocol;
1962 		l3info->l3_version = IPV4_VERSION;
1963 		l3info->l3_fragmented =
1964 		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
1965 		break;
1966 	}
1967 	case ETHERTYPE_IPV6: {
1968 		ip6_t		*ip6h = (ip6_t *)l3_start;
1969 		ip6_frag_t	*frag = NULL;
1970 		uint16_t	ip6_hdrlen;
1971 		uint8_t		nexthdr;
1972 
1973 		if (!mac_ip_hdr_length_v6(ip6h, s->fs_mp->b_wptr, &ip6_hdrlen,
1974 		    &nexthdr, &frag)) {
1975 			return (ENOBUFS);
1976 		}
1977 		l3info->l3_hdrsize = ip6_hdrlen;
1978 		l3info->l3_protocol = nexthdr;
1979 		l3info->l3_version = IPV6_VERSION;
1980 		l3info->l3_fragmented = (frag != NULL);
1981 		break;
1982 	}
1983 	default:
1984 		return (EINVAL);
1985 	}
1986 	return (0);
1987 }
1988 
1989 /* ARGSUSED */
1990 static int
1991 flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1992 {
1993 	flow_desc_t	*fd = &flent->fe_flow_desc;
1994 
1995 	switch (fd->fd_protocol) {
1996 	case IPPROTO_TCP:
1997 	case IPPROTO_UDP:
1998 	case IPPROTO_SCTP:
1999 	case IPPROTO_ICMP:
2000 	case IPPROTO_ICMPV6:
2001 		flent->fe_match = flow_ip_proto_match;
2002 		return (0);
2003 	default:
2004 		return (EINVAL);
2005 	}
2006 }
2007 
2008 /* ARGSUSED */
2009 static int
2010 flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2011 {
2012 	flow_desc_t	*fd = &flent->fe_flow_desc;
2013 	flow_mask_t	mask;
2014 	uint8_t		version;
2015 	in6_addr_t	*addr, *netmask;
2016 
2017 	/*
2018 	 * DSField does not require a IP version.
2019 	 */
2020 	if (fd->fd_mask == FLOW_IP_DSFIELD) {
2021 		if (fd->fd_dsfield_mask == 0)
2022 			return (EINVAL);
2023 
2024 		flent->fe_match = flow_ip_dsfield_match;
2025 		return (0);
2026 	}
2027 
2028 	/*
2029 	 * IP addresses must come with a version to avoid ambiguity.
2030 	 */
2031 	if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
2032 		return (EINVAL);
2033 
2034 	version = fd->fd_ipversion;
2035 	if (version != IPV4_VERSION && version != IPV6_VERSION)
2036 		return (EINVAL);
2037 
2038 	mask = fd->fd_mask & ~FLOW_IP_VERSION;
2039 	switch (mask) {
2040 	case FLOW_IP_LOCAL:
2041 		addr = &fd->fd_local_addr;
2042 		netmask = &fd->fd_local_netmask;
2043 		break;
2044 	case FLOW_IP_REMOTE:
2045 		addr = &fd->fd_remote_addr;
2046 		netmask = &fd->fd_remote_netmask;
2047 		break;
2048 	default:
2049 		return (EINVAL);
2050 	}
2051 
2052 	/*
2053 	 * Apply netmask onto specified address.
2054 	 */
2055 	V6_MASK_COPY(*addr, *netmask, *addr);
2056 	if (version == IPV4_VERSION) {
2057 		ipaddr_t	v4addr = V4_PART_OF_V6((*addr));
2058 		ipaddr_t	v4mask = V4_PART_OF_V6((*netmask));
2059 
2060 		if (v4addr == 0 || v4mask == 0)
2061 			return (EINVAL);
2062 		flent->fe_match = flow_ip_v4_match;
2063 	} else {
2064 		if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
2065 		    IN6_IS_ADDR_UNSPECIFIED(netmask))
2066 			return (EINVAL);
2067 		flent->fe_match = flow_ip_v6_match;
2068 	}
2069 	return (0);
2070 }
2071 
2072 static uint32_t
2073 flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2074 {
2075 	flow_desc_t	*fd = &flent->fe_flow_desc;
2076 
2077 	return (fd->fd_protocol % ft->ft_size);
2078 }
2079 
2080 static uint32_t
2081 flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2082 {
2083 	flow_desc_t	*fd = &flent->fe_flow_desc;
2084 
2085 	/*
2086 	 * DSField flents are arranged as a single list.
2087 	 */
2088 	if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2089 		return (0);
2090 
2091 	/*
2092 	 * IP addr flents are hashed into two lists, v4 or v6.
2093 	 */
2094 	ASSERT(ft->ft_size >= 2);
2095 	return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
2096 }
2097 
2098 /* ARGSUSED */
2099 static boolean_t
2100 flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2101 {
2102 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2103 
2104 	return (fd1->fd_protocol == fd2->fd_protocol);
2105 }
2106 
2107 /* ARGSUSED */
2108 static boolean_t
2109 flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2110 {
2111 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2112 	in6_addr_t	*a1, *m1, *a2, *m2;
2113 
2114 	ASSERT(fd1->fd_mask == fd2->fd_mask);
2115 	if (fd1->fd_mask == FLOW_IP_DSFIELD) {
2116 		return (fd1->fd_dsfield == fd2->fd_dsfield &&
2117 		    fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
2118 	}
2119 
2120 	/*
2121 	 * flow_ip_accept_fe() already validated the version.
2122 	 */
2123 	ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
2124 	if (fd1->fd_ipversion != fd2->fd_ipversion)
2125 		return (B_FALSE);
2126 
2127 	switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
2128 	case FLOW_IP_LOCAL:
2129 		a1 = &fd1->fd_local_addr;
2130 		m1 = &fd1->fd_local_netmask;
2131 		a2 = &fd2->fd_local_addr;
2132 		m2 = &fd2->fd_local_netmask;
2133 		break;
2134 	case FLOW_IP_REMOTE:
2135 		a1 = &fd1->fd_remote_addr;
2136 		m1 = &fd1->fd_remote_netmask;
2137 		a2 = &fd2->fd_remote_addr;
2138 		m2 = &fd2->fd_remote_netmask;
2139 		break;
2140 	default:
2141 		/*
2142 		 * This is unreachable given the checks in
2143 		 * flow_ip_accept_fe().
2144 		 */
2145 		return (B_FALSE);
2146 	}
2147 
2148 	if (fd1->fd_ipversion == IPV4_VERSION) {
2149 		return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
2150 		    V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));
2151 
2152 	} else {
2153 		return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
2154 		    IN6_ARE_ADDR_EQUAL(m1, m2));
2155 	}
2156 }
2157 
2158 static int
2159 flow_ip_mask2plen(in6_addr_t *v6mask)
2160 {
2161 	int		bits;
2162 	int		plen = IPV6_ABITS;
2163 	int		i;
2164 
2165 	for (i = 3; i >= 0; i--) {
2166 		if (v6mask->s6_addr32[i] == 0) {
2167 			plen -= 32;
2168 			continue;
2169 		}
2170 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
2171 		if (bits == 0)
2172 			break;
2173 		plen -= bits;
2174 	}
2175 	return (plen);
2176 }
2177 
2178 /* ARGSUSED */
2179 static int
2180 flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
2181     flow_entry_t *flent)
2182 {
2183 	flow_entry_t	**p = headp;
2184 	flow_desc_t	*fd0, *fd;
2185 	in6_addr_t	*m0, *m;
2186 	int		plen0, plen;
2187 
2188 	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
2189 
2190 	/*
2191 	 * No special ordering needed for dsfield.
2192 	 */
2193 	fd0 = &flent->fe_flow_desc;
2194 	if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
2195 		if (*p != NULL) {
2196 			ASSERT(flent->fe_next == NULL);
2197 			flent->fe_next = *p;
2198 		}
2199 		*p = flent;
2200 		return (0);
2201 	}
2202 
2203 	/*
2204 	 * IP address flows are arranged in descending prefix length order.
2205 	 */
2206 	m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
2207 	    &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
2208 	plen0 = flow_ip_mask2plen(m0);
2209 	ASSERT(plen0 != 0);
2210 
2211 	for (; *p != NULL; p = &(*p)->fe_next) {
2212 		fd = &(*p)->fe_flow_desc;
2213 
2214 		/*
2215 		 * Normally a dsfield flent shouldn't end up on the same
2216 		 * list as an IP address because flow tables are (for now)
2217 		 * disjoint. If we decide to support both IP and dsfield
2218 		 * in the same table in the future, this check will allow
2219 		 * for that.
2220 		 */
2221 		if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2222 			continue;
2223 
2224 		/*
2225 		 * We also allow for the mixing of local and remote address
2226 		 * flents within one list.
2227 		 */
2228 		m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
2229 		    &fd->fd_local_netmask : &fd->fd_remote_netmask;
2230 		plen = flow_ip_mask2plen(m);
2231 
2232 		if (plen <= plen0)
2233 			break;
2234 	}
2235 	if (*p != NULL) {
2236 		ASSERT(flent->fe_next == NULL);
2237 		flent->fe_next = *p;
2238 	}
2239 	*p = flent;
2240 	return (0);
2241 }
2242 
2243 /*
2244  * Transport layer protocol and port matching functions.
2245  */
2246 
2247 /* ARGSUSED */
2248 static boolean_t
2249 flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2250 {
2251 	flow_l3info_t	*l3info = &s->fs_l3info;
2252 	flow_l4info_t	*l4info = &s->fs_l4info;
2253 	flow_desc_t	*fd = &flent->fe_flow_desc;
2254 
2255 	return (fd->fd_protocol == l3info->l3_protocol &&
2256 	    fd->fd_local_port == l4info->l4_hash_port);
2257 }
2258 
2259 /* ARGSUSED */
2260 static boolean_t
2261 flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2262 {
2263 	flow_l3info_t	*l3info = &s->fs_l3info;
2264 	flow_l4info_t	*l4info = &s->fs_l4info;
2265 	flow_desc_t	*fd = &flent->fe_flow_desc;
2266 
2267 	return (fd->fd_protocol == l3info->l3_protocol &&
2268 	    fd->fd_remote_port == l4info->l4_hash_port);
2269 }
2270 
2271 /*
2272  * Transport hash function.
2273  * Since we only support either local or remote port flows,
2274  * we only need to extract one of the ports to be used for
2275  * matching.
2276  */
2277 static uint32_t
2278 flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
2279 {
2280 	flow_l3info_t	*l3info = &s->fs_l3info;
2281 	flow_l4info_t	*l4info = &s->fs_l4info;
2282 	uint8_t		proto = l3info->l3_protocol;
2283 	boolean_t	dst_or_src;
2284 
2285 	if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
2286 		dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
2287 	} else {
2288 		dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
2289 	}
2290 
2291 	l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
2292 	    l4info->l4_src_port;
2293 
2294 	return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
2295 }
2296 
2297 /*
2298  * Unlike other accept() functions above, we do not need to get the header
2299  * size because this is our highest layer so far. If we want to do support
2300  * other higher layer protocols, we would need to save the l4_hdrsize
2301  * in the code below.
2302  */
2303 
2304 /* ARGSUSED */
2305 static int
2306 flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
2307 {
2308 	flow_l3info_t	*l3info = &s->fs_l3info;
2309 	flow_l4info_t	*l4info = &s->fs_l4info;
2310 	uint8_t		proto = l3info->l3_protocol;
2311 	uchar_t		*l4_start;
2312 
2313 	l4_start = l3info->l3_start + l3info->l3_hdrsize;
2314 
2315 	/*
2316 	 * Adjust start pointer if we're at the end of an mblk.
2317 	 */
2318 	CHECK_AND_ADJUST_START_PTR(s, l4_start);
2319 
2320 	l4info->l4_start = l4_start;
2321 	if (!OK_32PTR(l4_start))
2322 		return (EINVAL);
2323 
2324 	if (l3info->l3_fragmented == B_TRUE)
2325 		return (EINVAL);
2326 
2327 	switch (proto) {
2328 	case IPPROTO_TCP: {
2329 		struct tcphdr	*tcph = (struct tcphdr *)l4_start;
2330 
2331 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
2332 			return (ENOBUFS);
2333 
2334 		l4info->l4_src_port = tcph->th_sport;
2335 		l4info->l4_dst_port = tcph->th_dport;
2336 		break;
2337 	}
2338 	case IPPROTO_UDP: {
2339 		struct udphdr	*udph = (struct udphdr *)l4_start;
2340 
2341 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
2342 			return (ENOBUFS);
2343 
2344 		l4info->l4_src_port = udph->uh_sport;
2345 		l4info->l4_dst_port = udph->uh_dport;
2346 		break;
2347 	}
2348 	case IPPROTO_SCTP: {
2349 		sctp_hdr_t	*sctph = (sctp_hdr_t *)l4_start;
2350 
2351 		if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
2352 			return (ENOBUFS);
2353 
2354 		l4info->l4_src_port = sctph->sh_sport;
2355 		l4info->l4_dst_port = sctph->sh_dport;
2356 		break;
2357 	}
2358 	default:
2359 		return (EINVAL);
2360 	}
2361 
2362 	return (0);
2363 }
2364 
2365 /*
2366  * Validates transport flow entry.
2367  * The protocol field must be present.
2368  */
2369 
2370 /* ARGSUSED */
2371 static int
2372 flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2373 {
2374 	flow_desc_t	*fd = &flent->fe_flow_desc;
2375 	flow_mask_t	mask = fd->fd_mask;
2376 
2377 	if ((mask & FLOW_IP_PROTOCOL) == 0)
2378 		return (EINVAL);
2379 
2380 	switch (fd->fd_protocol) {
2381 	case IPPROTO_TCP:
2382 	case IPPROTO_UDP:
2383 	case IPPROTO_SCTP:
2384 		break;
2385 	default:
2386 		return (EINVAL);
2387 	}
2388 
2389 	switch (mask & ~FLOW_IP_PROTOCOL) {
2390 	case FLOW_ULP_PORT_LOCAL:
2391 		if (fd->fd_local_port == 0)
2392 			return (EINVAL);
2393 
2394 		flent->fe_match = flow_transport_lport_match;
2395 		break;
2396 	case FLOW_ULP_PORT_REMOTE:
2397 		if (fd->fd_remote_port == 0)
2398 			return (EINVAL);
2399 
2400 		flent->fe_match = flow_transport_rport_match;
2401 		break;
2402 	case 0:
2403 		/*
2404 		 * transport-only flows conflicts with our table type.
2405 		 */
2406 		return (EOPNOTSUPP);
2407 	default:
2408 		return (EINVAL);
2409 	}
2410 
2411 	return (0);
2412 }
2413 
2414 static uint32_t
2415 flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2416 {
2417 	flow_desc_t	*fd = &flent->fe_flow_desc;
2418 	uint16_t	port = 0;
2419 
2420 	port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
2421 	    fd->fd_local_port : fd->fd_remote_port;
2422 
2423 	return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
2424 }
2425 
2426 /* ARGSUSED */
2427 static boolean_t
2428 flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2429 {
2430 	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2431 
2432 	if (fd1->fd_protocol != fd2->fd_protocol)
2433 		return (B_FALSE);
2434 
2435 	if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
2436 		return (fd1->fd_local_port == fd2->fd_local_port);
2437 
2438 	if ((fd1->fd_mask & FLOW_ULP_PORT_REMOTE) != 0)
2439 		return (fd1->fd_remote_port == fd2->fd_remote_port);
2440 
2441 	return (B_TRUE);
2442 }
2443 
/*
 * Operations vector built from the flow_l2_* routines plus the generic
 * insert routine. Members are initialized positionally; judging by the
 * routines supplied across the ops vectors in this file they are, in
 * order: flow entry accept, flow entry hash, flow entry match, flow
 * entry insert, packet hash, and the list of packet accept routines
 * (here only flow_l2_accept).
 */
static flow_ops_t flow_l2_ops = {
	flow_l2_accept_fe,
	flow_l2_hash_fe,
	flow_l2_match_fe,
	flow_generic_insert_fe,
	flow_l2_hash,
	{flow_l2_accept}
};
2452 
/*
 * Operations vector built from the flow_ip_* routines. Unlike the other
 * vectors in this file it supplies its own insert routine,
 * flow_ip_insert_fe(), which keeps address flows in descending prefix
 * length order. The packet accept list is flow_l2_accept followed by
 * flow_ip_accept.
 */
static flow_ops_t flow_ip_ops = {
	flow_ip_accept_fe,
	flow_ip_hash_fe,
	flow_ip_match_fe,
	flow_ip_insert_fe,
	flow_ip_hash,
	{flow_l2_accept, flow_ip_accept}
};
2461 
/*
 * Operations vector built from the flow_ip_proto_* routines plus the
 * generic insert routine. The packet accept list is flow_l2_accept
 * followed by flow_ip_accept.
 */
static flow_ops_t flow_ip_proto_ops = {
	flow_ip_proto_accept_fe,
	flow_ip_proto_hash_fe,
	flow_ip_proto_match_fe,
	flow_generic_insert_fe,
	flow_ip_proto_hash,
	{flow_l2_accept, flow_ip_accept}
};
2470 
/*
 * Operations vector built from the flow_transport_* routines plus the
 * generic insert routine. The packet accept list is flow_l2_accept,
 * flow_ip_accept and flow_transport_accept, in that order.
 */
static flow_ops_t flow_transport_ops = {
	flow_transport_accept_fe,
	flow_transport_hash_fe,
	flow_transport_match_fe,
	flow_generic_insert_fe,
	flow_transport_hash,
	{flow_l2_accept, flow_ip_accept, flow_transport_accept}
};
2479 
/*
 * Table of supported flow table types, searched by
 * mac_flow_tab_info_get(). Each entry gives an ops vector, the exact
 * flow mask it serves (fti_mask), and a third value.
 * NOTE(review): the third value is presumably the table size (number of
 * hash buckets) -- confirm against the flow_tab_info_t definition.
 */
static flow_tab_info_t flow_tab_info_list[] = {
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
	{&flow_ip_ops, FLOW_IP_DSFIELD, 1},
	{&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024},
	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_REMOTE, 1024}
};
2488 
/* Number of entries in flow_tab_info_list. */
#define	FLOW_MAX_TAB_INFO \
	(sizeof (flow_tab_info_list) / sizeof (flow_tab_info_list[0]))
2491 
2492 static flow_tab_info_t *
2493 mac_flow_tab_info_get(flow_mask_t mask)
2494 {
2495 	int	i;
2496 
2497 	for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
2498 		if (mask == flow_tab_info_list[i].fti_mask)
2499 			return (&flow_tab_info_list[i]);
2500 	}
2501 	return (NULL);
2502 }
2503