1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #include <sys/strsun.h>
28 #include <sys/sdt.h>
29 #include <sys/mac.h>
30 #include <sys/mac_impl.h>
31 #include <sys/mac_client_impl.h>
32 #include <sys/mac_stat.h>
33 #include <sys/dls.h>
34 #include <sys/dls_impl.h>
35 #include <sys/mac_soft_ring.h>
36 #include <sys/ethernet.h>
37 #include <sys/cpupart.h>
38 #include <sys/pool.h>
39 #include <sys/pool_pset.h>
40 #include <sys/vlan.h>
41 #include <inet/ip.h>
42 #include <inet/ip6.h>
43 #include <netinet/tcp.h>
44 #include <netinet/udp.h>
45 #include <netinet/sctp.h>
46
47 typedef struct flow_stats_s {
48 uint64_t fs_obytes;
49 uint64_t fs_opackets;
50 uint64_t fs_oerrors;
51 uint64_t fs_ibytes;
52 uint64_t fs_ipackets;
53 uint64_t fs_ierrors;
54 } flow_stats_t;
55
56
57 /* global flow table, will be a per exclusive-zone table later */
58 static mod_hash_t *flow_hash;
59 static krwlock_t flow_tab_lock;
60
61 static kmem_cache_t *flow_cache;
62 static kmem_cache_t *flow_tab_cache;
63 static flow_ops_t flow_l2_ops;
64
65 typedef struct {
66 const char *fs_name;
67 uint_t fs_offset;
68 } flow_stats_info_t;
69
70 #define FS_OFF(f) (offsetof(flow_stats_t, f))
71 static flow_stats_info_t flow_stats_list[] = {
72 {"rbytes", FS_OFF(fs_ibytes)},
73 {"ipackets", FS_OFF(fs_ipackets)},
74 {"ierrors", FS_OFF(fs_ierrors)},
75 {"obytes", FS_OFF(fs_obytes)},
76 {"opackets", FS_OFF(fs_opackets)},
77 {"oerrors", FS_OFF(fs_oerrors)}
78 };
79 #define FS_SIZE (sizeof (flow_stats_list) / sizeof (flow_stats_info_t))
80
81 /*
82 * Checks whether a flow mask is legal.
83 */
84 static flow_tab_info_t *mac_flow_tab_info_get(flow_mask_t);
85
86 static void
flow_stat_init(kstat_named_t * knp)87 flow_stat_init(kstat_named_t *knp)
88 {
89 int i;
90
91 for (i = 0; i < FS_SIZE; i++, knp++) {
92 kstat_named_init(knp, flow_stats_list[i].fs_name,
93 KSTAT_DATA_UINT64);
94 }
95 }
96
97 static int
flow_stat_update(kstat_t * ksp,int rw)98 flow_stat_update(kstat_t *ksp, int rw)
99 {
100 flow_entry_t *fep = ksp->ks_private;
101 kstat_named_t *knp = ksp->ks_data;
102 uint64_t *statp;
103 int i;
104 mac_rx_stats_t *mac_rx_stat;
105 mac_tx_stats_t *mac_tx_stat;
106 flow_stats_t flow_stats;
107 mac_soft_ring_set_t *mac_srs;
108
109 if (rw != KSTAT_READ)
110 return (EACCES);
111
112 bzero(&flow_stats, sizeof (flow_stats_t));
113
114 for (i = 0; i < fep->fe_rx_srs_cnt; i++) {
115 mac_srs = (mac_soft_ring_set_t *)fep->fe_rx_srs[i];
116 if (mac_srs == NULL) /* Multicast flow */
117 break;
118 mac_rx_stat = &mac_srs->srs_rx.sr_stat;
119
120 flow_stats.fs_ibytes += mac_rx_stat->mrs_intrbytes +
121 mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes;
122
123 flow_stats.fs_ipackets += mac_rx_stat->mrs_intrcnt +
124 mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
125
126 flow_stats.fs_ierrors += mac_rx_stat->mrs_ierrors;
127 }
128
129 mac_srs = (mac_soft_ring_set_t *)fep->fe_tx_srs;
130 if (mac_srs == NULL) /* Multicast flow */
131 goto done;
132 mac_tx_stat = &mac_srs->srs_tx.st_stat;
133
134 flow_stats.fs_obytes = mac_tx_stat->mts_obytes;
135 flow_stats.fs_opackets = mac_tx_stat->mts_opackets;
136 flow_stats.fs_oerrors = mac_tx_stat->mts_oerrors;
137
138 done:
139 for (i = 0; i < FS_SIZE; i++, knp++) {
140 statp = (uint64_t *)
141 ((uchar_t *)&flow_stats + flow_stats_list[i].fs_offset);
142 knp->value.ui64 = *statp;
143 }
144 return (0);
145 }
146
147 static void
flow_stat_create(flow_entry_t * fep)148 flow_stat_create(flow_entry_t *fep)
149 {
150 kstat_t *ksp;
151 kstat_named_t *knp;
152 uint_t nstats = FS_SIZE;
153
154 /*
155 * Fow now, flow entries are only manipulated and visible from the
156 * global zone.
157 */
158 ksp = kstat_create_zone("unix", 0, (char *)fep->fe_flow_name, "flow",
159 KSTAT_TYPE_NAMED, nstats, 0, GLOBAL_ZONEID);
160 if (ksp == NULL)
161 return;
162
163 ksp->ks_update = flow_stat_update;
164 ksp->ks_private = fep;
165 fep->fe_ksp = ksp;
166
167 knp = (kstat_named_t *)ksp->ks_data;
168 flow_stat_init(knp);
169 kstat_install(ksp);
170 }
171
172 void
flow_stat_destroy(flow_entry_t * fep)173 flow_stat_destroy(flow_entry_t *fep)
174 {
175 if (fep->fe_ksp != NULL) {
176 kstat_delete(fep->fe_ksp);
177 fep->fe_ksp = NULL;
178 }
179 }
180
181 /*
182 * Initialize the flow table
183 */
184 void
mac_flow_init()185 mac_flow_init()
186 {
187 flow_cache = kmem_cache_create("flow_entry_cache",
188 sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
189 flow_tab_cache = kmem_cache_create("flow_tab_cache",
190 sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
191 flow_hash = mod_hash_create_extended("flow_hash",
192 100, mod_hash_null_keydtor, mod_hash_null_valdtor,
193 mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
194 rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
195 }
196
197 /*
198 * Cleanup and release the flow table
199 */
200 void
mac_flow_fini()201 mac_flow_fini()
202 {
203 kmem_cache_destroy(flow_cache);
204 kmem_cache_destroy(flow_tab_cache);
205 mod_hash_destroy_hash(flow_hash);
206 rw_destroy(&flow_tab_lock);
207 }
208
209 /*
210 * mac_create_flow(): create a flow_entry_t.
211 */
212 int
mac_flow_create(flow_desc_t * fd,mac_resource_props_t * mrp,char * name,void * client_cookie,uint_t type,flow_entry_t ** flentp)213 mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
214 void *client_cookie, uint_t type, flow_entry_t **flentp)
215 {
216 flow_entry_t *flent = *flentp;
217 int err = 0;
218
219 if (mrp != NULL) {
220 err = mac_validate_props(NULL, mrp);
221 if (err != 0)
222 return (err);
223 }
224
225 if (flent == NULL) {
226 flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
227 bzero(flent, sizeof (*flent));
228 mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
229 cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);
230
231 /* Initialize the receiver function to a safe routine */
232 flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
233 flent->fe_index = -1;
234 }
235 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
236
237 /* This is an initial flow, will be configured later */
238 if (fd == NULL) {
239 *flentp = flent;
240 return (0);
241 }
242
243 flent->fe_client_cookie = client_cookie;
244 flent->fe_type = type;
245
246 /* Save flow desc */
247 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
248
249 if (mrp != NULL) {
250 /*
251 * We have already set fe_resource_props for a Link.
252 */
253 if (type & FLOW_USER) {
254 bcopy(mrp, &flent->fe_resource_props,
255 sizeof (mac_resource_props_t));
256 }
257 /*
258 * The effective resource list should reflect the priority
259 * that we set implicitly.
260 */
261 if (!(mrp->mrp_mask & MRP_PRIORITY))
262 mrp->mrp_mask |= MRP_PRIORITY;
263 if (type & FLOW_USER)
264 mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
265 else
266 mrp->mrp_priority = MPL_LINK_DEFAULT;
267 bzero(mrp->mrp_pool, MAXPATHLEN);
268 bzero(&mrp->mrp_cpus, sizeof (mac_cpus_t));
269 bcopy(mrp, &flent->fe_effective_props,
270 sizeof (mac_resource_props_t));
271 }
272 flow_stat_create(flent);
273
274 *flentp = flent;
275 return (0);
276 }
277
278 /*
279 * Validate flow entry and add it to a flow table.
280 */
281 int
mac_flow_add(flow_tab_t * ft,flow_entry_t * flent)282 mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
283 {
284 flow_entry_t **headp, **p;
285 flow_ops_t *ops = &ft->ft_ops;
286 flow_mask_t mask;
287 uint32_t index;
288 int err;
289
290 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
291
292 /*
293 * Check for invalid bits in mask.
294 */
295 mask = flent->fe_flow_desc.fd_mask;
296 if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
297 return (EOPNOTSUPP);
298
299 /*
300 * Validate flent.
301 */
302 if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
303 DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
304 flow_entry_t *, flent, int, err);
305 return (err);
306 }
307
308 /*
309 * Flent is valid. now calculate hash and insert it
310 * into hash table.
311 */
312 index = ops->fo_hash_fe(ft, flent);
313
314 /*
315 * We do not need a lock up until now because we were
316 * not accessing the flow table.
317 */
318 rw_enter(&ft->ft_lock, RW_WRITER);
319 headp = &ft->ft_table[index];
320
321 /*
322 * Check for duplicate flow.
323 */
324 for (p = headp; *p != NULL; p = &(*p)->fe_next) {
325 if ((*p)->fe_flow_desc.fd_mask !=
326 flent->fe_flow_desc.fd_mask)
327 continue;
328
329 if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
330 rw_exit(&ft->ft_lock);
331 DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
332 flow_entry_t *, flent, int, err);
333 return (EALREADY);
334 }
335 }
336
337 /*
338 * Insert flow to hash list.
339 */
340 err = ops->fo_insert_fe(ft, headp, flent);
341 if (err != 0) {
342 rw_exit(&ft->ft_lock);
343 DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
344 flow_entry_t *, flent, int, err);
345 return (err);
346 }
347
348 /*
349 * Save the hash index so it can be used by mac_flow_remove().
350 */
351 flent->fe_index = (int)index;
352
353 /*
354 * Save the flow tab back reference.
355 */
356 flent->fe_flow_tab = ft;
357 FLOW_MARK(flent, FE_FLOW_TAB);
358 ft->ft_flow_count++;
359 rw_exit(&ft->ft_lock);
360 return (0);
361 }
362
363 /*
364 * Remove a flow from a mac client's subflow table
365 */
366 void
mac_flow_rem_subflow(flow_entry_t * flent)367 mac_flow_rem_subflow(flow_entry_t *flent)
368 {
369 flow_tab_t *ft = flent->fe_flow_tab;
370 mac_client_impl_t *mcip = ft->ft_mcip;
371 mac_handle_t mh = (mac_handle_t)ft->ft_mip;
372
373 ASSERT(MAC_PERIM_HELD(mh));
374
375 mac_flow_remove(ft, flent, B_FALSE);
376 if (flent->fe_mcip == NULL) {
377 /*
378 * The interface is not yet plumbed and mac_client_flow_add
379 * was not done.
380 */
381 if (FLOW_TAB_EMPTY(ft)) {
382 mac_flow_tab_destroy(ft);
383 mcip->mci_subflow_tab = NULL;
384 }
385 } else {
386 mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
387 mac_link_flow_clean((mac_client_handle_t)mcip, flent);
388 }
389 mac_fastpath_enable(mh);
390 }
391
392 /*
393 * Add a flow to a mac client's subflow table and instantiate the flow
394 * in the mac by creating the associated SRSs etc.
395 */
396 int
mac_flow_add_subflow(mac_client_handle_t mch,flow_entry_t * flent,boolean_t instantiate_flow)397 mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
398 boolean_t instantiate_flow)
399 {
400 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
401 mac_handle_t mh = (mac_handle_t)mcip->mci_mip;
402 flow_tab_info_t *ftinfo;
403 flow_mask_t mask;
404 flow_tab_t *ft;
405 int err;
406 boolean_t ft_created = B_FALSE;
407
408 ASSERT(MAC_PERIM_HELD(mh));
409
410 if ((err = mac_fastpath_disable(mh)) != 0)
411 return (err);
412
413 /*
414 * If the subflow table exists already just add the new subflow
415 * to the existing table, else we create a new subflow table below.
416 */
417 ft = mcip->mci_subflow_tab;
418 if (ft == NULL) {
419 mask = flent->fe_flow_desc.fd_mask;
420 /*
421 * Try to create a new table and then add the subflow to the
422 * newly created subflow table
423 */
424 if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) {
425 mac_fastpath_enable(mh);
426 return (EOPNOTSUPP);
427 }
428
429 mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
430 mcip->mci_mip, &ft);
431 ft_created = B_TRUE;
432 }
433
434 err = mac_flow_add(ft, flent);
435 if (err != 0) {
436 if (ft_created)
437 mac_flow_tab_destroy(ft);
438 mac_fastpath_enable(mh);
439 return (err);
440 }
441
442 if (instantiate_flow) {
443 /* Now activate the flow by creating its SRSs */
444 ASSERT(MCIP_DATAPATH_SETUP(mcip));
445 err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
446 if (err != 0) {
447 mac_flow_remove(ft, flent, B_FALSE);
448 if (ft_created)
449 mac_flow_tab_destroy(ft);
450 mac_fastpath_enable(mh);
451 return (err);
452 }
453 } else {
454 FLOW_MARK(flent, FE_UF_NO_DATAPATH);
455 }
456 if (ft_created) {
457 ASSERT(mcip->mci_subflow_tab == NULL);
458 ft->ft_mcip = mcip;
459 mcip->mci_subflow_tab = ft;
460 if (instantiate_flow)
461 mac_client_update_classifier(mcip, B_TRUE);
462 }
463 return (0);
464 }
465
466 /*
467 * Remove flow entry from flow table.
468 */
469 void
mac_flow_remove(flow_tab_t * ft,flow_entry_t * flent,boolean_t temp)470 mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
471 {
472 flow_entry_t **fp;
473
474 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
475 if (!(flent->fe_flags & FE_FLOW_TAB))
476 return;
477
478 rw_enter(&ft->ft_lock, RW_WRITER);
479 /*
480 * If this is a permanent removal from the flow table, mark it
481 * CONDEMNED to prevent future references. If this is a temporary
482 * removal from the table, say to update the flow descriptor then
483 * we don't mark it CONDEMNED
484 */
485 if (!temp)
486 FLOW_MARK(flent, FE_CONDEMNED);
487 /*
488 * Locate the specified flent.
489 */
490 fp = &ft->ft_table[flent->fe_index];
491 while (*fp != flent)
492 fp = &(*fp)->fe_next;
493
494 /*
495 * The flent must exist. Otherwise it's a bug.
496 */
497 ASSERT(fp != NULL);
498 *fp = flent->fe_next;
499 flent->fe_next = NULL;
500
501 /*
502 * Reset fe_index to -1 so any attempt to call mac_flow_remove()
503 * on a flent that is supposed to be in the table (FE_FLOW_TAB)
504 * will panic.
505 */
506 flent->fe_index = -1;
507 FLOW_UNMARK(flent, FE_FLOW_TAB);
508 ft->ft_flow_count--;
509 rw_exit(&ft->ft_lock);
510 }
511
512 /*
513 * This is the flow lookup routine used by the mac sw classifier engine.
514 */
515 int
mac_flow_lookup(flow_tab_t * ft,mblk_t * mp,uint_t flags,flow_entry_t ** flentp)516 mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
517 {
518 flow_state_t s;
519 flow_entry_t *flent;
520 flow_ops_t *ops = &ft->ft_ops;
521 boolean_t retried = B_FALSE;
522 int i, err;
523
524 s.fs_flags = flags;
525 retry:
526 s.fs_mp = mp;
527
528 /*
529 * Walk the list of predeclared accept functions.
530 * Each of these would accumulate enough state to allow the next
531 * accept routine to make progress.
532 */
533 for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
534 if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
535 mblk_t *last;
536
537 /*
538 * ENOBUFS indicates that the mp could be too short
539 * and may need a pullup.
540 */
541 if (err != ENOBUFS || retried)
542 return (err);
543
544 /*
545 * The pullup is done on the last processed mblk, not
546 * the starting one. pullup is not done if the mblk
547 * has references or if b_cont is NULL.
548 */
549 last = s.fs_mp;
550 if (DB_REF(last) > 1 || last->b_cont == NULL ||
551 pullupmsg(last, -1) == 0)
552 return (EINVAL);
553
554 retried = B_TRUE;
555 DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
556 flow_state_t *, &s);
557 goto retry;
558 }
559 }
560
561 /*
562 * The packet is considered sane. We may now attempt to
563 * find the corresponding flent.
564 */
565 rw_enter(&ft->ft_lock, RW_READER);
566 flent = ft->ft_table[ops->fo_hash(ft, &s)];
567 for (; flent != NULL; flent = flent->fe_next) {
568 if (flent->fe_match(ft, flent, &s)) {
569 FLOW_TRY_REFHOLD(flent, err);
570 if (err != 0)
571 continue;
572 *flentp = flent;
573 rw_exit(&ft->ft_lock);
574 return (0);
575 }
576 }
577 rw_exit(&ft->ft_lock);
578 return (ENOENT);
579 }
580
581 /*
582 * Walk flow table.
583 * The caller is assumed to have proper perimeter protection.
584 */
585 int
mac_flow_walk_nolock(flow_tab_t * ft,int (* fn)(flow_entry_t *,void *),void * arg)586 mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
587 void *arg)
588 {
589 int err, i, cnt = 0;
590 flow_entry_t *flent;
591
592 if (ft == NULL)
593 return (0);
594
595 for (i = 0; i < ft->ft_size; i++) {
596 for (flent = ft->ft_table[i]; flent != NULL;
597 flent = flent->fe_next) {
598 cnt++;
599 err = (*fn)(flent, arg);
600 if (err != 0)
601 return (err);
602 }
603 }
604 VERIFY(cnt == ft->ft_flow_count);
605 return (0);
606 }
607
608 /*
609 * Same as the above except a mutex is used for protection here.
610 */
611 int
mac_flow_walk(flow_tab_t * ft,int (* fn)(flow_entry_t *,void *),void * arg)612 mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
613 void *arg)
614 {
615 int err;
616
617 if (ft == NULL)
618 return (0);
619
620 rw_enter(&ft->ft_lock, RW_WRITER);
621 err = mac_flow_walk_nolock(ft, fn, arg);
622 rw_exit(&ft->ft_lock);
623 return (err);
624 }
625
626 static boolean_t mac_flow_clean(flow_entry_t *);
627
628 /*
629 * Destroy a flow entry. Called when the last reference on a flow is released.
630 */
631 void
mac_flow_destroy(flow_entry_t * flent)632 mac_flow_destroy(flow_entry_t *flent)
633 {
634 ASSERT(flent->fe_refcnt == 0);
635
636 if ((flent->fe_type & FLOW_USER) != 0) {
637 ASSERT(mac_flow_clean(flent));
638 } else {
639 mac_flow_cleanup(flent);
640 }
641 mac_misc_stat_delete(flent);
642 mutex_destroy(&flent->fe_lock);
643 cv_destroy(&flent->fe_cv);
644 flow_stat_destroy(flent);
645 kmem_cache_free(flow_cache, flent);
646 }
647
648 /*
649 * XXX eric
650 * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
651 * mac_link_flow_modify() should really be moved/reworked into the
652 * two functions below. This would consolidate all the mac property
653 * checking in one place. I'm leaving this alone for now since it's
654 * out of scope of the new flows work.
655 */
656 /* ARGSUSED */
657 uint32_t
mac_flow_modify_props(flow_entry_t * flent,mac_resource_props_t * mrp)658 mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
659 {
660 uint32_t changed_mask = 0;
661 mac_resource_props_t *fmrp = &flent->fe_effective_props;
662 int i;
663
664 if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
665 (!(fmrp->mrp_mask & MRP_MAXBW) ||
666 (fmrp->mrp_maxbw != mrp->mrp_maxbw))) {
667 changed_mask |= MRP_MAXBW;
668 if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
669 fmrp->mrp_mask &= ~MRP_MAXBW;
670 fmrp->mrp_maxbw = 0;
671 } else {
672 fmrp->mrp_mask |= MRP_MAXBW;
673 fmrp->mrp_maxbw = mrp->mrp_maxbw;
674 }
675 }
676
677 if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
678 if (fmrp->mrp_priority != mrp->mrp_priority)
679 changed_mask |= MRP_PRIORITY;
680 if (mrp->mrp_priority == MPL_RESET) {
681 fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
682 fmrp->mrp_mask &= ~MRP_PRIORITY;
683 } else {
684 fmrp->mrp_priority = mrp->mrp_priority;
685 fmrp->mrp_mask |= MRP_PRIORITY;
686 }
687 }
688
689 /* modify fanout */
690 if ((mrp->mrp_mask & MRP_CPUS) != 0) {
691 if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
692 (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
693 for (i = 0; i < mrp->mrp_ncpus; i++) {
694 if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
695 break;
696 }
697 if (i == mrp->mrp_ncpus) {
698 /*
699 * The new set of cpus passed is exactly
700 * the same as the existing set.
701 */
702 return (changed_mask);
703 }
704 }
705 changed_mask |= MRP_CPUS;
706 MAC_COPY_CPUS(mrp, fmrp);
707 }
708
709 /*
710 * Modify the rings property.
711 */
712 if (mrp->mrp_mask & MRP_RX_RINGS || mrp->mrp_mask & MRP_TX_RINGS)
713 mac_set_rings_effective(flent->fe_mcip);
714
715 if ((mrp->mrp_mask & MRP_POOL) != 0) {
716 if (strcmp(fmrp->mrp_pool, mrp->mrp_pool) != 0)
717 changed_mask |= MRP_POOL;
718 if (strlen(mrp->mrp_pool) == 0)
719 fmrp->mrp_mask &= ~MRP_POOL;
720 else
721 fmrp->mrp_mask |= MRP_POOL;
722 (void) strncpy(fmrp->mrp_pool, mrp->mrp_pool, MAXPATHLEN);
723 }
724 return (changed_mask);
725 }
726
727 void
mac_flow_modify(flow_tab_t * ft,flow_entry_t * flent,mac_resource_props_t * mrp)728 mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
729 {
730 uint32_t changed_mask;
731 mac_client_impl_t *mcip = flent->fe_mcip;
732 mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
733 mac_resource_props_t *emrp = MCIP_EFFECTIVE_PROPS(mcip);
734 cpupart_t *cpupart = NULL;
735 boolean_t use_default = B_FALSE;
736
737 ASSERT(flent != NULL);
738 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
739
740 rw_enter(&ft->ft_lock, RW_WRITER);
741
742 /* Update the cached values inside the subflow entry */
743 changed_mask = mac_flow_modify_props(flent, mrp);
744 rw_exit(&ft->ft_lock);
745 /*
746 * Push the changed parameters to the scheduling code in the
747 * SRS's, to take effect right away.
748 */
749 if (changed_mask & MRP_MAXBW) {
750 mac_srs_update_bwlimit(flent, mrp);
751 /*
752 * If bandwidth is changed, we may have to change
753 * the number of soft ring to be used for fanout.
754 * Call mac_flow_update_fanout() if MAC_BIND_CPU
755 * is not set and there is no user supplied cpu
756 * info. This applies only to link at this time.
757 */
758 if (!(flent->fe_type & FLOW_USER) &&
759 !(changed_mask & MRP_CPUS) &&
760 !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
761 mac_fanout_setup(mcip, flent, mcip_mrp,
762 mac_rx_deliver, mcip, NULL, NULL);
763 }
764 }
765 if (mrp->mrp_mask & MRP_PRIORITY)
766 mac_flow_update_priority(mcip, flent);
767
768 if (changed_mask & MRP_CPUS)
769 mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
770 NULL);
771
772 if (mrp->mrp_mask & MRP_POOL) {
773 pool_lock();
774 cpupart = mac_pset_find(mrp, &use_default);
775 mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
776 cpupart);
777 mac_set_pool_effective(use_default, cpupart, mrp, emrp);
778 pool_unlock();
779 }
780 }
781
782 /*
783 * This function waits for a certain condition to be met and is generally
784 * used before a destructive or quiescing operation.
785 */
786 void
mac_flow_wait(flow_entry_t * flent,mac_flow_state_t event)787 mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
788 {
789 mutex_enter(&flent->fe_lock);
790 flent->fe_flags |= FE_WAITER;
791
792 switch (event) {
793 case FLOW_DRIVER_UPCALL:
794 /*
795 * We want to make sure the driver upcalls have finished before
796 * we signal the Rx SRS worker to quit.
797 */
798 while (flent->fe_refcnt != 1)
799 cv_wait(&flent->fe_cv, &flent->fe_lock);
800 break;
801
802 case FLOW_USER_REF:
803 /*
804 * Wait for the fe_user_refcnt to drop to 0. The flow has
805 * been removed from the global flow hash.
806 */
807 ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH));
808 while (flent->fe_user_refcnt != 0)
809 cv_wait(&flent->fe_cv, &flent->fe_lock);
810 break;
811
812 default:
813 ASSERT(0);
814 }
815
816 flent->fe_flags &= ~FE_WAITER;
817 mutex_exit(&flent->fe_lock);
818 }
819
820 static boolean_t
mac_flow_clean(flow_entry_t * flent)821 mac_flow_clean(flow_entry_t *flent)
822 {
823 ASSERT(flent->fe_next == NULL);
824 ASSERT(flent->fe_tx_srs == NULL);
825 ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL);
826 ASSERT(flent->fe_mbg == NULL);
827
828 return (B_TRUE);
829 }
830
831 void
mac_flow_cleanup(flow_entry_t * flent)832 mac_flow_cleanup(flow_entry_t *flent)
833 {
834 if ((flent->fe_type & FLOW_USER) == 0) {
835 ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) ||
836 (flent->fe_mbg != NULL && flent->fe_mcip == NULL));
837 ASSERT(flent->fe_refcnt == 0);
838 } else {
839 ASSERT(flent->fe_refcnt == 1);
840 }
841
842 if (flent->fe_mbg != NULL) {
843 ASSERT(flent->fe_tx_srs == NULL);
844 /* This is a multicast or broadcast flow entry */
845 mac_bcast_grp_free(flent->fe_mbg);
846 flent->fe_mbg = NULL;
847 }
848
849 if (flent->fe_tx_srs != NULL) {
850 ASSERT(flent->fe_mbg == NULL);
851 mac_srs_free(flent->fe_tx_srs);
852 flent->fe_tx_srs = NULL;
853 }
854
855 /*
856 * In the normal case fe_rx_srs_cnt is 1. However in the error case
857 * when mac_unicast_add fails we may not have set up any SRS
858 * in which case fe_rx_srs_cnt will be zero.
859 */
860 if (flent->fe_rx_srs_cnt != 0) {
861 ASSERT(flent->fe_rx_srs_cnt == 1);
862 mac_srs_free(flent->fe_rx_srs[0]);
863 flent->fe_rx_srs[0] = NULL;
864 flent->fe_rx_srs_cnt = 0;
865 }
866 ASSERT(flent->fe_rx_srs[0] == NULL);
867 }
868
869 void
mac_flow_get_desc(flow_entry_t * flent,flow_desc_t * fd)870 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
871 {
872 /*
873 * Grab the fe_lock to see a self-consistent fe_flow_desc.
874 * Updates to the fe_flow_desc happen under the fe_lock
875 * after removing the flent from the flow table
876 */
877 mutex_enter(&flent->fe_lock);
878 bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
879 mutex_exit(&flent->fe_lock);
880 }
881
882 /*
883 * Update a field of a flow entry. The mac perimeter ensures that
884 * this is the only thread doing a modify operation on this mac end point.
885 * So the flow table can't change or disappear. The ft_lock protects access
886 * to the flow entry, and holding the lock ensures that there isn't any thread
887 * accessing the flow entry or attempting a flow table lookup. However
888 * data threads that are using the flow entry based on the old descriptor
889 * will continue to use the flow entry. If strong coherence is required
890 * then the flow will have to be quiesced before the descriptor can be
891 * changed.
892 */
893 void
mac_flow_set_desc(flow_entry_t * flent,flow_desc_t * fd)894 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
895 {
896 flow_tab_t *ft = flent->fe_flow_tab;
897 flow_desc_t old_desc;
898 int err;
899
900 if (ft == NULL) {
901 /*
902 * The flow hasn't yet been inserted into the table,
903 * so only the caller knows about this flow, however for
904 * uniformity we grab the fe_lock here.
905 */
906 mutex_enter(&flent->fe_lock);
907 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
908 mutex_exit(&flent->fe_lock);
909 }
910
911 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
912
913 /*
914 * Need to remove the flow entry from the table and reinsert it,
915 * into a potentially diference hash line. The hash depends on
916 * the new descriptor fields. However access to fe_desc itself
917 * is always under the fe_lock. This helps log and stat functions
918 * see a self-consistent fe_flow_desc.
919 */
920 mac_flow_remove(ft, flent, B_TRUE);
921 old_desc = flent->fe_flow_desc;
922
923 mutex_enter(&flent->fe_lock);
924 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
925 mutex_exit(&flent->fe_lock);
926
927 if (mac_flow_add(ft, flent) != 0) {
928 /*
929 * The add failed say due to an invalid flow descriptor.
930 * Undo the update
931 */
932 flent->fe_flow_desc = old_desc;
933 err = mac_flow_add(ft, flent);
934 ASSERT(err == 0);
935 }
936 }
937
938 void
mac_flow_set_name(flow_entry_t * flent,const char * name)939 mac_flow_set_name(flow_entry_t *flent, const char *name)
940 {
941 flow_tab_t *ft = flent->fe_flow_tab;
942
943 if (ft == NULL) {
944 /*
945 * The flow hasn't yet been inserted into the table,
946 * so only the caller knows about this flow
947 */
948 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
949 } else {
950 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
951 }
952
953 mutex_enter(&flent->fe_lock);
954 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
955 mutex_exit(&flent->fe_lock);
956 }
957
958 /*
959 * Return the client-private cookie that was associated with
960 * the flow when it was created.
961 */
962 void *
mac_flow_get_client_cookie(flow_entry_t * flent)963 mac_flow_get_client_cookie(flow_entry_t *flent)
964 {
965 return (flent->fe_client_cookie);
966 }
967
968 /*
969 * Forward declarations.
970 */
971 static uint32_t flow_l2_hash(flow_tab_t *, flow_state_t *);
972 static uint32_t flow_l2_hash_fe(flow_tab_t *, flow_entry_t *);
973 static int flow_l2_accept(flow_tab_t *, flow_state_t *);
974 static uint32_t flow_ether_hash(flow_tab_t *, flow_state_t *);
975 static uint32_t flow_ether_hash_fe(flow_tab_t *, flow_entry_t *);
976 static int flow_ether_accept(flow_tab_t *, flow_state_t *);
977
978 /*
979 * Create flow table.
980 */
981 void
mac_flow_tab_create(flow_ops_t * ops,flow_mask_t mask,uint_t size,mac_impl_t * mip,flow_tab_t ** ftp)982 mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
983 mac_impl_t *mip, flow_tab_t **ftp)
984 {
985 flow_tab_t *ft;
986 flow_ops_t *new_ops;
987
988 ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
989 bzero(ft, sizeof (*ft));
990
991 ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);
992
993 /*
994 * We make a copy of the ops vector instead of just pointing to it
995 * because we might want to customize the ops vector on a per table
996 * basis (e.g. for optimization).
997 */
998 new_ops = &ft->ft_ops;
999 bcopy(ops, new_ops, sizeof (*ops));
1000 ft->ft_mask = mask;
1001 ft->ft_size = size;
1002 ft->ft_mip = mip;
1003
1004 /*
1005 * Optimizations for DL_ETHER media.
1006 */
1007 if (mip->mi_info.mi_nativemedia == DL_ETHER) {
1008 if (new_ops->fo_hash == flow_l2_hash)
1009 new_ops->fo_hash = flow_ether_hash;
1010 if (new_ops->fo_hash_fe == flow_l2_hash_fe)
1011 new_ops->fo_hash_fe = flow_ether_hash_fe;
1012 if (new_ops->fo_accept[0] == flow_l2_accept)
1013 new_ops->fo_accept[0] = flow_ether_accept;
1014 }
1015 *ftp = ft;
1016 }
1017
1018 void
mac_flow_l2tab_create(mac_impl_t * mip,flow_tab_t ** ftp)1019 mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
1020 {
1021 mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
1022 1024, mip, ftp);
1023 }
1024
1025 /*
1026 * Destroy flow table.
1027 */
1028 void
mac_flow_tab_destroy(flow_tab_t * ft)1029 mac_flow_tab_destroy(flow_tab_t *ft)
1030 {
1031 if (ft == NULL)
1032 return;
1033
1034 ASSERT(ft->ft_flow_count == 0);
1035 kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
1036 bzero(ft, sizeof (*ft));
1037 kmem_cache_free(flow_tab_cache, ft);
1038 }
1039
1040 /*
1041 * Add a new flow entry to the global flow hash table
1042 */
1043 int
mac_flow_hash_add(flow_entry_t * flent)1044 mac_flow_hash_add(flow_entry_t *flent)
1045 {
1046 int err;
1047
1048 rw_enter(&flow_tab_lock, RW_WRITER);
1049 err = mod_hash_insert(flow_hash,
1050 (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
1051 if (err != 0) {
1052 rw_exit(&flow_tab_lock);
1053 return (EEXIST);
1054 }
1055 /* Mark as inserted into the global flow hash table */
1056 FLOW_MARK(flent, FE_G_FLOW_HASH);
1057 rw_exit(&flow_tab_lock);
1058 return (err);
1059 }
1060
1061 /*
1062 * Remove a flow entry from the global flow hash table
1063 */
1064 void
mac_flow_hash_remove(flow_entry_t * flent)1065 mac_flow_hash_remove(flow_entry_t *flent)
1066 {
1067 mod_hash_val_t val;
1068
1069 rw_enter(&flow_tab_lock, RW_WRITER);
1070 VERIFY(mod_hash_remove(flow_hash,
1071 (mod_hash_key_t)flent->fe_flow_name, &val) == 0);
1072
1073 /* Clear the mark that says inserted into the global flow hash table */
1074 FLOW_UNMARK(flent, FE_G_FLOW_HASH);
1075 rw_exit(&flow_tab_lock);
1076 }
1077
1078 /*
1079 * Retrieve a flow entry from the global flow hash table.
1080 */
1081 int
mac_flow_lookup_byname(char * name,flow_entry_t ** flentp)1082 mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
1083 {
1084 int err;
1085 flow_entry_t *flent;
1086
1087 rw_enter(&flow_tab_lock, RW_READER);
1088 err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
1089 (mod_hash_val_t *)&flent);
1090 if (err != 0) {
1091 rw_exit(&flow_tab_lock);
1092 return (ENOENT);
1093 }
1094 ASSERT(flent != NULL);
1095 FLOW_USER_REFHOLD(flent);
1096 rw_exit(&flow_tab_lock);
1097
1098 *flentp = flent;
1099 return (0);
1100 }
1101
1102 /*
1103 * Initialize or release mac client flows by walking the subflow table.
1104 * These are typically invoked during plumb/unplumb of links.
1105 */
1106
1107 static int
mac_link_init_flows_cb(flow_entry_t * flent,void * arg)1108 mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
1109 {
1110 mac_client_impl_t *mcip = arg;
1111
1112 if (mac_link_flow_init(arg, flent) != 0) {
1113 cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
1114 flent->fe_flow_name, mcip->mci_name);
1115 } else {
1116 FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
1117 }
1118 return (0);
1119 }
1120
1121 void
mac_link_init_flows(mac_client_handle_t mch)1122 mac_link_init_flows(mac_client_handle_t mch)
1123 {
1124 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1125
1126 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1127 mac_link_init_flows_cb, mcip);
1128 /*
1129 * If mac client had subflow(s) configured before plumb, change
1130 * function to mac_rx_srs_subflow_process and in case of hardware
1131 * classification, disable polling.
1132 */
1133 mac_client_update_classifier(mcip, B_TRUE);
1134
1135 }
1136
1137 boolean_t
mac_link_has_flows(mac_client_handle_t mch)1138 mac_link_has_flows(mac_client_handle_t mch)
1139 {
1140 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1141
1142 if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
1143 return (B_TRUE);
1144
1145 return (B_FALSE);
1146 }
1147
1148 static int
mac_link_release_flows_cb(flow_entry_t * flent,void * arg)1149 mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
1150 {
1151 FLOW_MARK(flent, FE_UF_NO_DATAPATH);
1152 mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1153 mac_link_flow_clean(arg, flent);
1154 return (0);
1155 }
1156
1157 void
mac_link_release_flows(mac_client_handle_t mch)1158 mac_link_release_flows(mac_client_handle_t mch)
1159 {
1160 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1161
1162 /*
1163 * Change the mci_flent callback back to mac_rx_srs_process()
1164 * because flows are about to be deactivated.
1165 */
1166 mac_client_update_classifier(mcip, B_FALSE);
1167 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1168 mac_link_release_flows_cb, mcip);
1169 }
1170
1171 void
mac_rename_flow(flow_entry_t * fep,const char * new_name)1172 mac_rename_flow(flow_entry_t *fep, const char *new_name)
1173 {
1174 mac_flow_set_name(fep, new_name);
1175 if (fep->fe_ksp != NULL) {
1176 flow_stat_destroy(fep);
1177 flow_stat_create(fep);
1178 }
1179 }
1180
1181 /*
1182 * mac_link_flow_init()
1183 * Internal flow interface used for allocating SRSs and related
1184 * data structures. Not meant to be used by mac clients.
1185 */
1186 int
mac_link_flow_init(mac_client_handle_t mch,flow_entry_t * sub_flow)1187 mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
1188 {
1189 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1190 mac_impl_t *mip = mcip->mci_mip;
1191 int err;
1192
1193 ASSERT(mch != NULL);
1194 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1195
1196 if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
1197 return (err);
1198
1199 sub_flow->fe_mcip = mcip;
1200
1201 return (0);
1202 }
1203
1204 /*
1205 * mac_link_flow_add()
1206 * Used by flowadm(1m) or kernel mac clients for creating flows.
1207 */
1208 int
mac_link_flow_add(datalink_id_t linkid,char * flow_name,flow_desc_t * flow_desc,mac_resource_props_t * mrp)1209 mac_link_flow_add(datalink_id_t linkid, char *flow_name,
1210 flow_desc_t *flow_desc, mac_resource_props_t *mrp)
1211 {
1212 flow_entry_t *flent = NULL;
1213 int err;
1214 dls_dl_handle_t dlh;
1215 dls_link_t *dlp;
1216 boolean_t link_held = B_FALSE;
1217 boolean_t hash_added = B_FALSE;
1218 mac_perim_handle_t mph;
1219
1220 err = mac_flow_lookup_byname(flow_name, &flent);
1221 if (err == 0) {
1222 FLOW_USER_REFRELE(flent);
1223 return (EEXIST);
1224 }
1225
1226 /*
1227 * First create a flow entry given the description provided
1228 * by the caller.
1229 */
1230 err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
1231 FLOW_USER | FLOW_OTHER, &flent);
1232
1233 if (err != 0)
1234 return (err);
1235
1236 /*
1237 * We've got a local variable referencing this flow now, so we need
1238 * to hold it. We'll release this flow before returning.
1239 * All failures until we return will undo any action that may internally
1240 * held the flow, so the last REFRELE will assure a clean freeing
1241 * of resources.
1242 */
1243 FLOW_REFHOLD(flent);
1244
1245 flent->fe_link_id = linkid;
1246 FLOW_MARK(flent, FE_INCIPIENT);
1247
1248 err = mac_perim_enter_by_linkid(linkid, &mph);
1249 if (err != 0) {
1250 FLOW_FINAL_REFRELE(flent);
1251 return (err);
1252 }
1253
1254 /*
1255 * dls will eventually be merged with mac so it's ok
1256 * to call dls' internal functions.
1257 */
1258 err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1259 if (err != 0)
1260 goto bail;
1261
1262 link_held = B_TRUE;
1263
1264 /*
1265 * Add the flow to the global flow table, this table will be per
1266 * exclusive zone so each zone can have its own flow namespace.
1267 * RFE 6625651 will fix this.
1268 *
1269 */
1270 if ((err = mac_flow_hash_add(flent)) != 0)
1271 goto bail;
1272
1273 hash_added = B_TRUE;
1274
1275 /*
1276 * do not allow flows to be configured on an anchor VNIC
1277 */
1278 if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
1279 err = ENOTSUP;
1280 goto bail;
1281 }
1282
1283 /*
1284 * Add the subflow to the subflow table. Also instantiate the flow
1285 * in the mac if there is an active user (we check if the MAC client's
1286 * datapath has been setup).
1287 */
1288 err = mac_flow_add_subflow(dlp->dl_mch, flent,
1289 MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
1290 if (err != 0)
1291 goto bail;
1292
1293 FLOW_UNMARK(flent, FE_INCIPIENT);
1294 dls_devnet_rele_link(dlh, dlp);
1295 mac_perim_exit(mph);
1296 return (0);
1297
1298 bail:
1299 if (hash_added)
1300 mac_flow_hash_remove(flent);
1301
1302 if (link_held)
1303 dls_devnet_rele_link(dlh, dlp);
1304
1305 /*
1306 * Wait for any transient global flow hash refs to clear
1307 * and then release the creation reference on the flow
1308 */
1309 mac_flow_wait(flent, FLOW_USER_REF);
1310 FLOW_FINAL_REFRELE(flent);
1311 mac_perim_exit(mph);
1312 return (err);
1313 }
1314
1315 /*
1316 * mac_link_flow_clean()
1317 * Internal flow interface used for freeing SRSs and related
1318 * data structures. Not meant to be used by mac clients.
1319 */
1320 void
mac_link_flow_clean(mac_client_handle_t mch,flow_entry_t * sub_flow)1321 mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
1322 {
1323 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1324 mac_impl_t *mip = mcip->mci_mip;
1325 boolean_t last_subflow;
1326
1327 ASSERT(mch != NULL);
1328 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1329
1330 /*
1331 * This sub flow entry may fail to be fully initialized by
1332 * mac_link_flow_init(). If so, simply return.
1333 */
1334 if (sub_flow->fe_mcip == NULL)
1335 return;
1336
1337 last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
1338 /*
1339 * Tear down the data path
1340 */
1341 mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
1342 sub_flow->fe_mcip = NULL;
1343
1344 /*
1345 * Delete the SRSs associated with this subflow. If this is being
1346 * driven by flowadm(1M) then the subflow will be deleted by
1347 * dls_rem_flow. However if this is a result of the interface being
1348 * unplumbed then the subflow itself won't be deleted.
1349 */
1350 mac_flow_cleanup(sub_flow);
1351
1352 /*
1353 * If all the subflows are gone, renable some of the stuff
1354 * we disabled when adding a subflow, polling etc.
1355 */
1356 if (last_subflow) {
1357 /*
1358 * The subflow table itself is not protected by any locks or
1359 * refcnts. Hence quiesce the client upfront before clearing
1360 * mci_subflow_tab.
1361 */
1362 mac_client_quiesce(mcip);
1363 mac_client_update_classifier(mcip, B_FALSE);
1364 mac_flow_tab_destroy(mcip->mci_subflow_tab);
1365 mcip->mci_subflow_tab = NULL;
1366 mac_client_restart(mcip);
1367 }
1368 }
1369
1370 /*
1371 * mac_link_flow_remove()
1372 * Used by flowadm(1m) or kernel mac clients for removing flows.
1373 */
1374 int
mac_link_flow_remove(char * flow_name)1375 mac_link_flow_remove(char *flow_name)
1376 {
1377 flow_entry_t *flent;
1378 mac_perim_handle_t mph;
1379 int err;
1380 datalink_id_t linkid;
1381
1382 err = mac_flow_lookup_byname(flow_name, &flent);
1383 if (err != 0)
1384 return (err);
1385
1386 linkid = flent->fe_link_id;
1387 FLOW_USER_REFRELE(flent);
1388
1389 /*
1390 * The perim must be acquired before acquiring any other references
1391 * to maintain the lock and perimeter hierarchy. Please note the
1392 * FLOW_REFRELE above.
1393 */
1394 err = mac_perim_enter_by_linkid(linkid, &mph);
1395 if (err != 0)
1396 return (err);
1397
1398 /*
1399 * Note the second lookup of the flow, because a concurrent thread
1400 * may have removed it already while we were waiting to enter the
1401 * link's perimeter.
1402 */
1403 err = mac_flow_lookup_byname(flow_name, &flent);
1404 if (err != 0) {
1405 mac_perim_exit(mph);
1406 return (err);
1407 }
1408 FLOW_USER_REFRELE(flent);
1409
1410 /*
1411 * Remove the flow from the subflow table and deactivate the flow
1412 * by quiescing and removings its SRSs
1413 */
1414 mac_flow_rem_subflow(flent);
1415
1416 /*
1417 * Finally, remove the flow from the global table.
1418 */
1419 mac_flow_hash_remove(flent);
1420
1421 /*
1422 * Wait for any transient global flow hash refs to clear
1423 * and then release the creation reference on the flow
1424 */
1425 mac_flow_wait(flent, FLOW_USER_REF);
1426 FLOW_FINAL_REFRELE(flent);
1427
1428 mac_perim_exit(mph);
1429
1430 return (0);
1431 }
1432
1433 /*
1434 * mac_link_flow_modify()
1435 * Modifies the properties of a flow identified by its name.
1436 */
1437 int
mac_link_flow_modify(char * flow_name,mac_resource_props_t * mrp)1438 mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
1439 {
1440 flow_entry_t *flent;
1441 mac_client_impl_t *mcip;
1442 int err = 0;
1443 mac_perim_handle_t mph;
1444 datalink_id_t linkid;
1445 flow_tab_t *flow_tab;
1446
1447 err = mac_validate_props(NULL, mrp);
1448 if (err != 0)
1449 return (err);
1450
1451 err = mac_flow_lookup_byname(flow_name, &flent);
1452 if (err != 0)
1453 return (err);
1454
1455 linkid = flent->fe_link_id;
1456 FLOW_USER_REFRELE(flent);
1457
1458 /*
1459 * The perim must be acquired before acquiring any other references
1460 * to maintain the lock and perimeter hierarchy. Please note the
1461 * FLOW_REFRELE above.
1462 */
1463 err = mac_perim_enter_by_linkid(linkid, &mph);
1464 if (err != 0)
1465 return (err);
1466
1467 /*
1468 * Note the second lookup of the flow, because a concurrent thread
1469 * may have removed it already while we were waiting to enter the
1470 * link's perimeter.
1471 */
1472 err = mac_flow_lookup_byname(flow_name, &flent);
1473 if (err != 0) {
1474 mac_perim_exit(mph);
1475 return (err);
1476 }
1477 FLOW_USER_REFRELE(flent);
1478
1479 /*
1480 * If this flow is attached to a MAC client, then pass the request
1481 * along to the client.
1482 * Otherwise, just update the cached values.
1483 */
1484 mcip = flent->fe_mcip;
1485 mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
1486 if (mcip != NULL) {
1487 if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
1488 err = ENOENT;
1489 } else {
1490 mac_flow_modify(flow_tab, flent, mrp);
1491 }
1492 } else {
1493 (void) mac_flow_modify_props(flent, mrp);
1494 }
1495
1496 done:
1497 mac_perim_exit(mph);
1498 return (err);
1499 }
1500
1501
1502 /*
1503 * State structure and misc functions used by mac_link_flow_walk().
1504 */
1505 typedef struct {
1506 int (*ws_func)(mac_flowinfo_t *, void *);
1507 void *ws_arg;
1508 } flow_walk_state_t;
1509
1510 static void
mac_link_flowinfo_copy(mac_flowinfo_t * finfop,flow_entry_t * flent)1511 mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent)
1512 {
1513 (void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name,
1514 MAXFLOWNAMELEN);
1515 finfop->fi_link_id = flent->fe_link_id;
1516 finfop->fi_flow_desc = flent->fe_flow_desc;
1517 finfop->fi_resource_props = flent->fe_resource_props;
1518 }
1519
1520 static int
mac_link_flow_walk_cb(flow_entry_t * flent,void * arg)1521 mac_link_flow_walk_cb(flow_entry_t *flent, void *arg)
1522 {
1523 flow_walk_state_t *statep = arg;
1524 mac_flowinfo_t *finfo;
1525 int err;
1526
1527 finfo = kmem_zalloc(sizeof (*finfo), KM_SLEEP);
1528 mac_link_flowinfo_copy(finfo, flent);
1529 err = statep->ws_func(finfo, statep->ws_arg);
1530 kmem_free(finfo, sizeof (*finfo));
1531 return (err);
1532 }
1533
1534 /*
1535 * mac_link_flow_walk()
1536 * Invokes callback 'func' for all flows belonging to the specified link.
1537 */
1538 int
mac_link_flow_walk(datalink_id_t linkid,int (* func)(mac_flowinfo_t *,void *),void * arg)1539 mac_link_flow_walk(datalink_id_t linkid,
1540 int (*func)(mac_flowinfo_t *, void *), void *arg)
1541 {
1542 mac_client_impl_t *mcip;
1543 mac_perim_handle_t mph;
1544 flow_walk_state_t state;
1545 dls_dl_handle_t dlh;
1546 dls_link_t *dlp;
1547 int err;
1548
1549 err = mac_perim_enter_by_linkid(linkid, &mph);
1550 if (err != 0)
1551 return (err);
1552
1553 err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1554 if (err != 0) {
1555 mac_perim_exit(mph);
1556 return (err);
1557 }
1558
1559 mcip = (mac_client_impl_t *)dlp->dl_mch;
1560 state.ws_func = func;
1561 state.ws_arg = arg;
1562
1563 err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
1564 mac_link_flow_walk_cb, &state);
1565
1566 dls_devnet_rele_link(dlh, dlp);
1567 mac_perim_exit(mph);
1568 return (err);
1569 }
1570
1571 /*
1572 * mac_link_flow_info()
1573 * Retrieves information about a specific flow.
1574 */
1575 int
mac_link_flow_info(char * flow_name,mac_flowinfo_t * finfo)1576 mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
1577 {
1578 flow_entry_t *flent;
1579 int err;
1580
1581 err = mac_flow_lookup_byname(flow_name, &flent);
1582 if (err != 0)
1583 return (err);
1584
1585 mac_link_flowinfo_copy(finfo, flent);
1586 FLOW_USER_REFRELE(flent);
1587 return (0);
1588 }
1589
/*
 * Hash function macro that takes an Ethernet address and VLAN id as input:
 * the sum of address octets 3, 4 and 5 is XORed with the VLAN id and the
 * result is reduced modulo the table size.
 */
#define	HASH_ETHER_VID(addr, vid, size)	\
	((((uint32_t)(addr)[3] + (addr)[4] + (addr)[5]) ^ (vid)) % (size))
1595
/*
 * Generic layer-2 address hashing function (the DJB hash).  Starting from
 * 5381, each of the 'addrlen' bytes at 'addr' is folded in as
 * hash = hash * 33 + byte, and the result is reduced modulo 'htsize'.
 */
static uint32_t
flow_l2_addrhash(uint8_t *addr, size_t addrlen, size_t htsize)
{
	const uint8_t	*bytep = addr;
	uint32_t	h = 5381;
	size_t		left;

	for (left = addrlen; left != 0; left--)
		h = (h * 33) + *bytep++;

	return (h % htsize);
}
1610
/*
 * True if the write pointer of the state's current mblk lies before 'end',
 * i.e. the data up to 'end' is not all present in that mblk.
 */
#define	PKT_TOO_SMALL(st, end)	((st)->fs_mp->b_wptr < (end))

/*
 * If 'start' sits exactly at the write pointer of the state's current
 * mblk, advance the state to the continuation mblk and point 'start' at
 * its read pointer.  When there is no continuation mblk this macro makes
 * the ENCLOSING FUNCTION return EINVAL.
 */
#define	CHECK_AND_ADJUST_START_PTR(st, start) {			\
	if ((st)->fs_mp->b_wptr == (start)) {			\
		mblk_t	*cont_mp = (st)->fs_mp->b_cont;		\
								\
		if (cont_mp == NULL)				\
			return (EINVAL);			\
								\
		(st)->fs_mp = cont_mp;				\
		(start) = cont_mp->b_rptr;			\
	}							\
}
1623
1624 /* ARGSUSED */
1625 static boolean_t
flow_l2_match(flow_tab_t * ft,flow_entry_t * flent,flow_state_t * s)1626 flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1627 {
1628 flow_l2info_t *l2 = &s->fs_l2info;
1629 flow_desc_t *fd = &flent->fe_flow_desc;
1630
1631 return (l2->l2_vid == fd->fd_vid &&
1632 bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
1633 }
1634
1635 /*
1636 * Layer 2 hash function.
1637 * Must be paired with flow_l2_accept() within a set of flow_ops
1638 * because it assumes the dest address is already extracted.
1639 */
1640 static uint32_t
flow_l2_hash(flow_tab_t * ft,flow_state_t * s)1641 flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
1642 {
1643 return (flow_l2_addrhash(s->fs_l2info.l2_daddr,
1644 ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
1645 }
1646
1647 /*
1648 * This is the generic layer 2 accept function.
1649 * It makes use of mac_header_info() to extract the header length,
1650 * sap, vlan ID and destination address.
1651 */
1652 static int
flow_l2_accept(flow_tab_t * ft,flow_state_t * s)1653 flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
1654 {
1655 boolean_t is_ether;
1656 flow_l2info_t *l2 = &s->fs_l2info;
1657 mac_header_info_t mhi;
1658 int err;
1659
1660 is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
1661 if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
1662 s->fs_mp, &mhi)) != 0) {
1663 if (err == EINVAL)
1664 err = ENOBUFS;
1665
1666 return (err);
1667 }
1668
1669 l2->l2_start = s->fs_mp->b_rptr;
1670 l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;
1671
1672 if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
1673 ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1674 struct ether_vlan_header *evhp =
1675 (struct ether_vlan_header *)l2->l2_start;
1676
1677 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1678 return (ENOBUFS);
1679
1680 l2->l2_sap = ntohs(evhp->ether_type);
1681 l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1682 l2->l2_hdrsize = sizeof (*evhp);
1683 } else {
1684 l2->l2_sap = mhi.mhi_bindsap;
1685 l2->l2_vid = 0;
1686 l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
1687 }
1688 return (0);
1689 }
1690
1691 /*
1692 * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
1693 * accept(). The notable difference is that dest address is now extracted
1694 * by hash() rather than by accept(). This saves a few memory references
1695 * for flow tables that do not care about mac addresses.
1696 */
1697 static uint32_t
flow_ether_hash(flow_tab_t * ft,flow_state_t * s)1698 flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
1699 {
1700 flow_l2info_t *l2 = &s->fs_l2info;
1701 struct ether_vlan_header *evhp;
1702
1703 evhp = (struct ether_vlan_header *)l2->l2_start;
1704 l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
1705 return (HASH_ETHER_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
1706 }
1707
1708 static uint32_t
flow_ether_hash_fe(flow_tab_t * ft,flow_entry_t * flent)1709 flow_ether_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1710 {
1711 flow_desc_t *fd = &flent->fe_flow_desc;
1712
1713 ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
1714 return (HASH_ETHER_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
1715 }
1716
1717 /* ARGSUSED */
1718 static int
flow_ether_accept(flow_tab_t * ft,flow_state_t * s)1719 flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
1720 {
1721 flow_l2info_t *l2 = &s->fs_l2info;
1722 struct ether_vlan_header *evhp;
1723 uint16_t sap;
1724
1725 evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
1726 l2->l2_start = (uchar_t *)evhp;
1727
1728 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
1729 return (ENOBUFS);
1730
1731 if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
1732 ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1733 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1734 return (ENOBUFS);
1735
1736 l2->l2_sap = ntohs(evhp->ether_type);
1737 l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1738 l2->l2_hdrsize = sizeof (struct ether_vlan_header);
1739 } else {
1740 l2->l2_sap = sap;
1741 l2->l2_vid = 0;
1742 l2->l2_hdrsize = sizeof (struct ether_header);
1743 }
1744 return (0);
1745 }
1746
1747 /*
1748 * Validates a layer 2 flow entry.
1749 */
1750 static int
flow_l2_accept_fe(flow_tab_t * ft,flow_entry_t * flent)1751 flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1752 {
1753 flow_desc_t *fd = &flent->fe_flow_desc;
1754
1755 /*
1756 * Dest address is mandatory, and 0 length addresses are not yet
1757 * supported.
1758 */
1759 if ((fd->fd_mask & FLOW_LINK_DST) == 0 || fd->fd_mac_len == 0)
1760 return (EINVAL);
1761
1762 if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
1763 /*
1764 * VLAN flows are only supported over ethernet macs.
1765 */
1766 if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
1767 return (EINVAL);
1768
1769 if (fd->fd_vid == 0)
1770 return (EINVAL);
1771
1772 }
1773 flent->fe_match = flow_l2_match;
1774 return (0);
1775 }
1776
1777 /*
1778 * Calculates hash index of flow entry.
1779 */
1780 static uint32_t
flow_l2_hash_fe(flow_tab_t * ft,flow_entry_t * flent)1781 flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1782 {
1783 flow_desc_t *fd = &flent->fe_flow_desc;
1784
1785 ASSERT((fd->fd_mask & FLOW_LINK_VID) == 0 && fd->fd_vid == 0);
1786 return (flow_l2_addrhash(fd->fd_dst_mac,
1787 ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
1788 }
1789
1790 /*
1791 * This is used for duplicate flow checking.
1792 */
1793 /* ARGSUSED */
1794 static boolean_t
flow_l2_match_fe(flow_tab_t * ft,flow_entry_t * f1,flow_entry_t * f2)1795 flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
1796 {
1797 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
1798
1799 ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
1800 return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
1801 fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
1802 }
1803
1804 /*
1805 * Generic flow entry insertion function.
1806 * Used by flow tables that do not have ordering requirements.
1807 */
1808 /* ARGSUSED */
1809 static int
flow_generic_insert_fe(flow_tab_t * ft,flow_entry_t ** headp,flow_entry_t * flent)1810 flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
1811 flow_entry_t *flent)
1812 {
1813 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
1814
1815 if (*headp != NULL) {
1816 ASSERT(flent->fe_next == NULL);
1817 flent->fe_next = *headp;
1818 }
1819 *headp = flent;
1820 return (0);
1821 }
1822
1823 /*
1824 * IP version independent DSField matching function.
1825 */
1826 /* ARGSUSED */
1827 static boolean_t
flow_ip_dsfield_match(flow_tab_t * ft,flow_entry_t * flent,flow_state_t * s)1828 flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1829 {
1830 flow_l3info_t *l3info = &s->fs_l3info;
1831 flow_desc_t *fd = &flent->fe_flow_desc;
1832
1833 switch (l3info->l3_version) {
1834 case IPV4_VERSION: {
1835 ipha_t *ipha = (ipha_t *)l3info->l3_start;
1836
1837 return ((ipha->ipha_type_of_service &
1838 fd->fd_dsfield_mask) == fd->fd_dsfield);
1839 }
1840 case IPV6_VERSION: {
1841 ip6_t *ip6h = (ip6_t *)l3info->l3_start;
1842
1843 return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
1844 fd->fd_dsfield_mask) == fd->fd_dsfield);
1845 }
1846 default:
1847 return (B_FALSE);
1848 }
1849 }
1850
1851 /*
1852 * IP v4 and v6 address matching.
1853 * The netmask only needs to be applied on the packet but not on the
1854 * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets.
1855 */
1856
1857 /* ARGSUSED */
1858 static boolean_t
flow_ip_v4_match(flow_tab_t * ft,flow_entry_t * flent,flow_state_t * s)1859 flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1860 {
1861 flow_l3info_t *l3info = &s->fs_l3info;
1862 flow_desc_t *fd = &flent->fe_flow_desc;
1863 ipha_t *ipha = (ipha_t *)l3info->l3_start;
1864 in_addr_t addr;
1865
1866 addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
1867 if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1868 return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
1869 V4_PART_OF_V6(fd->fd_local_addr));
1870 }
1871 return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
1872 V4_PART_OF_V6(fd->fd_remote_addr));
1873 }
1874
1875 /* ARGSUSED */
1876 static boolean_t
flow_ip_v6_match(flow_tab_t * ft,flow_entry_t * flent,flow_state_t * s)1877 flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1878 {
1879 flow_l3info_t *l3info = &s->fs_l3info;
1880 flow_desc_t *fd = &flent->fe_flow_desc;
1881 ip6_t *ip6h = (ip6_t *)l3info->l3_start;
1882 in6_addr_t *addrp;
1883
1884 addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
1885 if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1886 return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
1887 fd->fd_local_addr));
1888 }
1889 return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
1890 }
1891
1892 /* ARGSUSED */
1893 static boolean_t
flow_ip_proto_match(flow_tab_t * ft,flow_entry_t * flent,flow_state_t * s)1894 flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1895 {
1896 flow_l3info_t *l3info = &s->fs_l3info;
1897 flow_desc_t *fd = &flent->fe_flow_desc;
1898
1899 return (l3info->l3_protocol == fd->fd_protocol);
1900 }
1901
1902 static uint32_t
flow_ip_hash(flow_tab_t * ft,flow_state_t * s)1903 flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
1904 {
1905 flow_l3info_t *l3info = &s->fs_l3info;
1906 flow_mask_t mask = ft->ft_mask;
1907
1908 if ((mask & FLOW_IP_LOCAL) != 0) {
1909 l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
1910 } else if ((mask & FLOW_IP_REMOTE) != 0) {
1911 l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
1912 } else if ((mask & FLOW_IP_DSFIELD) != 0) {
1913 /*
1914 * DSField flents are arranged as a single list.
1915 */
1916 return (0);
1917 }
1918 /*
1919 * IP addr flents are hashed into two lists, v4 or v6.
1920 */
1921 ASSERT(ft->ft_size >= 2);
1922 return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
1923 }
1924
1925 static uint32_t
flow_ip_proto_hash(flow_tab_t * ft,flow_state_t * s)1926 flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
1927 {
1928 flow_l3info_t *l3info = &s->fs_l3info;
1929
1930 return (l3info->l3_protocol % ft->ft_size);
1931 }
1932
1933 /* ARGSUSED */
1934 static int
flow_ip_accept(flow_tab_t * ft,flow_state_t * s)1935 flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
1936 {
1937 flow_l2info_t *l2info = &s->fs_l2info;
1938 flow_l3info_t *l3info = &s->fs_l3info;
1939 uint16_t sap = l2info->l2_sap;
1940 uchar_t *l3_start;
1941
1942 l3_start = l2info->l2_start + l2info->l2_hdrsize;
1943
1944 /*
1945 * Adjust start pointer if we're at the end of an mblk.
1946 */
1947 CHECK_AND_ADJUST_START_PTR(s, l3_start);
1948
1949 l3info->l3_start = l3_start;
1950 if (!OK_32PTR(l3_start))
1951 return (EINVAL);
1952
1953 switch (sap) {
1954 case ETHERTYPE_IP: {
1955 ipha_t *ipha = (ipha_t *)l3_start;
1956
1957 if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
1958 return (ENOBUFS);
1959
1960 l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
1961 l3info->l3_protocol = ipha->ipha_protocol;
1962 l3info->l3_version = IPV4_VERSION;
1963 l3info->l3_fragmented =
1964 IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
1965 break;
1966 }
1967 case ETHERTYPE_IPV6: {
1968 ip6_t *ip6h = (ip6_t *)l3_start;
1969 ip6_frag_t *frag = NULL;
1970 uint16_t ip6_hdrlen;
1971 uint8_t nexthdr;
1972
1973 if (!mac_ip_hdr_length_v6(ip6h, s->fs_mp->b_wptr, &ip6_hdrlen,
1974 &nexthdr, &frag)) {
1975 return (ENOBUFS);
1976 }
1977 l3info->l3_hdrsize = ip6_hdrlen;
1978 l3info->l3_protocol = nexthdr;
1979 l3info->l3_version = IPV6_VERSION;
1980 l3info->l3_fragmented = (frag != NULL);
1981 break;
1982 }
1983 default:
1984 return (EINVAL);
1985 }
1986 return (0);
1987 }
1988
1989 /* ARGSUSED */
1990 static int
flow_ip_proto_accept_fe(flow_tab_t * ft,flow_entry_t * flent)1991 flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1992 {
1993 flow_desc_t *fd = &flent->fe_flow_desc;
1994
1995 switch (fd->fd_protocol) {
1996 case IPPROTO_TCP:
1997 case IPPROTO_UDP:
1998 case IPPROTO_SCTP:
1999 case IPPROTO_ICMP:
2000 case IPPROTO_ICMPV6:
2001 flent->fe_match = flow_ip_proto_match;
2002 return (0);
2003 default:
2004 return (EINVAL);
2005 }
2006 }
2007
2008 /* ARGSUSED */
2009 static int
flow_ip_accept_fe(flow_tab_t * ft,flow_entry_t * flent)2010 flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2011 {
2012 flow_desc_t *fd = &flent->fe_flow_desc;
2013 flow_mask_t mask;
2014 uint8_t version;
2015 in6_addr_t *addr, *netmask;
2016
2017 /*
2018 * DSField does not require a IP version.
2019 */
2020 if (fd->fd_mask == FLOW_IP_DSFIELD) {
2021 if (fd->fd_dsfield_mask == 0)
2022 return (EINVAL);
2023
2024 flent->fe_match = flow_ip_dsfield_match;
2025 return (0);
2026 }
2027
2028 /*
2029 * IP addresses must come with a version to avoid ambiguity.
2030 */
2031 if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
2032 return (EINVAL);
2033
2034 version = fd->fd_ipversion;
2035 if (version != IPV4_VERSION && version != IPV6_VERSION)
2036 return (EINVAL);
2037
2038 mask = fd->fd_mask & ~FLOW_IP_VERSION;
2039 switch (mask) {
2040 case FLOW_IP_LOCAL:
2041 addr = &fd->fd_local_addr;
2042 netmask = &fd->fd_local_netmask;
2043 break;
2044 case FLOW_IP_REMOTE:
2045 addr = &fd->fd_remote_addr;
2046 netmask = &fd->fd_remote_netmask;
2047 break;
2048 default:
2049 return (EINVAL);
2050 }
2051
2052 /*
2053 * Apply netmask onto specified address.
2054 */
2055 V6_MASK_COPY(*addr, *netmask, *addr);
2056 if (version == IPV4_VERSION) {
2057 ipaddr_t v4addr = V4_PART_OF_V6((*addr));
2058 ipaddr_t v4mask = V4_PART_OF_V6((*netmask));
2059
2060 if (v4addr == 0 || v4mask == 0)
2061 return (EINVAL);
2062 flent->fe_match = flow_ip_v4_match;
2063 } else {
2064 if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
2065 IN6_IS_ADDR_UNSPECIFIED(netmask))
2066 return (EINVAL);
2067 flent->fe_match = flow_ip_v6_match;
2068 }
2069 return (0);
2070 }
2071
2072 static uint32_t
flow_ip_proto_hash_fe(flow_tab_t * ft,flow_entry_t * flent)2073 flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2074 {
2075 flow_desc_t *fd = &flent->fe_flow_desc;
2076
2077 return (fd->fd_protocol % ft->ft_size);
2078 }
2079
2080 static uint32_t
flow_ip_hash_fe(flow_tab_t * ft,flow_entry_t * flent)2081 flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2082 {
2083 flow_desc_t *fd = &flent->fe_flow_desc;
2084
2085 /*
2086 * DSField flents are arranged as a single list.
2087 */
2088 if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2089 return (0);
2090
2091 /*
2092 * IP addr flents are hashed into two lists, v4 or v6.
2093 */
2094 ASSERT(ft->ft_size >= 2);
2095 return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
2096 }
2097
2098 /* ARGSUSED */
2099 static boolean_t
flow_ip_proto_match_fe(flow_tab_t * ft,flow_entry_t * f1,flow_entry_t * f2)2100 flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2101 {
2102 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2103
2104 return (fd1->fd_protocol == fd2->fd_protocol);
2105 }
2106
2107 /* ARGSUSED */
2108 static boolean_t
flow_ip_match_fe(flow_tab_t * ft,flow_entry_t * f1,flow_entry_t * f2)2109 flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2110 {
2111 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2112 in6_addr_t *a1, *m1, *a2, *m2;
2113
2114 ASSERT(fd1->fd_mask == fd2->fd_mask);
2115 if (fd1->fd_mask == FLOW_IP_DSFIELD) {
2116 return (fd1->fd_dsfield == fd2->fd_dsfield &&
2117 fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
2118 }
2119
2120 /*
2121 * flow_ip_accept_fe() already validated the version.
2122 */
2123 ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
2124 if (fd1->fd_ipversion != fd2->fd_ipversion)
2125 return (B_FALSE);
2126
2127 switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
2128 case FLOW_IP_LOCAL:
2129 a1 = &fd1->fd_local_addr;
2130 m1 = &fd1->fd_local_netmask;
2131 a2 = &fd2->fd_local_addr;
2132 m2 = &fd2->fd_local_netmask;
2133 break;
2134 case FLOW_IP_REMOTE:
2135 a1 = &fd1->fd_remote_addr;
2136 m1 = &fd1->fd_remote_netmask;
2137 a2 = &fd2->fd_remote_addr;
2138 m2 = &fd2->fd_remote_netmask;
2139 break;
2140 default:
2141 /*
2142 * This is unreachable given the checks in
2143 * flow_ip_accept_fe().
2144 */
2145 return (B_FALSE);
2146 }
2147
2148 if (fd1->fd_ipversion == IPV4_VERSION) {
2149 return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
2150 V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));
2151
2152 } else {
2153 return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
2154 IN6_ARE_ADDR_EQUAL(m1, m2));
2155 }
2156 }
2157
2158 static int
flow_ip_mask2plen(in6_addr_t * v6mask)2159 flow_ip_mask2plen(in6_addr_t *v6mask)
2160 {
2161 int bits;
2162 int plen = IPV6_ABITS;
2163 int i;
2164
2165 for (i = 3; i >= 0; i--) {
2166 if (v6mask->s6_addr32[i] == 0) {
2167 plen -= 32;
2168 continue;
2169 }
2170 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
2171 if (bits == 0)
2172 break;
2173 plen -= bits;
2174 }
2175 return (plen);
2176 }
2177
2178 /* ARGSUSED */
2179 static int
flow_ip_insert_fe(flow_tab_t * ft,flow_entry_t ** headp,flow_entry_t * flent)2180 flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
2181 flow_entry_t *flent)
2182 {
2183 flow_entry_t **p = headp;
2184 flow_desc_t *fd0, *fd;
2185 in6_addr_t *m0, *m;
2186 int plen0, plen;
2187
2188 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
2189
2190 /*
2191 * No special ordering needed for dsfield.
2192 */
2193 fd0 = &flent->fe_flow_desc;
2194 if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
2195 if (*p != NULL) {
2196 ASSERT(flent->fe_next == NULL);
2197 flent->fe_next = *p;
2198 }
2199 *p = flent;
2200 return (0);
2201 }
2202
2203 /*
2204 * IP address flows are arranged in descending prefix length order.
2205 */
2206 m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
2207 &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
2208 plen0 = flow_ip_mask2plen(m0);
2209 ASSERT(plen0 != 0);
2210
2211 for (; *p != NULL; p = &(*p)->fe_next) {
2212 fd = &(*p)->fe_flow_desc;
2213
2214 /*
2215 * Normally a dsfield flent shouldn't end up on the same
2216 * list as an IP address because flow tables are (for now)
2217 * disjoint. If we decide to support both IP and dsfield
2218 * in the same table in the future, this check will allow
2219 * for that.
2220 */
2221 if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2222 continue;
2223
2224 /*
2225 * We also allow for the mixing of local and remote address
2226 * flents within one list.
2227 */
2228 m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
2229 &fd->fd_local_netmask : &fd->fd_remote_netmask;
2230 plen = flow_ip_mask2plen(m);
2231
2232 if (plen <= plen0)
2233 break;
2234 }
2235 if (*p != NULL) {
2236 ASSERT(flent->fe_next == NULL);
2237 flent->fe_next = *p;
2238 }
2239 *p = flent;
2240 return (0);
2241 }
2242
2243 /*
2244 * Transport layer protocol and port matching functions.
2245 */
2246
2247 /* ARGSUSED */
2248 static boolean_t
flow_transport_lport_match(flow_tab_t * ft,flow_entry_t * flent,flow_state_t * s)2249 flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2250 {
2251 flow_l3info_t *l3info = &s->fs_l3info;
2252 flow_l4info_t *l4info = &s->fs_l4info;
2253 flow_desc_t *fd = &flent->fe_flow_desc;
2254
2255 return (fd->fd_protocol == l3info->l3_protocol &&
2256 fd->fd_local_port == l4info->l4_hash_port);
2257 }
2258
2259 /* ARGSUSED */
2260 static boolean_t
flow_transport_rport_match(flow_tab_t * ft,flow_entry_t * flent,flow_state_t * s)2261 flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2262 {
2263 flow_l3info_t *l3info = &s->fs_l3info;
2264 flow_l4info_t *l4info = &s->fs_l4info;
2265 flow_desc_t *fd = &flent->fe_flow_desc;
2266
2267 return (fd->fd_protocol == l3info->l3_protocol &&
2268 fd->fd_remote_port == l4info->l4_hash_port);
2269 }
2270
2271 /*
2272 * Transport hash function.
2273 * Since we only support either local or remote port flows,
2274 * we only need to extract one of the ports to be used for
2275 * matching.
2276 */
2277 static uint32_t
flow_transport_hash(flow_tab_t * ft,flow_state_t * s)2278 flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
2279 {
2280 flow_l3info_t *l3info = &s->fs_l3info;
2281 flow_l4info_t *l4info = &s->fs_l4info;
2282 uint8_t proto = l3info->l3_protocol;
2283 boolean_t dst_or_src;
2284
2285 if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
2286 dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
2287 } else {
2288 dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
2289 }
2290
2291 l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
2292 l4info->l4_src_port;
2293
2294 return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
2295 }
2296
2297 /*
2298 * Unlike other accept() functions above, we do not need to get the header
2299 * size because this is our highest layer so far. If we want to do support
2300 * other higher layer protocols, we would need to save the l4_hdrsize
2301 * in the code below.
2302 */
2303
2304 /* ARGSUSED */
2305 static int
flow_transport_accept(flow_tab_t * ft,flow_state_t * s)2306 flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
2307 {
2308 flow_l3info_t *l3info = &s->fs_l3info;
2309 flow_l4info_t *l4info = &s->fs_l4info;
2310 uint8_t proto = l3info->l3_protocol;
2311 uchar_t *l4_start;
2312
2313 l4_start = l3info->l3_start + l3info->l3_hdrsize;
2314
2315 /*
2316 * Adjust start pointer if we're at the end of an mblk.
2317 */
2318 CHECK_AND_ADJUST_START_PTR(s, l4_start);
2319
2320 l4info->l4_start = l4_start;
2321 if (!OK_32PTR(l4_start))
2322 return (EINVAL);
2323
2324 if (l3info->l3_fragmented == B_TRUE)
2325 return (EINVAL);
2326
2327 switch (proto) {
2328 case IPPROTO_TCP: {
2329 struct tcphdr *tcph = (struct tcphdr *)l4_start;
2330
2331 if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
2332 return (ENOBUFS);
2333
2334 l4info->l4_src_port = tcph->th_sport;
2335 l4info->l4_dst_port = tcph->th_dport;
2336 break;
2337 }
2338 case IPPROTO_UDP: {
2339 struct udphdr *udph = (struct udphdr *)l4_start;
2340
2341 if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
2342 return (ENOBUFS);
2343
2344 l4info->l4_src_port = udph->uh_sport;
2345 l4info->l4_dst_port = udph->uh_dport;
2346 break;
2347 }
2348 case IPPROTO_SCTP: {
2349 sctp_hdr_t *sctph = (sctp_hdr_t *)l4_start;
2350
2351 if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
2352 return (ENOBUFS);
2353
2354 l4info->l4_src_port = sctph->sh_sport;
2355 l4info->l4_dst_port = sctph->sh_dport;
2356 break;
2357 }
2358 default:
2359 return (EINVAL);
2360 }
2361
2362 return (0);
2363 }
2364
2365 /*
2366 * Validates transport flow entry.
2367 * The protocol field must be present.
2368 */
2369
2370 /* ARGSUSED */
2371 static int
flow_transport_accept_fe(flow_tab_t * ft,flow_entry_t * flent)2372 flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2373 {
2374 flow_desc_t *fd = &flent->fe_flow_desc;
2375 flow_mask_t mask = fd->fd_mask;
2376
2377 if ((mask & FLOW_IP_PROTOCOL) == 0)
2378 return (EINVAL);
2379
2380 switch (fd->fd_protocol) {
2381 case IPPROTO_TCP:
2382 case IPPROTO_UDP:
2383 case IPPROTO_SCTP:
2384 break;
2385 default:
2386 return (EINVAL);
2387 }
2388
2389 switch (mask & ~FLOW_IP_PROTOCOL) {
2390 case FLOW_ULP_PORT_LOCAL:
2391 if (fd->fd_local_port == 0)
2392 return (EINVAL);
2393
2394 flent->fe_match = flow_transport_lport_match;
2395 break;
2396 case FLOW_ULP_PORT_REMOTE:
2397 if (fd->fd_remote_port == 0)
2398 return (EINVAL);
2399
2400 flent->fe_match = flow_transport_rport_match;
2401 break;
2402 case 0:
2403 /*
2404 * transport-only flows conflicts with our table type.
2405 */
2406 return (EOPNOTSUPP);
2407 default:
2408 return (EINVAL);
2409 }
2410
2411 return (0);
2412 }
2413
2414 static uint32_t
flow_transport_hash_fe(flow_tab_t * ft,flow_entry_t * flent)2415 flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2416 {
2417 flow_desc_t *fd = &flent->fe_flow_desc;
2418 uint16_t port = 0;
2419
2420 port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
2421 fd->fd_local_port : fd->fd_remote_port;
2422
2423 return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
2424 }
2425
2426 /* ARGSUSED */
2427 static boolean_t
flow_transport_match_fe(flow_tab_t * ft,flow_entry_t * f1,flow_entry_t * f2)2428 flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2429 {
2430 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2431
2432 if (fd1->fd_protocol != fd2->fd_protocol)
2433 return (B_FALSE);
2434
2435 if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
2436 return (fd1->fd_local_port == fd2->fd_local_port);
2437
2438 if ((fd1->fd_mask & FLOW_ULP_PORT_REMOTE) != 0)
2439 return (fd1->fd_remote_port == fd2->fd_remote_port);
2440
2441 return (B_TRUE);
2442 }
2443
2444 static flow_ops_t flow_l2_ops = {
2445 flow_l2_accept_fe,
2446 flow_l2_hash_fe,
2447 flow_l2_match_fe,
2448 flow_generic_insert_fe,
2449 flow_l2_hash,
2450 {flow_l2_accept}
2451 };
2452
2453 static flow_ops_t flow_ip_ops = {
2454 flow_ip_accept_fe,
2455 flow_ip_hash_fe,
2456 flow_ip_match_fe,
2457 flow_ip_insert_fe,
2458 flow_ip_hash,
2459 {flow_l2_accept, flow_ip_accept}
2460 };
2461
2462 static flow_ops_t flow_ip_proto_ops = {
2463 flow_ip_proto_accept_fe,
2464 flow_ip_proto_hash_fe,
2465 flow_ip_proto_match_fe,
2466 flow_generic_insert_fe,
2467 flow_ip_proto_hash,
2468 {flow_l2_accept, flow_ip_accept}
2469 };
2470
2471 static flow_ops_t flow_transport_ops = {
2472 flow_transport_accept_fe,
2473 flow_transport_hash_fe,
2474 flow_transport_match_fe,
2475 flow_generic_insert_fe,
2476 flow_transport_hash,
2477 {flow_l2_accept, flow_ip_accept, flow_transport_accept}
2478 };
2479
2480 static flow_tab_info_t flow_tab_info_list[] = {
2481 {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
2482 {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
2483 {&flow_ip_ops, FLOW_IP_DSFIELD, 1},
2484 {&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
2485 {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024},
2486 {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_REMOTE, 1024}
2487 };
2488
/* Number of entries in flow_tab_info_list. */
#define	FLOW_MAX_TAB_INFO \
	(sizeof (flow_tab_info_list) / sizeof (flow_tab_info_list[0]))
2491
2492 static flow_tab_info_t *
mac_flow_tab_info_get(flow_mask_t mask)2493 mac_flow_tab_info_get(flow_mask_t mask)
2494 {
2495 int i;
2496
2497 for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
2498 if (mask == flow_tab_info_list[i].fti_mask)
2499 return (&flow_tab_info_list[i]);
2500 }
2501 return (NULL);
2502 }
2503