1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2019 Joyent, Inc.
24 * Copyright 2025 Oxide Computer Company
25 */
26
27 /*
28 * MAC Services Module - misc utilities
29 */
30
31 #include <sys/types.h>
32 #include <sys/mac.h>
33 #include <sys/mac_impl.h>
34 #include <sys/mac_client_priv.h>
35 #include <sys/mac_client_impl.h>
36 #include <sys/mac_soft_ring.h>
37 #include <sys/strsubr.h>
38 #include <sys/strsun.h>
39 #include <sys/vlan.h>
40 #include <sys/pattr.h>
41 #include <sys/pci_tools.h>
42 #include <inet/ip.h>
43 #include <inet/ip_impl.h>
44 #include <inet/ip6.h>
45 #include <sys/vtrace.h>
46 #include <sys/dlpi.h>
47 #include <sys/sunndi.h>
48 #include <inet/ipsec_impl.h>
49 #include <inet/sadb.h>
50 #include <inet/ipsecesp.h>
51 #include <inet/ipsecah.h>
52 #include <inet/tcp.h>
53 #include <inet/sctp_ip.h>
54
55 /*
56 * The next two functions are used for dropping packets or chains of
57 * packets, respectively. We could use one function for both but
58 * separating the use cases allows us to specify intent and prevent
59 * dropping more data than intended.
60 *
61 * The purpose of these functions is to aid the debugging effort,
62 * especially in production. Rather than use freemsg()/freemsgchain(),
63 * it's preferable to use these functions when dropping a packet in
64 * the MAC layer. These functions should only be used during
65 * unexpected conditions. That is, any time a packet is dropped
66 * outside of the regular, successful datapath. Consolidating all
67 * drops on these functions allows the user to trace one location and
68 * determine why the packet was dropped based on the msg. It also
69 * allows the user to inspect the packet before it is freed. Finally,
70 * it allows the user to avoid tracing freemsg()/freemsgchain() thus
71 * keeping the hot path running as efficiently as possible.
72 *
73 * NOTE: At this time not all MAC drops are aggregated on these
74 * functions; but that is the plan. This comment should be erased once
75 * completed.
76 */
77
78 /*PRINTFLIKE2*/
79 void
mac_drop_pkt(mblk_t * mp,const char * fmt,...)80 mac_drop_pkt(mblk_t *mp, const char *fmt, ...)
81 {
82 va_list adx;
83 char msg[128];
84 char *msgp = msg;
85
86 ASSERT3P(mp->b_next, ==, NULL);
87
88 va_start(adx, fmt);
89 (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
90 va_end(adx);
91
92 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
93 freemsg(mp);
94 }
95
96 /*PRINTFLIKE2*/
97 void
mac_drop_chain(mblk_t * chain,const char * fmt,...)98 mac_drop_chain(mblk_t *chain, const char *fmt, ...)
99 {
100 va_list adx;
101 char msg[128];
102 char *msgp = msg;
103
104 va_start(adx, fmt);
105 (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
106 va_end(adx);
107
108 /*
109 * We could use freemsgchain() for the actual freeing but
110 * since we are already walking the chain to fire the dtrace
111 * probe we might as well free the msg here too.
112 */
113 for (mblk_t *mp = chain, *next; mp != NULL; ) {
114 next = mp->b_next;
115 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
116 mp->b_next = NULL;
117 freemsg(mp);
118 mp = next;
119 }
120 }
121
122 /*
123 * Copy an mblk, preserving its hardware checksum flags.
124 */
125 static mblk_t *
mac_copymsg_cksum(mblk_t * mp)126 mac_copymsg_cksum(mblk_t *mp)
127 {
128 mblk_t *mp1;
129
130 mp1 = copymsg(mp);
131 if (mp1 == NULL)
132 return (NULL);
133
134 mac_hcksum_clone(mp, mp1);
135
136 return (mp1);
137 }
138
139 /*
140 * Copy an mblk chain, presenting the hardware checksum flags of the
141 * individual mblks.
142 */
143 mblk_t *
mac_copymsgchain_cksum(mblk_t * mp)144 mac_copymsgchain_cksum(mblk_t *mp)
145 {
146 mblk_t *nmp = NULL;
147 mblk_t **nmpp = &nmp;
148
149 for (; mp != NULL; mp = mp->b_next) {
150 if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
151 freemsgchain(nmp);
152 return (NULL);
153 }
154
155 nmpp = &((*nmpp)->b_next);
156 }
157
158 return (nmp);
159 }
160
161 /*
162 * Perform software checksum on a single message, if needed. The emulation
163 * performed is determined by an intersection of the mblk's flags and the emul
164 * flags requested. The emul flags are documented in mac.h.
165 */
166 static mblk_t *
mac_sw_cksum(mblk_t * mp,mac_emul_t emul)167 mac_sw_cksum(mblk_t *mp, mac_emul_t emul)
168 {
169 mac_ether_offload_info_t meoi = { 0 };
170 const char *err = "";
171
172 /*
173 * The only current caller is mac_hw_emul(), which handles any chaining
174 * of mblks prior to now.
175 */
176 VERIFY3P(mp->b_next, ==, NULL);
177
178 uint32_t flags = DB_CKSUMFLAGS(mp);
179
180 /* Why call this if checksum emulation isn't needed? */
181 ASSERT3U(flags & (HCK_FLAGS), !=, 0);
182 /* But also, requesting both ULP cksum types is improper */
183 if ((flags & HCK_FULLCKSUM) != 0 && (flags & HCK_PARTIALCKSUM) != 0) {
184 err = "full and partial ULP cksum requested";
185 goto bail;
186 }
187
188 const boolean_t do_v4_cksum = (emul & MAC_IPCKSUM_EMUL) != 0 &&
189 (flags & HCK_IPV4_HDRCKSUM) != 0;
190 const boolean_t do_ulp_cksum = (emul & MAC_HWCKSUM_EMUL) != 0 &&
191 (flags & (HCK_FULLCKSUM | HCK_PARTIALCKSUM)) != 0;
192 const boolean_t ulp_prefer_partial = (flags & HCK_PARTIALCKSUM) != 0;
193
194 mac_ether_offload_info(mp, &meoi);
195 if ((meoi.meoi_flags & MEOI_L2INFO_SET) == 0 ||
196 (meoi.meoi_l3proto != ETHERTYPE_IP &&
197 meoi.meoi_l3proto != ETHERTYPE_IPV6)) {
198 /* Non-IP traffic (like ARP) is left alone */
199 return (mp);
200 }
201
202 /*
203 * Ensure that requested checksum type(s) are supported by the
204 * protocols encoded in the packet headers.
205 */
206 if (do_v4_cksum) {
207 if (meoi.meoi_l3proto != ETHERTYPE_IP) {
208 err = "IPv4 csum requested on non-IPv4 packet";
209 goto bail;
210 }
211 }
212 if (do_ulp_cksum) {
213 if ((meoi.meoi_flags & MEOI_L4INFO_SET) == 0) {
214 err = "missing ULP header";
215 goto bail;
216 }
217 switch (meoi.meoi_l4proto) {
218 case IPPROTO_TCP:
219 case IPPROTO_UDP:
220 case IPPROTO_ICMP:
221 case IPPROTO_ICMPV6:
222 case IPPROTO_SCTP:
223 break;
224 default:
225 err = "unexpected ULP";
226 goto bail;
227 }
228 }
229
230 /*
231 * If the first mblk of this packet contains only the Ethernet header,
232 * skip past it for now. Packets with their data contained in only a
233 * single mblk can then use the fastpaths tuned to that possibility.
234 */
235 mblk_t *skipped_hdr = NULL;
236 if (MBLKL(mp) == meoi.meoi_l2hlen) {
237 meoi.meoi_len -= meoi.meoi_l2hlen;
238 meoi.meoi_l2hlen = 0;
239 skipped_hdr = mp;
240 mp = mp->b_cont;
241
242 ASSERT(mp != NULL);
243 }
244
245 /*
246 * Ensure that all of the headers we need to access are:
247 * 1. Collected in the first mblk
248 * 2. Held in a data-block which is safe for us to modify
249 * (It must have a refcount of 1)
250 */
251 const size_t hdr_len_reqd = (meoi.meoi_l2hlen + meoi.meoi_l3hlen) +
252 (do_ulp_cksum ? meoi.meoi_l4hlen : 0);
253 if (MBLKL(mp) < hdr_len_reqd || DB_REF(mp) > 1) {
254 mblk_t *hdrmp = msgpullup(mp, hdr_len_reqd);
255
256 if (hdrmp == NULL) {
257 err = "could not pullup msg headers";
258 goto bail;
259 }
260
261 mac_hcksum_clone(mp, hdrmp);
262 if (skipped_hdr != NULL) {
263 ASSERT3P(skipped_hdr->b_cont, ==, mp);
264 skipped_hdr->b_cont = hdrmp;
265 }
266 freemsg(mp);
267 mp = hdrmp;
268 }
269
270 /* Calculate IPv4 header checksum, if requested */
271 if (do_v4_cksum) {
272 /*
273 * While unlikely, it's possible to write code that might end up
274 * calling mac_sw_cksum() twice on the same mblk (performing
275 * both LSO and checksum emulation in a single mblk chain loop
276 * -- the LSO emulation inserts a new chain into the existing
277 * chain and then the loop iterates back over the new segments
278 * and emulates the checksum a second time). Normally this
279 * wouldn't be a problem, because the HCK_*_OK flags are
280 * supposed to indicate that we don't need to do peform the
281 * work. But HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the
282 * same value; so we cannot use these flags to determine if the
283 * IP header checksum has already been calculated or not. For
284 * this reason, we zero out the the checksum first. In the
285 * future, we should fix the HCK_* flags.
286 */
287 ipha_t *ipha = (ipha_t *)(mp->b_rptr + meoi.meoi_l2hlen);
288 ipha->ipha_hdr_checksum = 0;
289 ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
290 flags &= ~HCK_IPV4_HDRCKSUM;
291 flags |= HCK_IPV4_HDRCKSUM_OK;
292 }
293
294 /*
295 * The SCTP is different from all the other protocols in that it uses
296 * CRC32 for its checksum, rather than ones' complement.
297 */
298 if (do_ulp_cksum && meoi.meoi_l4proto == IPPROTO_SCTP) {
299 if (ulp_prefer_partial) {
300 err = "SCTP does not support partial checksum";
301 goto bail;
302 }
303
304 const uint_t ulp_off = meoi.meoi_l2hlen + meoi.meoi_l3hlen;
305 sctp_hdr_t *sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_off);
306
307 sctph->sh_chksum = 0;
308 sctph->sh_chksum = sctp_cksum(mp, ulp_off);
309
310 flags &= ~HCK_FULLCKSUM;
311 flags |= HCK_FULLCKSUM_OK;
312 goto success;
313 }
314
315 /* Calculate full ULP checksum, if requested */
316 if (do_ulp_cksum && !ulp_prefer_partial) {
317 /*
318 * Calculate address and length portions of pseudo-header csum
319 */
320 uint32_t cksum = 0;
321 if (meoi.meoi_l3proto == ETHERTYPE_IP) {
322 const ipha_t *ipha =
323 (const ipha_t *)(mp->b_rptr + meoi.meoi_l2hlen);
324 const uint16_t *ipp =
325 (const uint16_t *)(&ipha->ipha_src);
326
327 cksum += ipp[0] + ipp[1] + ipp[2] + ipp[3];
328
329 /*
330 * While it is tempting to calculate the payload length
331 * solely from `meoi`, like as done below for IPv6,
332 * doing so is a trap. Packets shorter than 60 bytes
333 * will get padded out to that length in order to meet
334 * the minimums for Ethernet. Instead, we pull the
335 * length from the IP header.
336 */
337 const uint16_t payload_len =
338 ntohs(ipha->ipha_length) - meoi.meoi_l3hlen;
339 cksum += htons(payload_len);
340 } else if (meoi.meoi_l3proto == ETHERTYPE_IPV6) {
341 const ip6_t *ip6h =
342 (const ip6_t *)(mp->b_rptr + meoi.meoi_l2hlen);
343 const uint16_t *ipp =
344 (const uint16_t *)(&ip6h->ip6_src);
345
346 cksum += ipp[0] + ipp[1] + ipp[2] + ipp[3] +
347 ipp[4] + ipp[5] + ipp[6] + ipp[7];
348 cksum += ipp[8] + ipp[9] + ipp[10] + ipp[11] +
349 ipp[12] + ipp[13] + ipp[14] + ipp[15];
350
351 const uint16_t payload_len = meoi.meoi_len -
352 ((uint16_t)meoi.meoi_l2hlen + meoi.meoi_l3hlen);
353 cksum += htons(payload_len);
354 } else {
355 /*
356 * Since we already checked for recognized L3 protocols
357 * earlier, this should not be reachable.
358 */
359 panic("L3 protocol unexpectedly changed");
360 }
361
362 /* protocol portion of pseudo-header */
363 uint_t cksum_off;
364 switch (meoi.meoi_l4proto) {
365 case IPPROTO_TCP:
366 cksum += IP_TCP_CSUM_COMP;
367 cksum_off = TCP_CHECKSUM_OFFSET;
368 break;
369 case IPPROTO_UDP:
370 cksum += IP_UDP_CSUM_COMP;
371 cksum_off = UDP_CHECKSUM_OFFSET;
372 break;
373 case IPPROTO_ICMP:
374 /* ICMP cksum does not include pseudo-header contents */
375 cksum = 0;
376 cksum_off = ICMP_CHECKSUM_OFFSET;
377 break;
378 case IPPROTO_ICMPV6:
379 cksum += IP_ICMPV6_CSUM_COMP;
380 cksum_off = ICMPV6_CHECKSUM_OFFSET;
381 break;
382 default:
383 err = "unrecognized L4 protocol";
384 goto bail;
385 }
386
387 /*
388 * With IP_CSUM() taking into account the pseudo-header
389 * checksum, make sure the ULP checksum field is zeroed before
390 * computing the rest;
391 */
392 const uint_t l4_off = meoi.meoi_l3hlen + meoi.meoi_l2hlen;
393 uint16_t *up = (uint16_t *)(mp->b_rptr + l4_off + cksum_off);
394 *up = 0;
395 cksum = IP_CSUM(mp, l4_off, cksum);
396
397 if (meoi.meoi_l4proto == IPPROTO_UDP && cksum == 0) {
398 /*
399 * A zero checksum is not allowed on UDPv6, and on UDPv4
400 * implies no checksum. In either case, invert to a
401 * values of all-1s.
402 */
403 *up = 0xffff;
404 } else {
405 *up = cksum;
406 }
407
408 flags &= ~HCK_FULLCKSUM;
409 flags |= HCK_FULLCKSUM_OK;
410 goto success;
411 }
412
413 /* Calculate partial ULP checksum, if requested */
414 if (do_ulp_cksum && ulp_prefer_partial) {
415 uint32_t start, stuff, end, value;
416 mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL);
417
418 ASSERT3S(end, >, start);
419
420 /*
421 * The prior size checks against the header length data ensure
422 * that the mblk contains everything through at least the ULP
423 * header, but if the partial checksum (unexpectedly) requests
424 * its result be stored past that, we cannot continue.
425 */
426 if (stuff + sizeof (uint16_t) > MBLKL(mp)) {
427 err = "partial csum request is out of bounds";
428 goto bail;
429 }
430
431 uchar_t *ipp = (uchar_t *)(mp->b_rptr + meoi.meoi_l2hlen);
432 uint16_t *up = (uint16_t *)(ipp + stuff);
433
434 const uint16_t partial = *up;
435 *up = 0;
436 const uint16_t cksum =
437 ~IP_CSUM_PARTIAL(mp, start + meoi.meoi_l2hlen, partial);
438 *up = cksum != 0 ? cksum : ~cksum;
439
440 flags &= ~HCK_PARTIALCKSUM;
441 flags |= HCK_FULLCKSUM_OK;
442 }
443
444 success:
445 /*
446 * With the checksum(s) calculated, store the updated flags to reflect
447 * the current status, and zero out any of the partial-checksum fields
448 * which would be irrelevant now.
449 */
450 mac_hcksum_set(mp, 0, 0, 0, 0, flags);
451
452 /* Don't forget to reattach the header. */
453 if (skipped_hdr != NULL) {
454 ASSERT3P(skipped_hdr->b_cont, ==, mp);
455
456 /*
457 * Duplicate the HCKSUM data into the header mblk.
458 *
459 * This mimics mac_add_vlan_tag() which ensures that both the
460 * first mblk _and_ the first data bearing mblk possess the
461 * HCKSUM information. Consumers like IP will end up discarding
462 * the ether_header mblk, so for now, it is important that the
463 * data be available in both places.
464 */
465 mac_hcksum_clone(mp, skipped_hdr);
466 mp = skipped_hdr;
467 }
468 return (mp);
469
470 bail:
471 if (skipped_hdr != NULL) {
472 ASSERT3P(skipped_hdr->b_cont, ==, mp);
473 mp = skipped_hdr;
474 }
475
476 mac_drop_pkt(mp, err);
477 return (NULL);
478 }
479
480 /*
481 * Build a single data segment from an LSO packet. The mblk chain
482 * returned, seg_head, represents the data segment and is always
483 * exactly seg_len bytes long. The lso_mp and offset input/output
484 * parameters track our position in the LSO packet. This function
485 * exists solely as a helper to mac_sw_lso().
486 *
487 * Case A
488 *
489 * The current lso_mp is larger than the requested seg_len. The
490 * beginning of seg_head may start at the beginning of lso_mp or
491 * offset into it. In either case, a single mblk is returned, and
492 * *offset is updated to reflect our new position in the current
493 * lso_mp.
494 *
495 * +----------------------------+
496 * | in *lso_mp / out *lso_mp |
497 * +----------------------------+
498 * ^ ^
499 * | |
500 * | |
501 * | |
502 * +------------------------+
503 * | seg_head |
504 * +------------------------+
505 * ^ ^
506 * | |
507 * in *offset = 0 out *offset = seg_len
508 *
509 * |------ seg_len ----|
510 *
511 *
512 * +------------------------------+
513 * | in *lso_mp / out *lso_mp |
514 * +------------------------------+
515 * ^ ^
516 * | |
517 * | |
518 * | |
519 * +------------------------+
520 * | seg_head |
521 * +------------------------+
522 * ^ ^
523 * | |
524 * in *offset = N out *offset = N + seg_len
525 *
526 * |------ seg_len ----|
527 *
528 *
529 *
530 * Case B
531 *
532 * The requested seg_len consumes exactly the rest of the lso_mp.
533 * I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr.
534 * The seg_head may start at the beginning of the lso_mp or at some
535 * offset into it. In either case we return a single mblk, reset
536 * *offset to zero, and walk to the next lso_mp.
537 *
538 * +------------------------+ +------------------------+
539 * | in *lso_mp |---------->| out *lso_mp |
540 * +------------------------+ +------------------------+
541 * ^ ^ ^
542 * | | |
543 * | | out *offset = 0
544 * | |
545 * +------------------------+
546 * | seg_head |
547 * +------------------------+
548 * ^
549 * |
550 * in *offset = 0
551 *
552 * |------ seg_len ----|
553 *
554 *
555 *
556 * +----------------------------+ +------------------------+
557 * | in *lso_mp |---------->| out *lso_mp |
558 * +----------------------------+ +------------------------+
559 * ^ ^ ^
560 * | | |
561 * | | out *offset = 0
562 * | |
563 * +------------------------+
564 * | seg_head |
565 * +------------------------+
566 * ^
567 * |
568 * in *offset = N
569 *
570 * |------ seg_len ----|
571 *
572 *
573 * Case C
574 *
575 * The requested seg_len is greater than the current lso_mp. In
576 * this case we must consume LSO mblks until we have enough data to
577 * satisfy either case (A) or (B) above. We will return multiple
578 * mblks linked via b_cont, offset will be set based on the cases
579 * above, and lso_mp will walk forward at least one mblk, but maybe
580 * more.
581 *
 * N.B. This diagram is not exhaustive. The seg_head may start on
583 * the beginning of an lso_mp. The seg_tail may end exactly on the
584 * boundary of an lso_mp. And there may be two (in this case the
585 * middle block wouldn't exist), three, or more mblks in the
586 * seg_head chain. This is meant as one example of what might
587 * happen. The main thing to remember is that the seg_tail mblk
588 * must be one of case (A) or (B) above.
589 *
590 * +------------------+ +----------------+ +------------------+
591 * | in *lso_mp |--->| *lso_mp |--->| out *lso_mp |
592 * +------------------+ +----------------+ +------------------+
593 * ^ ^ ^ ^ ^ ^
594 * | | | | | |
595 * | | | | | |
596 * | | | | | |
597 * | | | | | |
598 * +------------+ +----------------+ +------------+
599 * | seg_head |--->| |--->| seg_tail |
600 * +------------+ +----------------+ +------------+
601 * ^ ^
602 * | |
603 * in *offset = N out *offset = MBLKL(seg_tail)
604 *
605 * |------------------- seg_len -------------------|
606 *
607 */
static mblk_t *
build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len)
{
	mblk_t *seg_head, *seg_tail, *seg_mp;

	ASSERT3P(*lso_mp, !=, NULL);
	/* The requested offset must land strictly inside the current mblk. */
	ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr);

	/* dupb() shares the data block; we only adjust the r/w pointers. */
	seg_mp = dupb(*lso_mp);
	if (seg_mp == NULL)
		return (NULL);

	seg_head = seg_mp;
	seg_tail = seg_mp;

	/* Continue where we left off from in the lso_mp. */
	seg_mp->b_rptr += *offset;

last_mblk:
	/* Case (A) */
	if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) {
		*offset += seg_len;
		seg_mp->b_wptr = seg_mp->b_rptr + seg_len;
		return (seg_head);
	}

	/* Case (B) */
	if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) {
		*offset = 0;
		*lso_mp = (*lso_mp)->b_cont;
		return (seg_head);
	}

	/* Case (C) */
	ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr);

	/*
	 * The current LSO mblk doesn't have enough data to satisfy
	 * seg_len -- continue peeling off LSO mblks to build the new
	 * segment message. If allocation fails we free the previously
	 * allocated segment mblks and return NULL.
	 */
	while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) {
		ASSERT3U(MBLKL(seg_mp), <=, seg_len);
		seg_len -= MBLKL(seg_mp);
		*offset = 0;
		*lso_mp = (*lso_mp)->b_cont;
		seg_mp = dupb(*lso_mp);

		if (seg_mp == NULL) {
			freemsgchain(seg_head);
			return (NULL);
		}

		seg_tail->b_cont = seg_mp;
		seg_tail = seg_mp;
	}

	/*
	 * We've walked enough LSO mblks that we can now satisfy the
	 * remaining seg_len. At this point we need to jump back to
	 * determine if we have arrived at case (A) or (B).
	 */

	/* Just to be paranoid that we didn't underflow. */
	ASSERT3U(seg_len, <, IP_MAXPACKET);
	ASSERT3U(seg_len, >, 0);
	goto last_mblk;
}
677
678 /*
679 * Perform software segmentation of a single LSO message. Take an LSO
680 * message as input and return head/tail pointers as output. This
681 * function should not be invoked directly but instead through
682 * mac_hw_emul().
683 *
684 * The resulting chain is comprised of multiple (nsegs) MSS sized
685 * segments. Each segment will consist of two or more mblks joined by
686 * b_cont: a header and one or more data mblks. The header mblk is
687 * allocated anew for each message. The first segment's header is used
688 * as a template for the rest with adjustments made for things such as
689 * ID, sequence, length, TCP flags, etc. The data mblks reference into
690 * the existing LSO mblk (passed in as omp) by way of dupb(). Their
691 * b_rptr/b_wptr values are adjusted to reference only the fraction of
692 * the LSO message they are responsible for. At the successful
693 * completion of this function the original mblk (omp) is freed,
694 * leaving the newely created segment chain as the only remaining
695 * reference to the data.
696 */
697 static void
mac_sw_lso(mblk_t * omp,mac_emul_t emul,mblk_t ** head,mblk_t ** tail,uint_t * count)698 mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail,
699 uint_t *count)
700 {
701 uint32_t ocsum_flags, ocsum_start, ocsum_stuff;
702 uint32_t mss;
703 uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen;
704 uint32_t oleft;
705 uint_t nsegs, seg;
706 int len;
707
708 struct ether_vlan_header *oevh;
709 const ipha_t *oiph;
710 const tcph_t *otcph;
711 ipha_t *niph;
712 tcph_t *ntcph;
713 uint16_t ip_id;
714 uint32_t tcp_seq, tcp_sum, otcp_sum;
715
716 uint32_t offset;
717 mblk_t *odatamp;
718 mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp;
719 mblk_t *tmptail;
720
721 ASSERT3P(head, !=, NULL);
722 ASSERT3P(tail, !=, NULL);
723 ASSERT3P(count, !=, NULL);
724 ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0);
725
726 /* Assume we are dealing with a single LSO message. */
727 ASSERT3P(omp->b_next, ==, NULL);
728
729 /*
730 * XXX: This is a hack to deal with mac_add_vlan_tag().
731 *
732 * When VLANs are in play, mac_add_vlan_tag() creates a new
733 * mblk with just the ether_vlan_header and tacks it onto the
734 * front of 'omp'. This breaks the assumptions made below;
735 * namely that the TCP/IP headers are in the first mblk. In
736 * this case, since we already have to pay the cost of LSO
737 * emulation, we simply pull up everything. While this might
738 * seem irksome, keep in mind this will only apply in a couple
739 * of scenarios: a) an LSO-capable VLAN client sending to a
740 * non-LSO-capable client over the "MAC/bridge loopback"
741 * datapath or b) an LSO-capable VLAN client is sending to a
742 * client that, for whatever reason, doesn't have DLS-bypass
743 * enabled. Finally, we have to check for both a tagged and
744 * untagged sized mblk depending on if the mblk came via
745 * mac_promisc_dispatch() or mac_rx_deliver().
746 *
747 * In the future, two things should be done:
748 *
749 * 1. This function should make use of some yet to be
750 * implemented "mblk helpers". These helper functions would
751 * perform all the b_cont walking for us and guarantee safe
752 * access to the mblk data.
753 *
754 * 2. We should add some slop to the mblks so that
755 * mac_add_vlan_tag() can just edit the first mblk instead
756 * of allocating on the hot path.
757 */
758 if (MBLKL(omp) == sizeof (struct ether_vlan_header) ||
759 MBLKL(omp) == sizeof (struct ether_header)) {
760 mblk_t *tmp = msgpullup(omp, -1);
761
762 if (tmp == NULL) {
763 mac_drop_pkt(omp, "failed to pull up");
764 goto fail;
765 }
766
767 mac_hcksum_clone(omp, tmp);
768 freemsg(omp);
769 omp = tmp;
770 }
771
772 mss = DB_LSOMSS(omp);
773 ASSERT3U(msgsize(omp), <=, IP_MAXPACKET +
774 sizeof (struct ether_vlan_header));
775 opktlen = msgsize(omp);
776
777 /*
778 * First, get references to the IP and TCP headers and
779 * determine the total TCP length (header + data).
780 *
781 * Thanks to mac_hw_emul() we know that the first mblk must
782 * contain (at minimum) the full L2 header. However, this
783 * function assumes more than that. It assumes the L2/L3/L4
784 * headers are all contained in the first mblk of a message
785 * (i.e., no b_cont walking for headers). While this is a
786 * current reality (our native TCP stack and viona both
787 * enforce this) things may become more nuanced in the future
788 * (e.g. when introducing encap support or adding new
789 * clients). For now we guard against this case by dropping
790 * the packet.
791 */
792 oevh = (struct ether_vlan_header *)omp->b_rptr;
793 if (oevh->ether_tpid == htons(ETHERTYPE_VLAN))
794 oehlen = sizeof (struct ether_vlan_header);
795 else
796 oehlen = sizeof (struct ether_header);
797
798 ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t)));
799 if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) {
800 mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers");
801 goto fail;
802 }
803
804 oiph = (ipha_t *)(omp->b_rptr + oehlen);
805 oiphlen = IPH_HDR_LENGTH(oiph);
806 otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen);
807 otcphlen = TCP_HDR_LENGTH(otcph);
808
809 /*
810 * Currently we only support LSO for TCP/IPv4.
811 */
812 if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) {
813 mac_drop_pkt(omp, "LSO unsupported IP version: %uhh",
814 IPH_HDR_VERSION(oiph));
815 goto fail;
816 }
817
818 if (oiph->ipha_protocol != IPPROTO_TCP) {
819 mac_drop_pkt(omp, "LSO unsupported protocol: %uhh",
820 oiph->ipha_protocol);
821 goto fail;
822 }
823
824 if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) {
825 mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set");
826 goto fail;
827 }
828
829 ohdrslen = oehlen + oiphlen + otcphlen;
830 if ((len = MBLKL(omp)) < ohdrslen) {
831 mac_drop_pkt(omp, "LSO packet too short: %d < %u", len,
832 ohdrslen);
833 goto fail;
834 }
835
836 /*
837 * Either we have data in the first mblk or it's just the
838 * header. In either case, we need to set rptr to the start of
839 * the TCP data.
840 */
841 if (len > ohdrslen) {
842 odatamp = omp;
843 offset = ohdrslen;
844 } else {
845 ASSERT3U(len, ==, ohdrslen);
846 odatamp = omp->b_cont;
847 offset = 0;
848 }
849
850 /* Make sure we still have enough data. */
851 ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen);
852
853 /*
854 * If a MAC negotiated LSO then it must negotioate both
855 * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or
856 * HCKSUM_INET_PARTIAL; because both the IP and TCP headers
857 * change during LSO segmentation (only the 3 fields of the
858 * pseudo header checksum don't change: src, dst, proto). Thus
859 * we would expect these flags (HCK_IPV4_HDRCKSUM |
860 * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this
861 * function to emulate those checksums in software. However,
862 * that assumes a world where we only expose LSO if the
863 * underlying hardware exposes LSO. Moving forward the plan is
864 * to assume LSO in the upper layers and have MAC perform
865 * software LSO when the underlying provider doesn't support
866 * it. In such a world, if the provider doesn't support LSO
867 * but does support hardware checksum offload, then we could
868 * simply perform the segmentation and allow the hardware to
869 * calculate the checksums. To the hardware it's just another
870 * chain of non-LSO packets.
871 */
872 ASSERT3S(DB_TYPE(omp), ==, M_DATA);
873 ocsum_flags = DB_CKSUMFLAGS(omp);
874 ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0);
875 ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0);
876
877 /*
878 * If hardware only provides partial checksum then software
879 * must supply the pseudo-header checksum. In the case of LSO
880 * we leave the TCP length at zero to be filled in by
881 * hardware. This function must handle two scenarios.
882 *
883 * 1. Being called by a MAC client on the Rx path to segment
884 * an LSO packet and calculate the checksum.
885 *
886 * 2. Being called by a MAC provider to segment an LSO packet.
887 * In this case the LSO segmentation is performed in
888 * software (by this routine) but the MAC provider should
889 * still calculate the TCP/IP checksums in hardware.
890 *
891 * To elaborate on the second case: we cannot have the
892 * scenario where IP sends LSO packets but the underlying HW
893 * doesn't support checksum offload -- because in that case
894 * TCP/IP would calculate the checksum in software (for the
895 * LSO packet) but then MAC would segment the packet and have
896 * to redo all the checksum work. So IP should never do LSO
897 * if HW doesn't support both IP and TCP checksum.
898 */
899 if (ocsum_flags & HCK_PARTIALCKSUM) {
900 ocsum_start = (uint32_t)DB_CKSUMSTART(omp);
901 ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp);
902 }
903
904 odatalen = opktlen - ohdrslen;
905
906 /*
907 * Subtract one to account for the case where the data length
908 * is evenly divisble by the MSS. Add one to account for the
909 * fact that the division will always result in one less
910 * segment than needed.
911 */
912 nsegs = ((odatalen - 1) / mss) + 1;
913 if (nsegs < 2) {
914 mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs);
915 goto fail;
916 }
917
918 DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph,
919 __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t,
920 nsegs);
921
922 seg_chain = NULL;
923 tmptail = seg_chain;
924 oleft = odatalen;
925
926 for (uint_t i = 0; i < nsegs; i++) {
927 boolean_t last_seg = ((i + 1) == nsegs);
928 uint32_t seg_len;
929
930 /*
931 * If we fail to allocate, then drop the partially
932 * allocated chain as well as the LSO packet. Let the
933 * sender deal with the fallout.
934 */
935 if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) {
936 freemsgchain(seg_chain);
937 mac_drop_pkt(omp, "failed to alloc segment header");
938 goto fail;
939 }
940 ASSERT3P(nhdrmp->b_cont, ==, NULL);
941
942 if (seg_chain == NULL) {
943 seg_chain = nhdrmp;
944 } else {
945 ASSERT3P(tmptail, !=, NULL);
946 tmptail->b_next = nhdrmp;
947 }
948
949 tmptail = nhdrmp;
950
951 /*
952 * Calculate this segment's lengh. It's either the MSS
953 * or whatever remains for the last segment.
954 */
955 seg_len = last_seg ? oleft : mss;
956 ASSERT3U(seg_len, <=, mss);
957 ndatamp = build_data_seg(&odatamp, &offset, seg_len);
958
959 if (ndatamp == NULL) {
960 freemsgchain(seg_chain);
961 mac_drop_pkt(omp, "LSO failed to segment data");
962 goto fail;
963 }
964
965 /* Attach data mblk to header mblk. */
966 nhdrmp->b_cont = ndatamp;
967 DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO;
968 ASSERT3U(seg_len, <=, oleft);
969 oleft -= seg_len;
970 }
971
972 /* We should have consumed entire LSO msg. */
973 ASSERT3S(oleft, ==, 0);
974 ASSERT3P(odatamp, ==, NULL);
975
976 /*
977 * All seg data mblks are referenced by the header mblks, null
978 * out this pointer to catch any bad derefs.
979 */
980 ndatamp = NULL;
981
982 /*
983 * Set headers and checksum for first segment.
984 */
985 nhdrmp = seg_chain;
986 bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen);
987 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
988 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
989 ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);
990 niph->ipha_length = htons(oiphlen + otcphlen + mss);
991 niph->ipha_hdr_checksum = 0;
992 ip_id = ntohs(niph->ipha_ident);
993 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
994 tcp_seq = BE32_TO_U32(ntcph->th_seq);
995 tcp_seq += mss;
996
997 /*
998 * The first segment shouldn't:
999 *
1000 * o indicate end of data transmission (FIN),
1001 * o indicate immediate handling of the data (PUSH).
1002 */
1003 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
1004 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1005
1006 /*
1007 * If the underlying HW provides partial checksum, then make
1008 * sure to correct the pseudo header checksum before calling
1009 * mac_sw_cksum(). The native TCP stack doesn't include the
1010 * length field in the pseudo header when LSO is in play -- so
1011 * we need to calculate it here.
1012 */
1013 if (ocsum_flags & HCK_PARTIALCKSUM) {
1014 DB_CKSUMSTART(nhdrmp) = ocsum_start;
1015 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1016 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1017 tcp_sum = BE16_TO_U16(ntcph->th_sum);
1018 otcp_sum = tcp_sum;
1019 tcp_sum += mss + otcphlen;
1020 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1021 U16_TO_BE16(tcp_sum, ntcph->th_sum);
1022 }
1023
1024 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1025 (emul & MAC_HWCKSUM_EMULS)) {
1026 next_nhdrmp = nhdrmp->b_next;
1027 nhdrmp->b_next = NULL;
1028 nhdrmp = mac_sw_cksum(nhdrmp, emul);
1029 /*
1030 * The mblk could be replaced (via pull-up) or freed (due to
1031 * failure) during mac_sw_cksum(), so we must take care with the
1032 * result here.
1033 */
1034 if (nhdrmp != NULL) {
1035 nhdrmp->b_next = next_nhdrmp;
1036 next_nhdrmp = NULL;
1037 seg_chain = nhdrmp;
1038 } else {
1039 freemsgchain(next_nhdrmp);
1040 /*
1041 * nhdrmp referenced the head of seg_chain when it was
1042 * freed, so further clean-up there is unnecessary
1043 */
1044 seg_chain = NULL;
1045 mac_drop_pkt(omp, "LSO cksum emulation failed");
1046 goto fail;
1047 }
1048 }
1049
1050 ASSERT3P(nhdrmp, !=, NULL);
1051
1052 seg = 1;
1053 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1054 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1055 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss,
1056 uint_t, seg);
1057 seg++;
1058
1059 /* There better be at least 2 segs. */
1060 ASSERT3P(nhdrmp->b_next, !=, NULL);
1061 prev_nhdrmp = nhdrmp;
1062 nhdrmp = nhdrmp->b_next;
1063
1064 /*
1065 * Now adjust the headers of the middle segments. For each
1066 * header we need to adjust the following.
1067 *
1068 * o IP ID
1069 * o IP length
1070 * o TCP sequence
1071 * o TCP flags
1072 * o cksum flags
1073 * o cksum values (if MAC_HWCKSUM_EMUL is set)
1074 */
1075 for (; seg < nsegs; seg++) {
1076 /*
1077 * We use seg_chain as a reference to the first seg
1078 * header mblk -- this first header is a template for
1079 * the rest of the segments. This copy will include
1080 * the now updated checksum values from the first
1081 * header. We must reset these checksum values to
1082 * their original to make sure we produce the correct
1083 * value.
1084 */
1085 bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
1086 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1087 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1088 niph->ipha_ident = htons(++ip_id);
1089 ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss);
1090 niph->ipha_length = htons(oiphlen + otcphlen + mss);
1091 niph->ipha_hdr_checksum = 0;
1092 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1093 U32_TO_BE32(tcp_seq, ntcph->th_seq);
1094 tcp_seq += mss;
1095 /*
1096 * Just like the first segment, the middle segments
1097 * shouldn't have these flags set.
1098 */
1099 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
1100 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1101
1102 if (ocsum_flags & HCK_PARTIALCKSUM) {
1103 /*
1104 * First and middle segs have same
1105 * pseudo-header checksum.
1106 */
1107 U16_TO_BE16(tcp_sum, ntcph->th_sum);
1108 DB_CKSUMSTART(nhdrmp) = ocsum_start;
1109 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1110 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1111 }
1112
1113 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1114 (emul & MAC_HWCKSUM_EMULS)) {
1115 next_nhdrmp = nhdrmp->b_next;
1116 nhdrmp->b_next = NULL;
1117 nhdrmp = mac_sw_cksum(nhdrmp, emul);
1118 /*
1119 * Like above, handle cases where mac_sw_cksum() does a
1120 * pull-up or drop of the mblk.
1121 */
1122 if (nhdrmp != NULL) {
1123 nhdrmp->b_next = next_nhdrmp;
1124 next_nhdrmp = NULL;
1125 prev_nhdrmp->b_next = nhdrmp;
1126 } else {
1127 freemsgchain(next_nhdrmp);
1128 /*
1129 * Critical to de-link the now-freed nhdrmp
1130 * before freeing the rest of the preceding
1131 * chain.
1132 */
1133 prev_nhdrmp->b_next = NULL;
1134 freemsgchain(seg_chain);
1135 seg_chain = NULL;
1136 mac_drop_pkt(omp, "LSO cksum emulation failed");
1137 goto fail;
1138 }
1139 }
1140
1141 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1142 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1143 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen),
1144 uint_t, mss, uint_t, seg);
1145
1146 ASSERT3P(nhdrmp->b_next, !=, NULL);
1147 prev_nhdrmp = nhdrmp;
1148 nhdrmp = nhdrmp->b_next;
1149 }
1150
1151 /* Make sure we are on the last segment. */
1152 ASSERT3U(seg, ==, nsegs);
1153 ASSERT3P(nhdrmp->b_next, ==, NULL);
1154
1155 /*
1156 * Now we set the last segment header. The difference being
1157 * that FIN/PSH/RST flags are allowed.
1158 */
1159 bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
1160 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1161 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1162 niph->ipha_ident = htons(++ip_id);
1163 len = msgsize(nhdrmp->b_cont);
1164 ASSERT3S(len, >, 0);
1165 niph->ipha_length = htons(oiphlen + otcphlen + len);
1166 niph->ipha_hdr_checksum = 0;
1167 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1168 U32_TO_BE32(tcp_seq, ntcph->th_seq);
1169
1170 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1171 if (ocsum_flags & HCK_PARTIALCKSUM) {
1172 DB_CKSUMSTART(nhdrmp) = ocsum_start;
1173 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1174 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1175 tcp_sum = otcp_sum;
1176 tcp_sum += len + otcphlen;
1177 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1178 U16_TO_BE16(tcp_sum, ntcph->th_sum);
1179 }
1180
1181 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1182 (emul & MAC_HWCKSUM_EMULS)) {
1183 /* This should be the last mblk. */
1184 ASSERT3P(nhdrmp->b_next, ==, NULL);
1185 nhdrmp = mac_sw_cksum(nhdrmp, emul);
1186 /*
1187 * If the final mblk happens to be dropped as part of
1188 * mac_sw_cksum(), that is unfortunate, but it need not be a
1189 * show-stopper at this point. We can just pretend that final
1190 * packet was dropped in transit.
1191 */
1192 prev_nhdrmp->b_next = nhdrmp;
1193 }
1194
1195 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1196 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1197 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len,
1198 uint_t, seg);
1199
	/*
	 * Free the reference to the original LSO message as it is
	 * being replaced by seg_chain.
	 */
1204 freemsg(omp);
1205 *head = seg_chain;
1206 *tail = nhdrmp;
1207 *count = nsegs;
1208 return;
1209
1210 fail:
1211 *head = NULL;
1212 *tail = NULL;
1213 *count = 0;
1214 }
1215
1216 #define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM)
1217
1218 /*
1219 * Emulate various hardware offload features in software. Take a chain
1220 * of packets as input and emulate the hardware features specified in
1221 * 'emul'. The resulting chain's head pointer replaces the 'mp_chain'
1222 * pointer given as input, and its tail pointer is written to
1223 * '*otail'. The number of packets in the new chain is written to
1224 * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus
1225 * may be NULL. The 'mp_chain' argument may point to a NULL chain; in
1226 * which case 'mp_chain' will simply stay a NULL chain.
1227 *
1228 * While unlikely, it is technically possible that this function could
1229 * receive a non-NULL chain as input and return a NULL chain as output
1230 * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be
1231 * zero). This could happen if all the packets in the chain are
1232 * dropped or if we fail to allocate new mblks. In this case, there is
1233 * nothing for the caller to free. In any event, the caller shouldn't
1234 * assume that '*mp_chain' is non-NULL on return.
1235 *
1236 * This function was written with three main use cases in mind.
1237 *
1238 * 1. To emulate hardware offloads when traveling mac-loopback (two
1239 * clients on the same mac). This is wired up in mac_tx_send().
1240 *
1241 * 2. To provide hardware offloads to the client when the underlying
1242 * provider cannot. This is currently wired up in mac_tx() but we
1243 * still only negotiate offloads when the underlying provider
1244 * supports them.
1245 *
1246 * 3. To emulate real hardware in simnet.
1247 */
void
mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul)
{
	mblk_t *head = NULL, *tail = NULL;
	uint_t count = 0;

	/* Only the defined emulation flags may be requested. */
	ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0);
	ASSERT3P(mp_chain, !=, NULL);

	for (mblk_t *mp = *mp_chain; mp != NULL; ) {
		mblk_t *tmp, *next, *tmphead, *tmptail;
		struct ether_header *ehp;
		uint32_t flags;
		uint_t len = MBLKL(mp), l2len;

		/* Perform LSO/cksum one message at a time. */
		next = mp->b_next;
		mp->b_next = NULL;

		/*
		 * For our sanity the first mblk should contain at
		 * least the full L2 header.
		 */
		if (len < sizeof (struct ether_header)) {
			mac_drop_pkt(mp, "packet too short (A): %u", len);
			mp = next;
			continue;
		}

		ehp = (struct ether_header *)mp->b_rptr;
		if (ntohs(ehp->ether_type) == VLAN_TPID)
			l2len = sizeof (struct ether_vlan_header);
		else
			l2len = sizeof (struct ether_header);

		/*
		 * If the first mblk is solely the L2 header, then
		 * there better be more data.
		 */
		if (len < l2len || (len == l2len && mp->b_cont == NULL)) {
			mac_drop_pkt(mp, "packet too short (C): %u", len);
			mp = next;
			continue;
		}

		DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul);

		/*
		 * We use DB_CKSUMFLAGS (instead of mac_hcksum_get())
		 * because we don't want to mask-out the LSO flag.
		 */
		flags = DB_CKSUMFLAGS(mp);

		if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) {
			uint_t tmpcount = 0;

			/*
			 * LSO fix-up handles checksum emulation
			 * inline (if requested). It also frees mp.
			 */
			mac_sw_lso(mp, emul, &tmphead, &tmptail,
			    &tmpcount);
			if (tmphead == NULL) {
				/* mac_sw_lso() freed the mp. */
				mp = next;
				continue;
			}
			count += tmpcount;
		} else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) {
			tmp = mac_sw_cksum(mp, emul);
			if (tmp == NULL) {
				/* mac_sw_cksum() freed the mp. */
				mp = next;
				continue;
			}
			tmphead = tmp;
			tmptail = tmp;
			count++;
		} else {
			/* There is nothing to emulate. */
			tmp = mp;
			tmphead = tmp;
			tmptail = tmp;
			count++;
		}

		/*
		 * The tmp mblk chain is either the start of the new
		 * chain or added to the tail of the new chain.
		 */
		if (head == NULL) {
			head = tmphead;
			tail = tmptail;
		} else {
			/* Attach the new mblk to the end of the new chain. */
			tail->b_next = tmphead;
			tail = tmptail;
		}

		mp = next;
	}

	/* Hand the rebuilt chain (possibly empty) back to the caller. */
	*mp_chain = head;

	if (otail != NULL)
		*otail = tail;

	if (ocount != NULL)
		*ocount = count;
}
1358
1359 /*
1360 * Add VLAN tag to the specified mblk.
1361 */
1362 mblk_t *
mac_add_vlan_tag(mblk_t * mp,uint_t pri,uint16_t vid)1363 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
1364 {
1365 mblk_t *hmp;
1366 struct ether_vlan_header *evhp;
1367 struct ether_header *ehp;
1368
1369 ASSERT(pri != 0 || vid != 0);
1370
1371 /*
1372 * Allocate an mblk for the new tagged ethernet header,
1373 * and copy the MAC addresses and ethertype from the
1374 * original header.
1375 */
1376
1377 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
1378 if (hmp == NULL) {
1379 freemsg(mp);
1380 return (NULL);
1381 }
1382
1383 evhp = (struct ether_vlan_header *)hmp->b_rptr;
1384 ehp = (struct ether_header *)mp->b_rptr;
1385
1386 bcopy(ehp, evhp, (ETHERADDRL * 2));
1387 evhp->ether_type = ehp->ether_type;
1388 evhp->ether_tpid = htons(ETHERTYPE_VLAN);
1389
1390 hmp->b_wptr += sizeof (struct ether_vlan_header);
1391 mp->b_rptr += sizeof (struct ether_header);
1392
1393 /*
1394 * Free the original message if it's now empty. Link the
1395 * rest of messages to the header message.
1396 */
1397 mac_hcksum_clone(mp, hmp);
1398 if (MBLKL(mp) == 0) {
1399 hmp->b_cont = mp->b_cont;
1400 freeb(mp);
1401 } else {
1402 hmp->b_cont = mp;
1403 }
1404 ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
1405
1406 /*
1407 * Initialize the new TCI (Tag Control Information).
1408 */
1409 evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
1410
1411 return (hmp);
1412 }
1413
1414 /*
1415 * Adds a VLAN tag with the specified VID and priority to each mblk of
1416 * the specified chain.
1417 */
1418 mblk_t *
mac_add_vlan_tag_chain(mblk_t * mp_chain,uint_t pri,uint16_t vid)1419 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
1420 {
1421 mblk_t *next_mp, **prev, *mp;
1422
1423 mp = mp_chain;
1424 prev = &mp_chain;
1425
1426 while (mp != NULL) {
1427 next_mp = mp->b_next;
1428 mp->b_next = NULL;
1429 if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
1430 freemsgchain(next_mp);
1431 break;
1432 }
1433 *prev = mp;
1434 prev = &mp->b_next;
1435 mp = mp->b_next = next_mp;
1436 }
1437
1438 return (mp_chain);
1439 }
1440
1441 /*
1442 * Strip VLAN tag
1443 */
1444 mblk_t *
mac_strip_vlan_tag(mblk_t * mp)1445 mac_strip_vlan_tag(mblk_t *mp)
1446 {
1447 mblk_t *newmp;
1448 struct ether_vlan_header *evhp;
1449
1450 evhp = (struct ether_vlan_header *)mp->b_rptr;
1451 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
1452 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1453
1454 if (DB_REF(mp) > 1) {
1455 newmp = copymsg(mp);
1456 if (newmp == NULL)
1457 return (NULL);
1458 freemsg(mp);
1459 mp = newmp;
1460 }
1461
1462 evhp = (struct ether_vlan_header *)mp->b_rptr;
1463
1464 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1465 mp->b_rptr += VLAN_TAGSZ;
1466 }
1467 return (mp);
1468 }
1469
1470 /*
1471 * Strip VLAN tag from each mblk of the chain.
1472 */
1473 mblk_t *
mac_strip_vlan_tag_chain(mblk_t * mp_chain)1474 mac_strip_vlan_tag_chain(mblk_t *mp_chain)
1475 {
1476 mblk_t *mp, *next_mp, **prev;
1477
1478 mp = mp_chain;
1479 prev = &mp_chain;
1480
1481 while (mp != NULL) {
1482 next_mp = mp->b_next;
1483 mp->b_next = NULL;
1484 if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
1485 freemsgchain(next_mp);
1486 break;
1487 }
1488 *prev = mp;
1489 prev = &mp->b_next;
1490 mp = mp->b_next = next_mp;
1491 }
1492
1493 return (mp_chain);
1494 }
1495
1496 /*
1497 * Default callback function. Used when the datapath is not yet initialized.
1498 */
/* ARGSUSED */
void
mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain,
    boolean_t loopback)
{
	/* No receiver is wired up yet; simply discard the chain. */
	freemsgchain(mp_chain);
}
1506
1507 /*
1508 * Determines the IPv6 header length accounting for all the optional IPv6
1509 * headers (hop-by-hop, destination, routing and fragment). The header length
1510 * and next header value (a transport header) is captured.
1511 *
1512 * Returns B_FALSE if all the IP headers are not in the same mblk otherwise
1513 * returns B_TRUE.
1514 */
boolean_t
mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
    uint8_t *next_hdr, ip6_frag_t **fragp)
{
	uint16_t length;
	uint_t ehdrlen;
	uint8_t *whereptr;
	uint8_t *nexthdrp;
	ip6_dest_t *desthdr;
	ip6_rthdr_t *rthdr;
	ip6_frag_t *fraghdr;

	/* The fixed IPv6 header must lie entirely before endptr. */
	if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
		return (B_FALSE);
	ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
	length = IPV6_HDR_LEN;
	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */

	if (fragp != NULL)
		*fragp = NULL;

	nexthdrp = &ip6h->ip6_nxt;
	while (whereptr < endptr) {
		/* Is there enough left for len + nexthdr? */
		if (whereptr + MIN_EHDR_LEN > endptr)
			break;

		switch (*nexthdrp) {
		case IPPROTO_HOPOPTS:
		case IPPROTO_DSTOPTS:
			/* Assumes the headers are identical for hbh and dst */
			desthdr = (ip6_dest_t *)whereptr;
			/* ip6d_len counts 8-octet units past the first 8. */
			ehdrlen = 8 * (desthdr->ip6d_len + 1);
			if ((uchar_t *)desthdr + ehdrlen > endptr)
				return (B_FALSE);
			nexthdrp = &desthdr->ip6d_nxt;
			break;
		case IPPROTO_ROUTING:
			rthdr = (ip6_rthdr_t *)whereptr;
			ehdrlen = 8 * (rthdr->ip6r_len + 1);
			if ((uchar_t *)rthdr + ehdrlen > endptr)
				return (B_FALSE);
			nexthdrp = &rthdr->ip6r_nxt;
			break;
		case IPPROTO_FRAGMENT:
			fraghdr = (ip6_frag_t *)whereptr;
			ehdrlen = sizeof (ip6_frag_t);
			if ((uchar_t *)&fraghdr[1] > endptr)
				return (B_FALSE);
			nexthdrp = &fraghdr->ip6f_nxt;
			/* Report the fragment header if the caller wants it. */
			if (fragp != NULL)
				*fragp = fraghdr;
			break;
		case IPPROTO_NONE:
			/* No next header means we're finished */
		default:
			*hdr_length = length;
			*next_hdr = *nexthdrp;
			return (B_TRUE);
		}
		length += ehdrlen;
		whereptr += ehdrlen;
		*hdr_length = length;
		*next_hdr = *nexthdrp;
	}
	switch (*nexthdrp) {
	case IPPROTO_HOPOPTS:
	case IPPROTO_DSTOPTS:
	case IPPROTO_ROUTING:
	case IPPROTO_FRAGMENT:
		/*
		 * If any known extension headers are still to be processed,
		 * the packet's malformed (or at least all the IP header(s) are
		 * not in the same mblk - and that should never happen.
		 */
		return (B_FALSE);

	default:
		/*
		 * If we get here, we know that all of the IP headers were in
		 * the same mblk, even if the ULP header is in the next mblk.
		 */
		*hdr_length = length;
		*next_hdr = *nexthdrp;
		return (B_TRUE);
	}
}
1602
1603 /*
1604 * The following set of routines are there to take care of interrupt
1605 * re-targeting for legacy (fixed) interrupts. Some older versions
1606 * of the popular NICs like e1000g do not support MSI-X interrupts
1607 * and they reserve fixed interrupts for RX/TX rings. To re-target
1608 * these interrupts, PCITOOL ioctls need to be used.
1609 */
typedef struct mac_dladm_intr {
	int ino;			/* interrupt number (from pcitool) */
	int cpu_id;			/* CPU the interrupt is bound to */
	char driver_path[MAXPATHLEN];	/* minor path of the NIC device */
	char nexus_path[MAXPATHLEN];	/* "/devices...:intr" nexus path */
} mac_dladm_intr_t;
1616
1617 /* Bind the interrupt to cpu_num */
1618 static int
mac_set_intr(ldi_handle_t lh,processorid_t cpu_num,int oldcpuid,int ino)1619 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
1620 {
1621 pcitool_intr_set_t iset;
1622 int err;
1623
1624 iset.old_cpu = oldcpuid;
1625 iset.ino = ino;
1626 iset.cpu_id = cpu_num;
1627 iset.user_version = PCITOOL_VERSION;
1628 err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
1629 kcred, NULL);
1630
1631 return (err);
1632 }
1633
1634 /*
1635 * Search interrupt information. iget is filled in with the info to search
1636 */
1637 static boolean_t
mac_search_intrinfo(pcitool_intr_get_t * iget_p,mac_dladm_intr_t * dln)1638 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
1639 {
1640 int i;
1641 char driver_path[2 * MAXPATHLEN];
1642
1643 for (i = 0; i < iget_p->num_devs; i++) {
1644 (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
1645 (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
1646 ":%s%d", iget_p->dev[i].driver_name,
1647 iget_p->dev[i].dev_inst);
1648 /* Match the device path for the device path */
1649 if (strcmp(driver_path, dln->driver_path) == 0) {
1650 dln->ino = iget_p->ino;
1651 dln->cpu_id = iget_p->cpu_id;
1652 return (B_TRUE);
1653 }
1654 }
1655 return (B_FALSE);
1656 }
1657
1658 /*
1659 * Get information about ino, i.e. if this is the interrupt for our
1660 * device and where it is bound etc.
1661 */
static boolean_t
mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
    mac_dladm_intr_t *dln)
{
	pcitool_intr_get_t *iget_p;
	int ipsz;
	int nipsz;
	int err;
	uint8_t inum;

	/*
	 * Check if SLEEP is OK, i.e if could come here in response to
	 * changing the fanout due to some callback from the driver, say
	 * link speed changes.
	 */
	ipsz = PCITOOL_IGET_SIZE(0);
	iget_p = kmem_zalloc(ipsz, KM_SLEEP);

	/* First pass: query with room for zero devices to learn the count. */
	iget_p->num_devs_ret = 0;
	iget_p->user_version = PCITOOL_VERSION;
	iget_p->cpu_id = oldcpuid;
	iget_p->ino = ino;

	err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
	    FKIOCTL, kcred, NULL);
	if (err != 0) {
		kmem_free(iget_p, ipsz);
		return (B_FALSE);
	}
	if (iget_p->num_devs == 0) {
		/* No devices share this (cpu, ino); nothing to match. */
		kmem_free(iget_p, ipsz);
		return (B_FALSE);
	}
	/* NOTE(review): num_devs is narrowed to uint8_t here -- confirm it */
	/* can never exceed 255 for a single ino. */
	inum = iget_p->num_devs;
	if (iget_p->num_devs_ret < iget_p->num_devs) {
		/* Reallocate */
		nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);

		kmem_free(iget_p, ipsz);
		ipsz = nipsz;
		iget_p = kmem_zalloc(ipsz, KM_SLEEP);

		/* Second pass: re-issue the ioctl with enough space. */
		iget_p->num_devs_ret = inum;
		iget_p->cpu_id = oldcpuid;
		iget_p->ino = ino;
		iget_p->user_version = PCITOOL_VERSION;
		err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
		    FKIOCTL, kcred, NULL);
		if (err != 0) {
			kmem_free(iget_p, ipsz);
			return (B_FALSE);
		}
		/* defensive */
		if (iget_p->num_devs != iget_p->num_devs_ret) {
			kmem_free(iget_p, ipsz);
			return (B_FALSE);
		}
	}

	/* See whether any of the returned devices is ours. */
	if (mac_search_intrinfo(iget_p, dln)) {
		kmem_free(iget_p, ipsz);
		return (B_TRUE);
	}
	kmem_free(iget_p, ipsz);
	return (B_FALSE);
}
1728
1729 /*
1730 * Get the interrupts and check each one to see if it is for our device.
1731 */
1732 static int
mac_validate_intr(ldi_handle_t lh,mac_dladm_intr_t * dln,processorid_t cpuid)1733 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
1734 {
1735 pcitool_intr_info_t intr_info;
1736 int err;
1737 int ino;
1738 int oldcpuid;
1739
1740 err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
1741 FKIOCTL, kcred, NULL);
1742 if (err != 0)
1743 return (-1);
1744
1745 for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
1746 for (ino = 0; ino < intr_info.num_intr; ino++) {
1747 if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
1748 if (dln->cpu_id == cpuid)
1749 return (0);
1750 return (1);
1751 }
1752 }
1753 }
1754 return (-1);
1755 }
1756
1757 /*
1758 * Obtain the nexus parent node info. for mdip.
1759 */
static dev_info_t *
mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
{
	struct dev_info *tdip = (struct dev_info *)mdip;
	struct ddi_minor_data *minordata;
	dev_info_t *pdip;
	char pathname[MAXPATHLEN];

	/*
	 * Walk up the tree from mdip looking for an ancestor that
	 * exports an interrupt-control (DDI_NT_INTRCTL) minor node.
	 */
	while (tdip != NULL) {
		/*
		 * The netboot code could call this function while walking the
		 * device tree so we need to use ndi_devi_tryenter() here to
		 * avoid deadlock.
		 */
		if (ndi_devi_tryenter((dev_info_t *)tdip) == 0)
			break;

		for (minordata = tdip->devi_minor; minordata != NULL;
		    minordata = minordata->next) {
			if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
			    strlen(DDI_NT_INTRCTL)) == 0) {
				/* Record the ":intr" path for later ldi open. */
				pdip = minordata->dip;
				(void) ddi_pathname(pdip, pathname);
				(void) snprintf(dln->nexus_path, MAXPATHLEN,
				    "/devices%s:intr", pathname);
				(void) ddi_pathname_minor(minordata, pathname);
				ndi_devi_exit((dev_info_t *)tdip);
				return (pdip);
			}
		}
		ndi_devi_exit((dev_info_t *)tdip);
		tdip = tdip->devi_parent;
	}
	/* No ancestor with an interrupt-control node was found. */
	return (NULL);
}
1795
1796 /*
1797 * For a primary MAC client, if the user has set a list or CPUs or
1798 * we have obtained it implicitly, we try to retarget the interrupt
1799 * for that device on one of the CPUs in the list.
1800 * We assign the interrupt to the same CPU as the poll thread.
1801 */
1802 static boolean_t
mac_check_interrupt_binding(dev_info_t * mdip,int32_t cpuid)1803 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
1804 {
1805 ldi_handle_t lh = NULL;
1806 ldi_ident_t li = NULL;
1807 int err;
1808 int ret;
1809 mac_dladm_intr_t dln;
1810 dev_info_t *dip;
1811 struct ddi_minor_data *minordata;
1812
1813 dln.nexus_path[0] = '\0';
1814 dln.driver_path[0] = '\0';
1815
1816 minordata = ((struct dev_info *)mdip)->devi_minor;
1817 while (minordata != NULL) {
1818 if (minordata->type == DDM_MINOR)
1819 break;
1820 minordata = minordata->next;
1821 }
1822 if (minordata == NULL)
1823 return (B_FALSE);
1824
1825 (void) ddi_pathname_minor(minordata, dln.driver_path);
1826
1827 dip = mac_get_nexus_node(mdip, &dln);
1828 /* defensive */
1829 if (dip == NULL)
1830 return (B_FALSE);
1831
1832 err = ldi_ident_from_major(ddi_driver_major(dip), &li);
1833 if (err != 0)
1834 return (B_FALSE);
1835
1836 err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
1837 if (err != 0)
1838 return (B_FALSE);
1839
1840 ret = mac_validate_intr(lh, &dln, cpuid);
1841 if (ret < 0) {
1842 (void) ldi_close(lh, FREAD|FWRITE, kcred);
1843 return (B_FALSE);
1844 }
1845 /* cmn_note? */
1846 if (ret != 0)
1847 if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
1848 != 0) {
1849 (void) ldi_close(lh, FREAD|FWRITE, kcred);
1850 return (B_FALSE);
1851 }
1852 (void) ldi_close(lh, FREAD|FWRITE, kcred);
1853 return (B_TRUE);
1854 }
1855
1856 void
mac_client_set_intr_cpu(void * arg,mac_client_handle_t mch,int32_t cpuid)1857 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
1858 {
1859 dev_info_t *mdip = (dev_info_t *)arg;
1860 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1861 mac_resource_props_t *mrp;
1862 mac_perim_handle_t mph;
1863 flow_entry_t *flent = mcip->mci_flent;
1864 mac_soft_ring_set_t *rx_srs;
1865 mac_cpus_t *srs_cpu;
1866
1867 if (!mac_check_interrupt_binding(mdip, cpuid))
1868 cpuid = -1;
1869 mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
1870 mrp = MCIP_RESOURCE_PROPS(mcip);
1871 mrp->mrp_rx_intr_cpu = cpuid;
1872 if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
1873 rx_srs = flent->fe_rx_srs[1];
1874 srs_cpu = &rx_srs->srs_cpu;
1875 srs_cpu->mc_rx_intr_cpu = cpuid;
1876 }
1877 mac_perim_exit(mph);
1878 }
1879
1880 int32_t
mac_client_intr_cpu(mac_client_handle_t mch)1881 mac_client_intr_cpu(mac_client_handle_t mch)
1882 {
1883 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1884 mac_cpus_t *srs_cpu;
1885 mac_soft_ring_set_t *rx_srs;
1886 flow_entry_t *flent = mcip->mci_flent;
1887 mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
1888 mac_ring_t *ring;
1889 mac_intr_t *mintr;
1890
1891 /*
1892 * Check if we need to retarget the interrupt. We do this only
1893 * for the primary MAC client. We do this if we have the only
1894 * exclusive ring in the group.
1895 */
1896 if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
1897 rx_srs = flent->fe_rx_srs[1];
1898 srs_cpu = &rx_srs->srs_cpu;
1899 ring = rx_srs->srs_ring;
1900 mintr = &ring->mr_info.mri_intr;
1901 /*
1902 * If ddi_handle is present or the poll CPU is
1903 * already bound to the interrupt CPU, return -1.
1904 */
1905 if (mintr->mi_ddi_handle != NULL ||
1906 ((mrp->mrp_ncpus != 0) &&
1907 (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
1908 return (-1);
1909 }
1910 return (srs_cpu->mc_rx_pollid);
1911 }
1912 return (-1);
1913 }
1914
1915 void *
mac_get_devinfo(mac_handle_t mh)1916 mac_get_devinfo(mac_handle_t mh)
1917 {
1918 mac_impl_t *mip = (mac_impl_t *)mh;
1919
1920 return ((void *)mip->mi_dip);
1921 }
1922
1923 #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
1924 #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
1925 #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
1926
1927 uint64_t
mac_pkt_hash(uint_t media,mblk_t * mp,uint8_t policy,boolean_t is_outbound)1928 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
1929 {
1930 struct ether_header *ehp;
1931 uint64_t hash = 0;
1932 uint16_t sap;
1933 uint_t skip_len;
1934 uint8_t proto;
1935 boolean_t ip_fragmented;
1936
1937 /*
1938 * We may want to have one of these per MAC type plugin in the
1939 * future. For now supports only ethernet.
1940 */
1941 if (media != DL_ETHER)
1942 return (0L);
1943
1944 /* for now we support only outbound packets */
1945 ASSERT(is_outbound);
1946 ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
1947 ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
1948
1949 /* compute L2 hash */
1950
1951 ehp = (struct ether_header *)mp->b_rptr;
1952
1953 if ((policy & MAC_PKT_HASH_L2) != 0) {
1954 uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
1955 uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
1956 hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
1957 policy &= ~MAC_PKT_HASH_L2;
1958 }
1959
1960 if (policy == 0)
1961 goto done;
1962
1963 /* skip ethernet header */
1964
1965 sap = ntohs(ehp->ether_type);
1966 if (sap == ETHERTYPE_VLAN) {
1967 struct ether_vlan_header *evhp;
1968 mblk_t *newmp = NULL;
1969
1970 skip_len = sizeof (struct ether_vlan_header);
1971 if (MBLKL(mp) < skip_len) {
1972 /* the vlan tag is the payload, pull up first */
1973 newmp = msgpullup(mp, -1);
1974 if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
1975 goto done;
1976 }
1977 evhp = (struct ether_vlan_header *)newmp->b_rptr;
1978 } else {
1979 evhp = (struct ether_vlan_header *)mp->b_rptr;
1980 }
1981
1982 sap = ntohs(evhp->ether_type);
1983 freemsg(newmp);
1984 } else {
1985 skip_len = sizeof (struct ether_header);
1986 }
1987
1988 /* if ethernet header is in its own mblk, skip it */
1989 if (MBLKL(mp) <= skip_len) {
1990 skip_len -= MBLKL(mp);
1991 mp = mp->b_cont;
1992 if (mp == NULL)
1993 goto done;
1994 }
1995
1996 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
1997
1998 /* compute IP src/dst addresses hash and skip IPv{4,6} header */
1999
2000 switch (sap) {
2001 case ETHERTYPE_IP: {
2002 ipha_t *iphp;
2003
2004 /*
2005 * If the header is not aligned or the header doesn't fit
2006 * in the mblk, bail now. Note that this may cause packets
2007 * reordering.
2008 */
2009 iphp = (ipha_t *)(mp->b_rptr + skip_len);
2010 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
2011 !OK_32PTR((char *)iphp))
2012 goto done;
2013
2014 proto = iphp->ipha_protocol;
2015 skip_len += IPH_HDR_LENGTH(iphp);
2016
2017 /* Check if the packet is fragmented. */
2018 ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
2019 IPH_OFFSET;
2020
2021 /*
2022 * For fragmented packets, use addresses in addition to
2023 * the frag_id to generate the hash inorder to get
2024 * better distribution.
2025 */
2026 if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
2027 uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
2028 uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
2029
2030 hash ^= (PKT_HASH_4BYTES(ip_src) ^
2031 PKT_HASH_4BYTES(ip_dst));
2032 policy &= ~MAC_PKT_HASH_L3;
2033 }
2034
2035 if (ip_fragmented) {
2036 uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
2037 hash ^= PKT_HASH_2BYTES(identp);
2038 goto done;
2039 }
2040 break;
2041 }
2042 case ETHERTYPE_IPV6: {
2043 ip6_t *ip6hp;
2044 ip6_frag_t *frag = NULL;
2045 uint16_t hdr_length;
2046
2047 /*
2048 * If the header is not aligned or the header doesn't fit
2049 * in the mblk, bail now. Note that this may cause packet
2050 * reordering.
2051 */
2052
2053 ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
2054 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
2055 !OK_32PTR((char *)ip6hp))
2056 goto done;
2057
2058 if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
2059 &proto, &frag))
2060 goto done;
2061 skip_len += hdr_length;
2062
2063 /*
2064 * For fragmented packets, use addresses in addition to
2065 * the frag_id to generate the hash in order to get
2066 * better distribution.
2067 */
2068 if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
2069 uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
2070 uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
2071
2072 hash ^= (PKT_HASH_4BYTES(ip_src) ^
2073 PKT_HASH_4BYTES(ip_dst));
2074 policy &= ~MAC_PKT_HASH_L3;
2075 }
2076
2077 if (frag != NULL) {
2078 uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
2079 hash ^= PKT_HASH_4BYTES(identp);
2080 goto done;
2081 }
2082 break;
2083 }
2084 default:
2085 goto done;
2086 }
2087
2088 if (policy == 0)
2089 goto done;
2090
2091 /* if ip header is in its own mblk, skip it */
2092 if (MBLKL(mp) <= skip_len) {
2093 skip_len -= MBLKL(mp);
2094 mp = mp->b_cont;
2095 if (mp == NULL)
2096 goto done;
2097 }
2098
2099 /* parse ULP header */
2100 again:
2101 switch (proto) {
2102 case IPPROTO_TCP:
2103 case IPPROTO_UDP:
2104 case IPPROTO_ESP:
2105 case IPPROTO_SCTP:
2106 /*
2107 * These Internet Protocols are intentionally designed
2108 * for hashing from the get-go. Port numbers are in the first
2109 * word for transports, SPI is first for ESP.
2110 */
2111 if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
2112 goto done;
2113 hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
2114 break;
2115
2116 case IPPROTO_AH: {
2117 ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
2118 uint_t ah_length = AH_TOTAL_LEN(ah);
2119
2120 if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
2121 goto done;
2122
2123 proto = ah->ah_nexthdr;
2124 skip_len += ah_length;
2125
2126 /* if AH header is in its own mblk, skip it */
2127 if (MBLKL(mp) <= skip_len) {
2128 skip_len -= MBLKL(mp);
2129 mp = mp->b_cont;
2130 if (mp == NULL)
2131 goto done;
2132 }
2133
2134 goto again;
2135 }
2136 }
2137
2138 done:
2139 return (hash);
2140 }
2141