1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2019 Joyent, Inc.
24 * Copyright 2026 Oxide Computer Company
25 */
26
27 /*
28 * MAC Services Module - misc utilities
29 */
30
31 #include <sys/types.h>
32 #include <sys/mac.h>
33 #include <sys/mac_impl.h>
34 #include <sys/mac_client_priv.h>
35 #include <sys/mac_client_impl.h>
36 #include <sys/mac_soft_ring.h>
37 #include <sys/strsubr.h>
38 #include <sys/strsun.h>
39 #include <sys/vlan.h>
40 #include <sys/pattr.h>
41 #include <sys/pci_tools.h>
42 #include <inet/ip.h>
43 #include <inet/ip_impl.h>
44 #include <inet/ip6.h>
45 #include <sys/vtrace.h>
46 #include <sys/dlpi.h>
47 #include <sys/sunndi.h>
48 #include <inet/ipsec_impl.h>
49 #include <inet/sadb.h>
50 #include <inet/ipsecesp.h>
51 #include <inet/ipsecah.h>
52 #include <inet/tcp.h>
53 #include <inet/sctp_ip.h>
54
55 /*
56 * The next two functions are used for dropping packets or chains of
57 * packets, respectively. We could use one function for both but
58 * separating the use cases allows us to specify intent and prevent
59 * dropping more data than intended.
60 *
61 * The purpose of these functions is to aid the debugging effort,
62 * especially in production. Rather than use freemsg()/freemsgchain(),
63 * it's preferable to use these functions when dropping a packet in
64 * the MAC layer. These functions should only be used during
65 * unexpected conditions. That is, any time a packet is dropped
66 * outside of the regular, successful datapath. Consolidating all
67 * drops on these functions allows the user to trace one location and
68 * determine why the packet was dropped based on the msg. It also
69 * allows the user to inspect the packet before it is freed. Finally,
70 * it allows the user to avoid tracing freemsg()/freemsgchain() thus
71 * keeping the hot path running as efficiently as possible.
72 *
73 * NOTE: At this time not all MAC drops are aggregated on these
74 * functions; but that is the plan. This comment should be erased once
75 * completed.
76 */
77
78 /*PRINTFLIKE2*/
79 void
mac_drop_pkt(mblk_t * mp,const char * fmt,...)80 mac_drop_pkt(mblk_t *mp, const char *fmt, ...)
81 {
82 va_list adx;
83 char msg[128];
84 char *msgp = msg;
85
86 ASSERT3P(mp->b_next, ==, NULL);
87
88 va_start(adx, fmt);
89 (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
90 va_end(adx);
91
92 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
93 freemsg(mp);
94 }
95
96 /*PRINTFLIKE2*/
97 void
mac_drop_chain(mblk_t * chain,const char * fmt,...)98 mac_drop_chain(mblk_t *chain, const char *fmt, ...)
99 {
100 va_list adx;
101 char msg[128];
102 char *msgp = msg;
103
104 va_start(adx, fmt);
105 (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
106 va_end(adx);
107
108 /*
109 * We could use freemsgchain() for the actual freeing but
110 * since we are already walking the chain to fire the dtrace
111 * probe we might as well free the msg here too.
112 */
113 for (mblk_t *mp = chain, *next; mp != NULL; ) {
114 next = mp->b_next;
115 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
116 mp->b_next = NULL;
117 freemsg(mp);
118 mp = next;
119 }
120 }
121
122 /*
123 * Copy an mblk, preserving its hardware checksum flags.
124 */
125 static mblk_t *
mac_copymsg_cksum(mblk_t * mp)126 mac_copymsg_cksum(mblk_t *mp)
127 {
128 mblk_t *mp1;
129
130 mp1 = copymsg(mp);
131 if (mp1 == NULL)
132 return (NULL);
133
134 mac_hcksum_clone(mp, mp1);
135
136 return (mp1);
137 }
138
139 /*
140 * Copy an mblk chain, presenting the hardware checksum flags of the
141 * individual mblks.
142 */
143 mblk_t *
mac_copymsgchain_cksum(mblk_t * mp)144 mac_copymsgchain_cksum(mblk_t *mp)
145 {
146 mblk_t *nmp = NULL;
147 mblk_t **nmpp = &nmp;
148
149 for (; mp != NULL; mp = mp->b_next) {
150 if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
151 freemsgchain(nmp);
152 return (NULL);
153 }
154
155 nmpp = &((*nmpp)->b_next);
156 }
157
158 return (nmp);
159 }
160
161 /*
162 * Perform software checksum on a single message, if needed. The emulation
163 * performed is determined by an intersection of the mblk's flags and the emul
164 * flags requested. The emul flags are documented in mac.h.
165 */
166 static mblk_t *
mac_sw_cksum(mblk_t * mp,mac_emul_t emul)167 mac_sw_cksum(mblk_t *mp, mac_emul_t emul)
168 {
169 mac_ether_offload_info_t meoi = { 0 };
170 const char *err = "";
171
172 /*
173 * The only current caller is mac_hw_emul(), which handles any chaining
174 * of mblks prior to now.
175 */
176 VERIFY3P(mp->b_next, ==, NULL);
177
178 uint32_t flags = DB_CKSUMFLAGS(mp);
179
180 /* Why call this if checksum emulation isn't needed? */
181 ASSERT3U(flags & (HCK_FLAGS), !=, 0);
182 /* But also, requesting both ULP cksum types is improper */
183 if ((flags & HCK_FULLCKSUM) != 0 && (flags & HCK_PARTIALCKSUM) != 0) {
184 err = "full and partial ULP cksum requested";
185 goto bail;
186 }
187
188 const boolean_t do_v4_cksum = (emul & MAC_IPCKSUM_EMUL) != 0 &&
189 (flags & HCK_IPV4_HDRCKSUM) != 0;
190 const boolean_t do_ulp_cksum = (emul & MAC_HWCKSUM_EMUL) != 0 &&
191 (flags & (HCK_FULLCKSUM | HCK_PARTIALCKSUM)) != 0;
192 const boolean_t ulp_prefer_partial = (flags & HCK_PARTIALCKSUM) != 0;
193
194 mac_ether_offload_info(mp, &meoi);
195 if ((meoi.meoi_flags & MEOI_L2INFO_SET) == 0 ||
196 (meoi.meoi_l3proto != ETHERTYPE_IP &&
197 meoi.meoi_l3proto != ETHERTYPE_IPV6)) {
198 /* Non-IP traffic (like ARP) is left alone */
199 return (mp);
200 }
201
202 /*
203 * Ensure that requested checksum type(s) are supported by the
204 * protocols encoded in the packet headers.
205 */
206 if (do_v4_cksum) {
207 if (meoi.meoi_l3proto != ETHERTYPE_IP) {
208 err = "IPv4 csum requested on non-IPv4 packet";
209 goto bail;
210 }
211 }
212 if (do_ulp_cksum) {
213 if ((meoi.meoi_flags & MEOI_L4INFO_SET) == 0) {
214 err = "missing ULP header";
215 goto bail;
216 }
217 switch (meoi.meoi_l4proto) {
218 case IPPROTO_TCP:
219 case IPPROTO_UDP:
220 case IPPROTO_ICMP:
221 case IPPROTO_ICMPV6:
222 case IPPROTO_SCTP:
223 break;
224 default:
225 err = "unexpected ULP";
226 goto bail;
227 }
228 }
229
230 /*
231 * If the first mblk of this packet contains only the Ethernet header,
232 * skip past it for now. Packets with their data contained in only a
233 * single mblk can then use the fastpaths tuned to that possibility.
234 */
235 mblk_t *skipped_hdr = NULL;
236 if (MBLKL(mp) == meoi.meoi_l2hlen) {
237 meoi.meoi_len -= meoi.meoi_l2hlen;
238 meoi.meoi_l2hlen = 0;
239 skipped_hdr = mp;
240 mp = mp->b_cont;
241
242 ASSERT(mp != NULL);
243 }
244
245 /*
246 * Ensure that all of the headers we need to access are:
247 * 1. Collected in the first mblk
248 * 2. Held in a data-block which is safe for us to modify
249 * (It must have a refcount of 1)
250 * 3. IP headers are 4-byte aligned. IP header size is always a multiple
251 * of 4 bytes, thus L4 headers will also be safe to access.
252 */
253 const size_t hdr_len_reqd = (meoi.meoi_l2hlen + meoi.meoi_l3hlen) +
254 (do_ulp_cksum ? meoi.meoi_l4hlen : 0);
255 if (MBLKL(mp) < hdr_len_reqd || DB_REF(mp) > 1 ||
256 !OK_32PTR(mp->b_rptr + meoi.meoi_l2hlen)) {
257 const size_t pad_by = (4 - (meoi.meoi_l2hlen % 4)) % 4;
258 mblk_t *hdrmp = msgpullup_pad(mp, hdr_len_reqd, pad_by);
259
260 if (hdrmp == NULL) {
261 err = "could not pullup msg headers";
262 goto bail;
263 }
264
265 mac_hcksum_clone(mp, hdrmp);
266 if (skipped_hdr != NULL) {
267 ASSERT3P(skipped_hdr->b_cont, ==, mp);
268 skipped_hdr->b_cont = hdrmp;
269 }
270 freemsg(mp);
271 mp = hdrmp;
272 }
273
274 /* Calculate IPv4 header checksum, if requested */
275 if (do_v4_cksum) {
276 /*
277 * While unlikely, it's possible to write code that might end up
278 * calling mac_sw_cksum() twice on the same mblk (performing
279 * both LSO and checksum emulation in a single mblk chain loop
280 * -- the LSO emulation inserts a new chain into the existing
281 * chain and then the loop iterates back over the new segments
282 * and emulates the checksum a second time). Normally this
283 * wouldn't be a problem, because the HCK_*_OK flags are
284 * supposed to indicate that we don't need to do peform the
285 * work. But HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the
286 * same value; so we cannot use these flags to determine if the
287 * IP header checksum has already been calculated or not. For
288 * this reason, we zero out the the checksum first. In the
289 * future, we should fix the HCK_* flags.
290 */
291 ipha_t *ipha = (ipha_t *)(mp->b_rptr + meoi.meoi_l2hlen);
292 ipha->ipha_hdr_checksum = 0;
293 ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
294 flags &= ~HCK_IPV4_HDRCKSUM;
295 flags |= HCK_IPV4_HDRCKSUM_OK;
296 }
297
298 /*
299 * The SCTP is different from all the other protocols in that it uses
300 * CRC32 for its checksum, rather than ones' complement.
301 */
302 if (do_ulp_cksum && meoi.meoi_l4proto == IPPROTO_SCTP) {
303 if (ulp_prefer_partial) {
304 err = "SCTP does not support partial checksum";
305 goto bail;
306 }
307
308 const uint_t ulp_off = meoi.meoi_l2hlen + meoi.meoi_l3hlen;
309 sctp_hdr_t *sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_off);
310
311 sctph->sh_chksum = 0;
312 sctph->sh_chksum = sctp_cksum(mp, ulp_off);
313
314 flags &= ~HCK_FULLCKSUM;
315 flags |= HCK_FULLCKSUM_OK;
316 goto success;
317 }
318
319 /* Calculate full ULP checksum, if requested */
320 if (do_ulp_cksum && !ulp_prefer_partial) {
321 /*
322 * Calculate address and length portions of pseudo-header csum
323 */
324 uint32_t cksum = 0;
325 if (meoi.meoi_l3proto == ETHERTYPE_IP) {
326 const ipha_t *ipha =
327 (const ipha_t *)(mp->b_rptr + meoi.meoi_l2hlen);
328 const uint16_t *ipp =
329 (const uint16_t *)(&ipha->ipha_src);
330
331 cksum += ipp[0] + ipp[1] + ipp[2] + ipp[3];
332
333 /*
334 * While it is tempting to calculate the payload length
335 * solely from `meoi`, like as done below for IPv6,
336 * doing so is a trap. Packets shorter than 60 bytes
337 * will get padded out to that length in order to meet
338 * the minimums for Ethernet. Instead, we pull the
339 * length from the IP header.
340 */
341 const uint16_t payload_len =
342 ntohs(ipha->ipha_length) - meoi.meoi_l3hlen;
343 cksum += htons(payload_len);
344 } else if (meoi.meoi_l3proto == ETHERTYPE_IPV6) {
345 const ip6_t *ip6h =
346 (const ip6_t *)(mp->b_rptr + meoi.meoi_l2hlen);
347 const uint16_t *ipp =
348 (const uint16_t *)(&ip6h->ip6_src);
349
350 cksum += ipp[0] + ipp[1] + ipp[2] + ipp[3] +
351 ipp[4] + ipp[5] + ipp[6] + ipp[7];
352 cksum += ipp[8] + ipp[9] + ipp[10] + ipp[11] +
353 ipp[12] + ipp[13] + ipp[14] + ipp[15];
354
355 const uint16_t payload_len = meoi.meoi_len -
356 ((uint16_t)meoi.meoi_l2hlen + meoi.meoi_l3hlen);
357 cksum += htons(payload_len);
358 } else {
359 /*
360 * Since we already checked for recognized L3 protocols
361 * earlier, this should not be reachable.
362 */
363 panic("L3 protocol unexpectedly changed");
364 }
365
366 /* protocol portion of pseudo-header */
367 uint_t cksum_off;
368 switch (meoi.meoi_l4proto) {
369 case IPPROTO_TCP:
370 cksum += IP_TCP_CSUM_COMP;
371 cksum_off = TCP_CHECKSUM_OFFSET;
372 break;
373 case IPPROTO_UDP:
374 cksum += IP_UDP_CSUM_COMP;
375 cksum_off = UDP_CHECKSUM_OFFSET;
376 break;
377 case IPPROTO_ICMP:
378 /* ICMP cksum does not include pseudo-header contents */
379 cksum = 0;
380 cksum_off = ICMP_CHECKSUM_OFFSET;
381 break;
382 case IPPROTO_ICMPV6:
383 cksum += IP_ICMPV6_CSUM_COMP;
384 cksum_off = ICMPV6_CHECKSUM_OFFSET;
385 break;
386 default:
387 err = "unrecognized L4 protocol";
388 goto bail;
389 }
390
391 /*
392 * With IP_CSUM() taking into account the pseudo-header
393 * checksum, make sure the ULP checksum field is zeroed before
394 * computing the rest;
395 */
396 const uint_t l4_off = meoi.meoi_l3hlen + meoi.meoi_l2hlen;
397 uint16_t *up = (uint16_t *)(mp->b_rptr + l4_off + cksum_off);
398 *up = 0;
399 cksum = IP_CSUM(mp, l4_off, cksum);
400
401 if (meoi.meoi_l4proto == IPPROTO_UDP && cksum == 0) {
402 /*
403 * A zero checksum is not allowed on UDPv6, and on UDPv4
404 * implies no checksum. In either case, invert to a
405 * values of all-1s.
406 */
407 *up = 0xffff;
408 } else {
409 *up = cksum;
410 }
411
412 flags &= ~HCK_FULLCKSUM;
413 flags |= HCK_FULLCKSUM_OK;
414 goto success;
415 }
416
417 /* Calculate partial ULP checksum, if requested */
418 if (do_ulp_cksum && ulp_prefer_partial) {
419 uint32_t start, stuff, end, value;
420 mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL);
421
422 ASSERT3S(end, >, start);
423
424 /*
425 * The prior size checks against the header length data ensure
426 * that the mblk contains everything through at least the ULP
427 * header, but if the partial checksum (unexpectedly) requests
428 * its result be stored past that, we cannot continue.
429 */
430 if (stuff + sizeof (uint16_t) > MBLKL(mp)) {
431 err = "partial csum request is out of bounds";
432 goto bail;
433 }
434
435 uchar_t *ipp = (uchar_t *)(mp->b_rptr + meoi.meoi_l2hlen);
436 uint16_t *up = (uint16_t *)(ipp + stuff);
437
438 const uint16_t partial = *up;
439 *up = 0;
440 const uint16_t cksum =
441 ~IP_CSUM_PARTIAL(mp, start + meoi.meoi_l2hlen, partial);
442 *up = cksum != 0 ? cksum : ~cksum;
443
444 flags &= ~HCK_PARTIALCKSUM;
445 flags |= HCK_FULLCKSUM_OK;
446 }
447
448 success:
449 /*
450 * With the checksum(s) calculated, store the updated flags to reflect
451 * the current status, and zero out any of the partial-checksum fields
452 * which would be irrelevant now.
453 */
454 mac_hcksum_set(mp, 0, 0, 0, 0, flags);
455
456 /* Don't forget to reattach the header. */
457 if (skipped_hdr != NULL) {
458 ASSERT3P(skipped_hdr->b_cont, ==, mp);
459
460 /*
461 * Duplicate the HCKSUM data into the header mblk.
462 *
463 * This mimics mac_add_vlan_tag() which ensures that both the
464 * first mblk _and_ the first data bearing mblk possess the
465 * HCKSUM information. Consumers like IP will end up discarding
466 * the ether_header mblk, so for now, it is important that the
467 * data be available in both places.
468 */
469 mac_hcksum_clone(mp, skipped_hdr);
470 mp = skipped_hdr;
471 }
472 return (mp);
473
474 bail:
475 if (skipped_hdr != NULL) {
476 ASSERT3P(skipped_hdr->b_cont, ==, mp);
477 mp = skipped_hdr;
478 }
479
480 mac_drop_pkt(mp, err);
481 return (NULL);
482 }
483
484 /*
485 * Build a single data segment from an LSO packet. The mblk chain
486 * returned, seg_head, represents the data segment and is always
487 * exactly seg_len bytes long. The lso_mp and offset input/output
488 * parameters track our position in the LSO packet. This function
489 * exists solely as a helper to mac_sw_lso().
490 *
491 * Case A
492 *
493 * The current lso_mp is larger than the requested seg_len. The
494 * beginning of seg_head may start at the beginning of lso_mp or
495 * offset into it. In either case, a single mblk is returned, and
496 * *offset is updated to reflect our new position in the current
497 * lso_mp.
498 *
499 * +----------------------------+
500 * | in *lso_mp / out *lso_mp |
501 * +----------------------------+
502 * ^ ^
503 * | |
504 * | |
505 * | |
506 * +------------------------+
507 * | seg_head |
508 * +------------------------+
509 * ^ ^
510 * | |
511 * in *offset = 0 out *offset = seg_len
512 *
513 * |------ seg_len ----|
514 *
515 *
516 * +------------------------------+
517 * | in *lso_mp / out *lso_mp |
518 * +------------------------------+
519 * ^ ^
520 * | |
521 * | |
522 * | |
523 * +------------------------+
524 * | seg_head |
525 * +------------------------+
526 * ^ ^
527 * | |
528 * in *offset = N out *offset = N + seg_len
529 *
530 * |------ seg_len ----|
531 *
532 *
533 *
534 * Case B
535 *
536 * The requested seg_len consumes exactly the rest of the lso_mp.
537 * I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr.
538 * The seg_head may start at the beginning of the lso_mp or at some
539 * offset into it. In either case we return a single mblk, reset
540 * *offset to zero, and walk to the next lso_mp.
541 *
542 * +------------------------+ +------------------------+
543 * | in *lso_mp |---------->| out *lso_mp |
544 * +------------------------+ +------------------------+
545 * ^ ^ ^
546 * | | |
547 * | | out *offset = 0
548 * | |
549 * +------------------------+
550 * | seg_head |
551 * +------------------------+
552 * ^
553 * |
554 * in *offset = 0
555 *
556 * |------ seg_len ----|
557 *
558 *
559 *
560 * +----------------------------+ +------------------------+
561 * | in *lso_mp |---------->| out *lso_mp |
562 * +----------------------------+ +------------------------+
563 * ^ ^ ^
564 * | | |
565 * | | out *offset = 0
566 * | |
567 * +------------------------+
568 * | seg_head |
569 * +------------------------+
570 * ^
571 * |
572 * in *offset = N
573 *
574 * |------ seg_len ----|
575 *
576 *
577 * Case C
578 *
579 * The requested seg_len is greater than the current lso_mp. In
580 * this case we must consume LSO mblks until we have enough data to
581 * satisfy either case (A) or (B) above. We will return multiple
582 * mblks linked via b_cont, offset will be set based on the cases
583 * above, and lso_mp will walk forward at least one mblk, but maybe
584 * more.
585 *
 *   N.B. This diagram is not exhaustive. The seg_head may start on
 *   the beginning of an lso_mp. The seg_tail may end exactly on the
 *   boundary of an lso_mp. And there may be two (in this case the
 *   middle block wouldn't exist), three, or more mblks in the
 *   seg_head chain. This is meant as one example of what might
 *   happen. The main thing to remember is that the seg_tail mblk
 *   must be one of case (A) or (B) above.
593 *
594 * +------------------+ +----------------+ +------------------+
595 * | in *lso_mp |--->| *lso_mp |--->| out *lso_mp |
596 * +------------------+ +----------------+ +------------------+
597 * ^ ^ ^ ^ ^ ^
598 * | | | | | |
599 * | | | | | |
600 * | | | | | |
601 * | | | | | |
602 * +------------+ +----------------+ +------------+
603 * | seg_head |--->| |--->| seg_tail |
604 * +------------+ +----------------+ +------------+
605 * ^ ^
606 * | |
607 * in *offset = N out *offset = MBLKL(seg_tail)
608 *
609 * |------------------- seg_len -------------------|
610 *
611 */
612 static mblk_t *
build_data_seg(mblk_t ** lso_mp,uint32_t * offset,uint32_t seg_len)613 build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len)
614 {
615 mblk_t *seg_head, *seg_tail, *seg_mp;
616
617 ASSERT3P(*lso_mp, !=, NULL);
618 ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr);
619
620 seg_mp = dupb(*lso_mp);
621 if (seg_mp == NULL)
622 return (NULL);
623
624 seg_head = seg_mp;
625 seg_tail = seg_mp;
626
627 /* Continue where we left off from in the lso_mp. */
628 seg_mp->b_rptr += *offset;
629
630 last_mblk:
631 /* Case (A) */
632 if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) {
633 *offset += seg_len;
634 seg_mp->b_wptr = seg_mp->b_rptr + seg_len;
635 return (seg_head);
636 }
637
638 /* Case (B) */
639 if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) {
640 *offset = 0;
641 *lso_mp = (*lso_mp)->b_cont;
642 return (seg_head);
643 }
644
645 /* Case (C) */
646 ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr);
647
648 /*
649 * The current LSO mblk doesn't have enough data to satisfy
650 * seg_len -- continue peeling off LSO mblks to build the new
651 * segment message. If allocation fails we free the previously
652 * allocated segment mblks and return NULL.
653 */
654 while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) {
655 ASSERT3U(MBLKL(seg_mp), <=, seg_len);
656 seg_len -= MBLKL(seg_mp);
657 *offset = 0;
658 *lso_mp = (*lso_mp)->b_cont;
659 seg_mp = dupb(*lso_mp);
660
661 if (seg_mp == NULL) {
662 freemsgchain(seg_head);
663 return (NULL);
664 }
665
666 seg_tail->b_cont = seg_mp;
667 seg_tail = seg_mp;
668 }
669
670 /*
671 * We've walked enough LSO mblks that we can now satisfy the
672 * remaining seg_len. At this point we need to jump back to
673 * determine if we have arrived at case (A) or (B).
674 */
675
676 /* Just to be paranoid that we didn't underflow. */
677 ASSERT3U(seg_len, <, IP_MAXPACKET);
678 ASSERT3U(seg_len, >, 0);
679 goto last_mblk;
680 }
681
/*
 * Perform software segmentation of a single LSO message. Take an LSO
 * message as input and return head/tail pointers as output. This
 * function should not be invoked directly but instead through
 * mac_hw_emul().
 *
 * The resulting chain is comprised of multiple (nsegs) MSS sized
 * segments. Each segment will consist of two or more mblks joined by
 * b_cont: a header and one or more data mblks. The header mblk is
 * allocated anew for each message. The first segment's header is used
 * as a template for the rest with adjustments made for things such as
 * ID, sequence, length, TCP flags, etc. The data mblks reference into
 * the existing LSO mblk (passed in as omp) by way of dupb(). Their
 * b_rptr/b_wptr values are adjusted to reference only the fraction of
 * the LSO message they are responsible for. At the successful
 * completion of this function the original mblk (omp) is freed,
 * leaving the newly created segment chain as the only remaining
 * reference to the data.
 */
701 static void
mac_sw_lso(mblk_t * omp,mac_emul_t emul,mblk_t ** head,mblk_t ** tail,uint_t * count)702 mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail,
703 uint_t *count)
704 {
705 uint32_t ocsum_flags, ocsum_start, ocsum_stuff;
706 uint32_t mss;
707 uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen;
708 uint32_t odatalen, oleft;
709 uint_t nsegs, seg;
710 int len;
711
712 const void *oiph;
713 const tcph_t *otcph;
714 ipha_t *niph;
715 tcph_t *ntcph;
716 uint16_t ip_id;
717 uint32_t tcp_seq, tcp_sum, otcp_sum;
718
719 boolean_t is_v6 = B_FALSE;
720 ip6_t *niph6;
721
722 uint32_t offset = 0;
723 mblk_t *odatamp;
724 mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp;
725 mblk_t *tmptail;
726
727 mac_ether_offload_info_t meoi = { 0 };
728
729 ASSERT3P(head, !=, NULL);
730 ASSERT3P(tail, !=, NULL);
731 ASSERT3P(count, !=, NULL);
732 ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0);
733
734 /* Assume we are dealing with a single LSO message. */
735 ASSERT3P(omp->b_next, ==, NULL);
736
737 mac_ether_offload_info(omp, &meoi);
738 opktlen = meoi.meoi_len;
739 oehlen = meoi.meoi_l2hlen;
740 oiphlen = meoi.meoi_l3hlen;
741 otcphlen = meoi.meoi_l4hlen;
742 ohdrslen = oehlen + oiphlen + otcphlen;
743
744 /* Performing LSO requires that we successfully read fully up to L4 */
745 if ((MEOI_L4INFO_SET & meoi.meoi_flags) == 0) {
746 mac_drop_pkt(omp, "unable to fully parse packet to L4");
747 goto fail;
748 }
749
750 if (meoi.meoi_l3proto != ETHERTYPE_IP &&
751 meoi.meoi_l3proto != ETHERTYPE_IPV6) {
752 mac_drop_pkt(omp, "LSO'd packet has non-IP L3 header: %x",
753 meoi.meoi_l3proto);
754 goto fail;
755 }
756
757 if (meoi.meoi_l4proto != IPPROTO_TCP) {
758 mac_drop_pkt(omp, "LSO unsupported protocol: %x",
759 meoi.meoi_l4proto);
760 goto fail;
761 }
762
763 is_v6 = meoi.meoi_l3proto == ETHERTYPE_IPV6;
764
765 mss = DB_LSOMSS(omp);
766 if (mss == 0) {
767 mac_drop_pkt(omp, "packet misconfigured for LSO (MSS == 0)");
768 goto fail;
769 }
770 ASSERT3U(opktlen, <=, IP_MAXPACKET + oehlen);
771
772 /*
773 * Ensure the headers are contiguous and that L3 and L4 headers are 4B
774 * aligned. The IP header is used only for the benefit of DTrace SDTs,
775 * whereas the TCP header is actively read. This small pullup should
776 * only practically happen when mac_add_vlan_tag is in play, which
777 * prepends a new mblk in front containing the amended Ethernet header.
778 */
779 const size_t pad_by = (4 - (meoi.meoi_l2hlen % 4)) % 4;
780 if (MBLKL(omp) < ohdrslen || !OK_32PTR(omp->b_rptr + oehlen)) {
781 mblk_t *tmp = msgpullup_pad(omp, ohdrslen, pad_by);
782
783 if (tmp == NULL) {
784 mac_drop_pkt(omp, "failed to pull up");
785 goto fail;
786 }
787
788 mac_hcksum_clone(omp, tmp);
789 freemsg(omp);
790 omp = tmp;
791 }
792
793 oiph = (void *)(omp->b_rptr + oehlen);
794 otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen);
795
796 if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) {
797 mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set");
798 goto fail;
799 }
800
801 len = MBLKL(omp);
802
803 /*
804 * Either we have data in the first mblk or it's just the
805 * header. In either case, we need to set rptr to the start of
806 * the TCP data.
807 */
808 if (len > ohdrslen) {
809 odatamp = omp;
810 offset = ohdrslen;
811 } else {
812 ASSERT3U(len, ==, ohdrslen);
813 odatamp = omp->b_cont;
814 offset = 0;
815 }
816
817 /* Make sure we still have enough data. */
818 odatalen = opktlen - ohdrslen;
819 ASSERT3U(msgsize(odatamp), >=, odatalen);
820
821 /*
822 * If a MAC negotiated LSO then it must negotiate both
823 * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or
824 * HCKSUM_INET_PARTIAL; because both the IP and TCP headers
825 * change during LSO segmentation (only the 3 fields of the
826 * pseudo header checksum don't change: src, dst, proto). Thus
827 * we would expect these flags (HCK_IPV4_HDRCKSUM |
828 * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this
829 * function to emulate those checksums in software. However,
830 * that assumes a world where we only expose LSO if the
831 * underlying hardware exposes LSO. Moving forward the plan is
832 * to assume LSO in the upper layers and have MAC perform
833 * software LSO when the underlying provider doesn't support
834 * it. In such a world, if the provider doesn't support LSO
835 * but does support hardware checksum offload, then we could
836 * simply perform the segmentation and allow the hardware to
837 * calculate the checksums. To the hardware it's just another
838 * chain of non-LSO packets.
839 */
840 ASSERT3S(DB_TYPE(omp), ==, M_DATA);
841 ocsum_flags = DB_CKSUMFLAGS(omp);
842 ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0);
843
844 /*
845 * If hardware only provides partial checksum then software
846 * must supply the pseudo-header checksum. In the case of LSO
847 * we leave the TCP length at zero to be filled in by
848 * hardware. This function must handle two scenarios.
849 *
850 * 1. Being called by a MAC client on the Rx path to segment
851 * an LSO packet and calculate the checksum.
852 *
853 * 2. Being called by a MAC provider to segment an LSO packet.
854 * In this case the LSO segmentation is performed in
855 * software (by this routine) but the MAC provider should
856 * still calculate the TCP/IP checksums in hardware.
857 *
858 * To elaborate on the second case: we cannot have the
859 * scenario where IP sends LSO packets but the underlying HW
860 * doesn't support checksum offload -- because in that case
861 * TCP/IP would calculate the checksum in software (for the
862 * LSO packet) but then MAC would segment the packet and have
863 * to redo all the checksum work. So IP should never do LSO
864 * if HW doesn't support both IP and TCP checksum.
865 */
866 if (ocsum_flags & HCK_PARTIALCKSUM) {
867 ocsum_start = (uint32_t)DB_CKSUMSTART(omp);
868 ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp);
869 }
870
871 /*
872 * Subtract one to account for the case where the data length
873 * is evenly divisble by the MSS. Add one to account for the
874 * fact that the division will always result in one less
875 * segment than needed.
876 */
877 nsegs = ((odatalen - 1) / mss) + 1;
878 if (nsegs < 2) {
879 mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs);
880 goto fail;
881 }
882
883 DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph,
884 __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss,
885 uint_t, nsegs);
886
887 seg_chain = NULL;
888 tmptail = seg_chain;
889 oleft = odatalen;
890
891 for (uint_t i = 0; i < nsegs; i++) {
892 boolean_t last_seg = ((i + 1) == nsegs);
893 uint32_t seg_len;
894
895 /*
896 * Ensure that we have 4B L3/L4 alignment for any output frames.
897 * If we fail to allocate, then drop the partially
898 * allocated chain as well as the LSO packet. Let the
899 * sender deal with the fallout.
900 */
901 if ((nhdrmp = allocb(pad_by + ohdrslen, 0)) == NULL) {
902 freemsgchain(seg_chain);
903 mac_drop_pkt(omp, "failed to alloc segment header");
904 goto fail;
905 }
906 ASSERT3P(nhdrmp->b_cont, ==, NULL);
907
908 /* Copy over the header stack. */
909 nhdrmp->b_rptr += pad_by;
910 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
911 bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen);
912
913 if (seg_chain == NULL) {
914 seg_chain = nhdrmp;
915 } else {
916 ASSERT3P(tmptail, !=, NULL);
917 tmptail->b_next = nhdrmp;
918 }
919
920 tmptail = nhdrmp;
921
922 /*
923 * Calculate this segment's length. It's either the MSS
924 * or whatever remains for the last segment.
925 */
926 seg_len = last_seg ? oleft : mss;
927 ASSERT3U(seg_len, <=, mss);
928 ndatamp = build_data_seg(&odatamp, &offset, seg_len);
929
930 if (ndatamp == NULL) {
931 freemsgchain(seg_chain);
932 mac_drop_pkt(omp, "LSO failed to segment data");
933 goto fail;
934 }
935
936 /* Attach data mblk to header mblk. */
937 nhdrmp->b_cont = ndatamp;
938 DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO;
939 ASSERT3U(seg_len, <=, oleft);
940 oleft -= seg_len;
941
942 /* Setup partial checksum offsets. */
943 if (ocsum_flags & HCK_PARTIALCKSUM) {
944 DB_CKSUMSTART(nhdrmp) = ocsum_start;
945 DB_CKSUMEND(nhdrmp) = oiphlen + otcphlen + seg_len;
946 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
947 }
948 }
949
950 /* We should have consumed entire LSO msg. */
951 ASSERT3S(oleft, ==, 0);
952 ASSERT3P(odatamp, ==, NULL);
953
954 /*
955 * All seg data mblks are referenced by the header mblks, null
956 * out this pointer to catch any bad derefs.
957 */
958 ndatamp = NULL;
959
960 /*
961 * Set headers and checksum for first segment.
962 */
963 nhdrmp = seg_chain;
964 ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);
965
966 if (is_v6) {
967 niph6 = (ip6_t *)(nhdrmp->b_rptr + oehlen);
968 niph6->ip6_plen = htons(
969 (oiphlen - IPV6_HDR_LEN) + otcphlen + mss);
970 } else {
971 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
972 niph->ipha_length = htons(oiphlen + otcphlen + mss);
973 /*
974 * If the v4 checksum was filled, we won't have a v4 offload
975 * flag. We can't write zero checksums without inserting said
976 * flag, but our output frames won't necessarily be rechecked by
977 * the caller! As a compromise, we need to force emulation to
978 * uphold the same contracts the packet already agreed to.
979 */
980 if (niph->ipha_hdr_checksum != 0) {
981 emul |= MAC_IPCKSUM_EMUL;
982 ocsum_flags |= HCK_IPV4_HDRCKSUM;
983 }
984 niph->ipha_hdr_checksum = 0;
985 ip_id = ntohs(niph->ipha_ident);
986 }
987
988 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
989 tcp_seq = BE32_TO_U32(ntcph->th_seq);
990 tcp_seq += mss;
991
992 /*
993 * The first segment shouldn't:
994 *
995 * o indicate end of data transmission (FIN),
996 * o indicate immediate handling of the data (PUSH).
997 */
998 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
999 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1000
1001 /*
1002 * If the underlying HW provides partial checksum, then make
1003 * sure to correct the pseudo header checksum before calling
1004 * mac_sw_cksum(). The native TCP stack doesn't include the
1005 * length field in the pseudo header when LSO is in play -- so
1006 * we need to calculate it here.
1007 */
1008 if (ocsum_flags & HCK_PARTIALCKSUM) {
1009 tcp_sum = BE16_TO_U16(ntcph->th_sum);
1010 otcp_sum = tcp_sum;
1011 tcp_sum += mss + otcphlen;
1012 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1013 U16_TO_BE16(tcp_sum, ntcph->th_sum);
1014 }
1015
1016 if ((ocsum_flags & HCK_TX_FLAGS) && (emul & MAC_HWCKSUM_EMULS)) {
1017 next_nhdrmp = nhdrmp->b_next;
1018 nhdrmp->b_next = NULL;
1019 nhdrmp = mac_sw_cksum(nhdrmp, emul);
1020 /*
1021 * The mblk could be replaced (via pull-up) or freed (due to
1022 * failure) during mac_sw_cksum(), so we must take care with the
1023 * result here.
1024 */
1025 if (nhdrmp != NULL) {
1026 nhdrmp->b_next = next_nhdrmp;
1027 next_nhdrmp = NULL;
1028 seg_chain = nhdrmp;
1029 } else {
1030 freemsgchain(next_nhdrmp);
1031 /*
1032 * nhdrmp referenced the head of seg_chain when it was
1033 * freed, so further clean-up there is unnecessary
1034 */
1035 seg_chain = NULL;
1036 mac_drop_pkt(omp, "LSO cksum emulation failed");
1037 goto fail;
1038 }
1039 }
1040
1041 ASSERT3P(nhdrmp, !=, NULL);
1042
1043 seg = 1;
1044 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1045 (is_v6 ? (void *)niph6 : (void *)niph),
1046 __dtrace_tcp_tcph_t *, ntcph, uint_t, mss, int_t, seg);
1047 seg++;
1048
1049 /* There better be at least 2 segs. */
1050 ASSERT3P(nhdrmp->b_next, !=, NULL);
1051 prev_nhdrmp = nhdrmp;
1052 nhdrmp = nhdrmp->b_next;
1053
1054 /*
1055 * Now adjust the headers of the middle segments. For each
1056 * header we need to adjust the following.
1057 *
1058 * o IP ID
1059 * o IP length
1060 * o TCP sequence
1061 * o TCP flags
1062 * o cksum flags
1063 * o cksum values (if MAC_HWCKSUM_EMUL is set)
1064 */
1065 for (; seg < nsegs; seg++) {
1066 /*
1067 * We use seg_chain as a reference to the first seg
1068 * header mblk -- this first header is a template for
1069 * the rest of the segments. This copy will include
1070 * the now updated checksum values from the first
1071 * header. We must reset these checksum values to
1072 * their original to make sure we produce the correct
1073 * value.
1074 */
1075 ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss);
1076 if (is_v6) {
1077 niph6 = (ip6_t *)(nhdrmp->b_rptr + oehlen);
1078 niph6->ip6_plen = htons(
1079 (oiphlen - IPV6_HDR_LEN) + otcphlen + mss);
1080 } else {
1081 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1082 niph->ipha_ident = htons(++ip_id);
1083 niph->ipha_length = htons(oiphlen + otcphlen + mss);
1084 niph->ipha_hdr_checksum = 0;
1085 }
1086 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1087 U32_TO_BE32(tcp_seq, ntcph->th_seq);
1088 tcp_seq += mss;
1089 /*
1090 * Just like the first segment, the middle segments
1091 * shouldn't have these flags set.
1092 */
1093 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
1094 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1095
1096 /*
1097 * First and middle segs have same
1098 * pseudo-header checksum.
1099 */
1100 if (ocsum_flags & HCK_PARTIALCKSUM)
1101 U16_TO_BE16(tcp_sum, ntcph->th_sum);
1102
1103 if ((ocsum_flags & HCK_TX_FLAGS) &&
1104 (emul & MAC_HWCKSUM_EMULS)) {
1105 next_nhdrmp = nhdrmp->b_next;
1106 nhdrmp->b_next = NULL;
1107 nhdrmp = mac_sw_cksum(nhdrmp, emul);
1108 /*
1109 * Like above, handle cases where mac_sw_cksum() does a
1110 * pull-up or drop of the mblk.
1111 */
1112 if (nhdrmp != NULL) {
1113 nhdrmp->b_next = next_nhdrmp;
1114 next_nhdrmp = NULL;
1115 prev_nhdrmp->b_next = nhdrmp;
1116 } else {
1117 freemsgchain(next_nhdrmp);
1118 /*
1119 * Critical to de-link the now-freed nhdrmp
1120 * before freeing the rest of the preceding
1121 * chain.
1122 */
1123 prev_nhdrmp->b_next = NULL;
1124 freemsgchain(seg_chain);
1125 seg_chain = NULL;
1126 mac_drop_pkt(omp, "LSO cksum emulation failed");
1127 goto fail;
1128 }
1129 }
1130
1131 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1132 (is_v6 ? (void *)niph6 : (void *)niph),
1133 __dtrace_tcp_tcph_t *, ntcph, uint_t, mss, uint_t, seg);
1134
1135 ASSERT3P(nhdrmp->b_next, !=, NULL);
1136 prev_nhdrmp = nhdrmp;
1137 nhdrmp = nhdrmp->b_next;
1138 }
1139
1140 /* Make sure we are on the last segment. */
1141 ASSERT3U(seg, ==, nsegs);
1142 ASSERT3P(nhdrmp->b_next, ==, NULL);
1143
1144 /*
1145 * Now we set the last segment header. The difference being
1146 * that FIN/PSH/RST flags are allowed.
1147 */
1148 len = msgsize(nhdrmp->b_cont);
1149 ASSERT3S(len, >, 0);
1150 if (is_v6) {
1151 niph6 = (ip6_t *)(nhdrmp->b_rptr + oehlen);
1152 niph6->ip6_plen = htons(
1153 (oiphlen - IPV6_HDR_LEN) + otcphlen + len);
1154 } else {
1155 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1156 niph->ipha_ident = htons(++ip_id);
1157 niph->ipha_length = htons(oiphlen + otcphlen + len);
1158 niph->ipha_hdr_checksum = 0;
1159 }
1160 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1161 U32_TO_BE32(tcp_seq, ntcph->th_seq);
1162
1163 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1164 if (ocsum_flags & HCK_PARTIALCKSUM) {
1165 tcp_sum = otcp_sum;
1166 tcp_sum += len + otcphlen;
1167 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1168 U16_TO_BE16(tcp_sum, ntcph->th_sum);
1169 }
1170
1171 if ((ocsum_flags & HCK_TX_FLAGS) && (emul & MAC_HWCKSUM_EMULS)) {
1172 /* This should be the last mblk. */
1173 ASSERT3P(nhdrmp->b_next, ==, NULL);
1174 nhdrmp = mac_sw_cksum(nhdrmp, emul);
1175 /*
1176 * If the final mblk happens to be dropped as part of
1177 * mac_sw_cksum(), that is unfortunate, but it need not be a
1178 * show-stopper at this point. We can just pretend that final
1179 * packet was dropped in transit.
1180 */
1181 prev_nhdrmp->b_next = nhdrmp;
1182 }
1183
1184 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1185 (is_v6 ? (void *)niph6 : (void *)niph),
1186 __dtrace_tcp_tcph_t *, ntcph, uint_t, len, uint_t, seg);
1187
	/*
	 * Free the reference to the original LSO message as it is
	 * being replaced by seg_chain.
	 */
1192 freemsg(omp);
1193 *head = seg_chain;
1194 *tail = nhdrmp;
1195 *count = nsegs;
1196 return;
1197
1198 fail:
1199 *head = NULL;
1200 *tail = NULL;
1201 *count = 0;
1202 }
1203
1204 #define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM)
1205
1206 /*
1207 * Emulate various hardware offload features in software. Take a chain
1208 * of packets as input and emulate the hardware features specified in
1209 * 'emul'. The resulting chain's head pointer replaces the 'mp_chain'
1210 * pointer given as input, and its tail pointer is written to
1211 * '*otail'. The number of packets in the new chain is written to
1212 * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus
1213 * may be NULL. The 'mp_chain' argument may point to a NULL chain; in
1214 * which case 'mp_chain' will simply stay a NULL chain.
1215 *
1216 * While unlikely, it is technically possible that this function could
1217 * receive a non-NULL chain as input and return a NULL chain as output
1218 * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be
1219 * zero). This could happen if all the packets in the chain are
1220 * dropped or if we fail to allocate new mblks. In this case, there is
1221 * nothing for the caller to free. In any event, the caller shouldn't
1222 * assume that '*mp_chain' is non-NULL on return.
1223 *
1224 * This function was written with three main use cases in mind.
1225 *
1226 * 1. To emulate hardware offloads when traveling mac-loopback (two
1227 * clients on the same mac). This is wired up in mac_tx_send().
1228 *
1229 * 2. To provide hardware offloads to the client when the underlying
1230 * provider cannot. This is currently wired up in mac_tx() but we
1231 * still only negotiate offloads when the underlying provider
1232 * supports them.
1233 *
1234 * 3. To emulate real hardware in simnet.
1235 */
1236 void
mac_hw_emul(mblk_t ** mp_chain,mblk_t ** otail,uint_t * ocount,mac_emul_t emul)1237 mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul)
1238 {
1239 mblk_t *head = NULL, *tail = NULL;
1240 uint_t count = 0;
1241
1242 ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0);
1243 ASSERT3P(mp_chain, !=, NULL);
1244
1245 for (mblk_t *mp = *mp_chain; mp != NULL; ) {
1246 mblk_t *tmp, *next, *tmphead, *tmptail;
1247 struct ether_header *ehp;
1248 uint32_t flags;
1249 uint_t len = MBLKL(mp), l2len;
1250
1251 /* Perform LSO/cksum one message at a time. */
1252 next = mp->b_next;
1253 mp->b_next = NULL;
1254
1255 /*
1256 * For our sanity the first mblk should contain at
1257 * least the full L2 header.
1258 */
1259 if (len < sizeof (struct ether_header)) {
1260 mac_drop_pkt(mp, "packet too short (A): %u", len);
1261 mp = next;
1262 continue;
1263 }
1264
1265 ehp = (struct ether_header *)mp->b_rptr;
1266 if (ntohs(ehp->ether_type) == VLAN_TPID)
1267 l2len = sizeof (struct ether_vlan_header);
1268 else
1269 l2len = sizeof (struct ether_header);
1270
1271 /*
1272 * If the first mblk is solely the L2 header, then
1273 * there better be more data.
1274 */
1275 if (len < l2len || (len == l2len && mp->b_cont == NULL)) {
1276 mac_drop_pkt(mp, "packet too short (C): %u", len);
1277 mp = next;
1278 continue;
1279 }
1280
1281 DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul);
1282
1283 /*
1284 * We use DB_CKSUMFLAGS (instead of mac_hcksum_get())
1285 * because we don't want to mask-out the LSO flag.
1286 */
1287 flags = DB_CKSUMFLAGS(mp);
1288
1289 if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) {
1290 uint_t tmpcount = 0;
1291
1292 /*
1293 * LSO fix-up handles checksum emulation
1294 * inline (if requested). It also frees mp.
1295 */
1296 mac_sw_lso(mp, emul, &tmphead, &tmptail,
1297 &tmpcount);
1298 if (tmphead == NULL) {
1299 /* mac_sw_lso() freed the mp. */
1300 mp = next;
1301 continue;
1302 }
1303 count += tmpcount;
1304 } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) {
1305 tmp = mac_sw_cksum(mp, emul);
1306 if (tmp == NULL) {
1307 /* mac_sw_cksum() freed the mp. */
1308 mp = next;
1309 continue;
1310 }
1311 tmphead = tmp;
1312 tmptail = tmp;
1313 count++;
1314 } else {
1315 /* There is nothing to emulate. */
1316 tmp = mp;
1317 tmphead = tmp;
1318 tmptail = tmp;
1319 count++;
1320 }
1321
1322 /*
1323 * The tmp mblk chain is either the start of the new
1324 * chain or added to the tail of the new chain.
1325 */
1326 if (head == NULL) {
1327 head = tmphead;
1328 tail = tmptail;
1329 } else {
1330 /* Attach the new mblk to the end of the new chain. */
1331 tail->b_next = tmphead;
1332 tail = tmptail;
1333 }
1334
1335 mp = next;
1336 }
1337
1338 *mp_chain = head;
1339
1340 if (otail != NULL)
1341 *otail = tail;
1342
1343 if (ocount != NULL)
1344 *ocount = count;
1345 }
1346
1347 /*
1348 * Add VLAN tag to the specified mblk.
1349 */
1350 mblk_t *
mac_add_vlan_tag(mblk_t * mp,uint_t pri,uint16_t vid)1351 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
1352 {
1353 mblk_t *hmp;
1354 struct ether_vlan_header *evhp;
1355 struct ether_header *ehp;
1356
1357 ASSERT(pri != 0 || vid != 0);
1358
1359 /*
1360 * Allocate an mblk for the new tagged ethernet header,
1361 * and copy the MAC addresses and ethertype from the
1362 * original header.
1363 */
1364
1365 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
1366 if (hmp == NULL) {
1367 freemsg(mp);
1368 return (NULL);
1369 }
1370
1371 evhp = (struct ether_vlan_header *)hmp->b_rptr;
1372 ehp = (struct ether_header *)mp->b_rptr;
1373
1374 bcopy(ehp, evhp, (ETHERADDRL * 2));
1375 evhp->ether_type = ehp->ether_type;
1376 evhp->ether_tpid = htons(ETHERTYPE_VLAN);
1377
1378 hmp->b_wptr += sizeof (struct ether_vlan_header);
1379 mp->b_rptr += sizeof (struct ether_header);
1380
1381 /*
1382 * Free the original message if it's now empty. Link the
1383 * rest of messages to the header message.
1384 */
1385 mac_hcksum_clone(mp, hmp);
1386 if (MBLKL(mp) == 0) {
1387 hmp->b_cont = mp->b_cont;
1388 freeb(mp);
1389 } else {
1390 hmp->b_cont = mp;
1391 }
1392 ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
1393
1394 /*
1395 * Initialize the new TCI (Tag Control Information).
1396 */
1397 evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
1398
1399 return (hmp);
1400 }
1401
1402 /*
1403 * Adds a VLAN tag with the specified VID and priority to each mblk of
1404 * the specified chain.
1405 */
1406 mblk_t *
mac_add_vlan_tag_chain(mblk_t * mp_chain,uint_t pri,uint16_t vid)1407 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
1408 {
1409 mblk_t *next_mp, **prev, *mp;
1410
1411 mp = mp_chain;
1412 prev = &mp_chain;
1413
1414 while (mp != NULL) {
1415 next_mp = mp->b_next;
1416 mp->b_next = NULL;
1417 if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
1418 freemsgchain(next_mp);
1419 break;
1420 }
1421 *prev = mp;
1422 prev = &mp->b_next;
1423 mp = mp->b_next = next_mp;
1424 }
1425
1426 return (mp_chain);
1427 }
1428
1429 /*
1430 * Strip VLAN tag
1431 */
1432 mblk_t *
mac_strip_vlan_tag(mblk_t * mp)1433 mac_strip_vlan_tag(mblk_t *mp)
1434 {
1435 mblk_t *newmp;
1436 struct ether_vlan_header *evhp;
1437
1438 evhp = (struct ether_vlan_header *)mp->b_rptr;
1439 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
1440 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1441
1442 if (DB_REF(mp) > 1) {
1443 newmp = copymsg(mp);
1444 if (newmp == NULL)
1445 return (NULL);
1446 freemsg(mp);
1447 mp = newmp;
1448 }
1449
1450 evhp = (struct ether_vlan_header *)mp->b_rptr;
1451
1452 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1453 mp->b_rptr += VLAN_TAGSZ;
1454 }
1455 return (mp);
1456 }
1457
1458 /*
1459 * Strip VLAN tag from each mblk of the chain.
1460 */
1461 mblk_t *
mac_strip_vlan_tag_chain(mblk_t * mp_chain)1462 mac_strip_vlan_tag_chain(mblk_t *mp_chain)
1463 {
1464 mblk_t *mp, *next_mp, **prev;
1465
1466 mp = mp_chain;
1467 prev = &mp_chain;
1468
1469 while (mp != NULL) {
1470 next_mp = mp->b_next;
1471 mp->b_next = NULL;
1472 if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
1473 freemsgchain(next_mp);
1474 break;
1475 }
1476 *prev = mp;
1477 prev = &mp->b_next;
1478 mp = mp->b_next = next_mp;
1479 }
1480
1481 return (mp_chain);
1482 }
1483
/*
 * Default callback function. Used when the datapath is not yet initialized.
 *
 * The 'arg', 'resource' and 'loopback' arguments are ignored; every packet
 * on 'mp_chain' is simply freed.
 */
/* ARGSUSED */
void
mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain,
    boolean_t loopback)
{
	freemsgchain(mp_chain);
}
1494
1495 /*
1496 * Determines the IPv6 header length accounting for all the optional IPv6
1497 * headers (hop-by-hop, destination, routing and fragment). The header length
1498 * and next header value (a transport header) is captured.
1499 *
1500 * Returns B_FALSE if all the IP headers are not in the same mblk otherwise
1501 * returns B_TRUE.
1502 */
1503 boolean_t
mac_ip_hdr_length_v6(ip6_t * ip6h,uint8_t * endptr,uint16_t * hdr_length,uint8_t * next_hdr,ip6_frag_t ** fragp)1504 mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
1505 uint8_t *next_hdr, ip6_frag_t **fragp)
1506 {
1507 uint16_t length;
1508 uint_t ehdrlen;
1509 uint8_t *whereptr;
1510 uint8_t *nexthdrp;
1511 ip6_dest_t *desthdr;
1512 ip6_rthdr_t *rthdr;
1513 ip6_frag_t *fraghdr;
1514
1515 if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
1516 return (B_FALSE);
1517 ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
1518 length = IPV6_HDR_LEN;
1519 whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
1520
1521 if (fragp != NULL)
1522 *fragp = NULL;
1523
1524 nexthdrp = &ip6h->ip6_nxt;
1525 while (whereptr < endptr) {
1526 /* Is there enough left for len + nexthdr? */
1527 if (whereptr + MIN_EHDR_LEN > endptr)
1528 break;
1529
1530 switch (*nexthdrp) {
1531 case IPPROTO_HOPOPTS:
1532 case IPPROTO_DSTOPTS:
1533 /* Assumes the headers are identical for hbh and dst */
1534 desthdr = (ip6_dest_t *)whereptr;
1535 ehdrlen = 8 * (desthdr->ip6d_len + 1);
1536 if ((uchar_t *)desthdr + ehdrlen > endptr)
1537 return (B_FALSE);
1538 nexthdrp = &desthdr->ip6d_nxt;
1539 break;
1540 case IPPROTO_ROUTING:
1541 rthdr = (ip6_rthdr_t *)whereptr;
1542 ehdrlen = 8 * (rthdr->ip6r_len + 1);
1543 if ((uchar_t *)rthdr + ehdrlen > endptr)
1544 return (B_FALSE);
1545 nexthdrp = &rthdr->ip6r_nxt;
1546 break;
1547 case IPPROTO_FRAGMENT:
1548 fraghdr = (ip6_frag_t *)whereptr;
1549 ehdrlen = sizeof (ip6_frag_t);
1550 if ((uchar_t *)&fraghdr[1] > endptr)
1551 return (B_FALSE);
1552 nexthdrp = &fraghdr->ip6f_nxt;
1553 if (fragp != NULL)
1554 *fragp = fraghdr;
1555 break;
1556 case IPPROTO_NONE:
1557 /* No next header means we're finished */
1558 default:
1559 *hdr_length = length;
1560 *next_hdr = *nexthdrp;
1561 return (B_TRUE);
1562 }
1563 length += ehdrlen;
1564 whereptr += ehdrlen;
1565 *hdr_length = length;
1566 *next_hdr = *nexthdrp;
1567 }
1568 switch (*nexthdrp) {
1569 case IPPROTO_HOPOPTS:
1570 case IPPROTO_DSTOPTS:
1571 case IPPROTO_ROUTING:
1572 case IPPROTO_FRAGMENT:
1573 /*
1574 * If any know extension headers are still to be processed,
1575 * the packet's malformed (or at least all the IP header(s) are
1576 * not in the same mblk - and that should never happen.
1577 */
1578 return (B_FALSE);
1579
1580 default:
1581 /*
1582 * If we get here, we know that all of the IP headers were in
1583 * the same mblk, even if the ULP header is in the next mblk.
1584 */
1585 *hdr_length = length;
1586 *next_hdr = *nexthdrp;
1587 return (B_TRUE);
1588 }
1589 }
1590
/*
 * The following set of routines are there to take care of interrupt
 * re-targeting for legacy (fixed) interrupts. Some older versions
 * of the popular NICs like e1000g do not support MSI-X interrupts
 * and they reserve fixed interrupts for RX/TX rings. To re-target
 * these interrupts, PCITOOL ioctls need to be used.
 *
 * mac_dladm_intr_t carries the state shared by those routines while
 * they look up, and possibly rebind, the interrupt of one device.
 */
typedef struct mac_dladm_intr {
	/* interrupt number of the matched device (mac_search_intrinfo()) */
	int	ino;
	/* CPU the matched interrupt was reported on (mac_search_intrinfo()) */
	int	cpu_id;
	/* device minor path, as produced by ddi_pathname_minor() */
	char	driver_path[MAXPATHLEN];
	/* "/devices<nexus path>:intr", built by mac_get_nexus_node() */
	char	nexus_path[MAXPATHLEN];
} mac_dladm_intr_t;
1604
1605 /* Bind the interrupt to cpu_num */
1606 static int
mac_set_intr(ldi_handle_t lh,processorid_t cpu_num,int oldcpuid,int ino)1607 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
1608 {
1609 pcitool_intr_set_t iset;
1610 int err;
1611
1612 iset.old_cpu = oldcpuid;
1613 iset.ino = ino;
1614 iset.cpu_id = cpu_num;
1615 iset.user_version = PCITOOL_VERSION;
1616 err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
1617 kcred, NULL);
1618
1619 return (err);
1620 }
1621
1622 /*
1623 * Search interrupt information. iget is filled in with the info to search
1624 */
1625 static boolean_t
mac_search_intrinfo(pcitool_intr_get_t * iget_p,mac_dladm_intr_t * dln)1626 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
1627 {
1628 int i;
1629 char driver_path[2 * MAXPATHLEN];
1630
1631 for (i = 0; i < iget_p->num_devs; i++) {
1632 (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
1633 (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
1634 ":%s%d", iget_p->dev[i].driver_name,
1635 iget_p->dev[i].dev_inst);
1636 /* Match the device path for the device path */
1637 if (strcmp(driver_path, dln->driver_path) == 0) {
1638 dln->ino = iget_p->ino;
1639 dln->cpu_id = iget_p->cpu_id;
1640 return (B_TRUE);
1641 }
1642 }
1643 return (B_FALSE);
1644 }
1645
1646 /*
1647 * Get information about ino, i.e. if this is the interrupt for our
1648 * device and where it is bound etc.
1649 */
1650 static boolean_t
mac_get_single_intr(ldi_handle_t lh,int oldcpuid,int ino,mac_dladm_intr_t * dln)1651 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
1652 mac_dladm_intr_t *dln)
1653 {
1654 pcitool_intr_get_t *iget_p;
1655 int ipsz;
1656 int nipsz;
1657 int err;
1658 uint8_t inum;
1659
1660 /*
1661 * Check if SLEEP is OK, i.e if could come here in response to
1662 * changing the fanout due to some callback from the driver, say
1663 * link speed changes.
1664 */
1665 ipsz = PCITOOL_IGET_SIZE(0);
1666 iget_p = kmem_zalloc(ipsz, KM_SLEEP);
1667
1668 iget_p->num_devs_ret = 0;
1669 iget_p->user_version = PCITOOL_VERSION;
1670 iget_p->cpu_id = oldcpuid;
1671 iget_p->ino = ino;
1672
1673 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
1674 FKIOCTL, kcred, NULL);
1675 if (err != 0) {
1676 kmem_free(iget_p, ipsz);
1677 return (B_FALSE);
1678 }
1679 if (iget_p->num_devs == 0) {
1680 kmem_free(iget_p, ipsz);
1681 return (B_FALSE);
1682 }
1683 inum = iget_p->num_devs;
1684 if (iget_p->num_devs_ret < iget_p->num_devs) {
1685 /* Reallocate */
1686 nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
1687
1688 kmem_free(iget_p, ipsz);
1689 ipsz = nipsz;
1690 iget_p = kmem_zalloc(ipsz, KM_SLEEP);
1691
1692 iget_p->num_devs_ret = inum;
1693 iget_p->cpu_id = oldcpuid;
1694 iget_p->ino = ino;
1695 iget_p->user_version = PCITOOL_VERSION;
1696 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
1697 FKIOCTL, kcred, NULL);
1698 if (err != 0) {
1699 kmem_free(iget_p, ipsz);
1700 return (B_FALSE);
1701 }
1702 /* defensive */
1703 if (iget_p->num_devs != iget_p->num_devs_ret) {
1704 kmem_free(iget_p, ipsz);
1705 return (B_FALSE);
1706 }
1707 }
1708
1709 if (mac_search_intrinfo(iget_p, dln)) {
1710 kmem_free(iget_p, ipsz);
1711 return (B_TRUE);
1712 }
1713 kmem_free(iget_p, ipsz);
1714 return (B_FALSE);
1715 }
1716
1717 /*
1718 * Get the interrupts and check each one to see if it is for our device.
1719 */
1720 static int
mac_validate_intr(ldi_handle_t lh,mac_dladm_intr_t * dln,processorid_t cpuid)1721 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
1722 {
1723 pcitool_intr_info_t intr_info;
1724 int err;
1725 int ino;
1726 int oldcpuid;
1727
1728 err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
1729 FKIOCTL, kcred, NULL);
1730 if (err != 0)
1731 return (-1);
1732
1733 for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
1734 for (ino = 0; ino < intr_info.num_intr; ino++) {
1735 if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
1736 if (dln->cpu_id == cpuid)
1737 return (0);
1738 return (1);
1739 }
1740 }
1741 }
1742 return (-1);
1743 }
1744
1745 /*
1746 * Obtain the nexus parent node info. for mdip.
1747 */
1748 static dev_info_t *
mac_get_nexus_node(dev_info_t * mdip,mac_dladm_intr_t * dln)1749 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
1750 {
1751 struct dev_info *tdip = (struct dev_info *)mdip;
1752 struct ddi_minor_data *minordata;
1753 dev_info_t *pdip;
1754 char pathname[MAXPATHLEN];
1755
1756 while (tdip != NULL) {
1757 /*
1758 * The netboot code could call this function while walking the
1759 * device tree so we need to use ndi_devi_tryenter() here to
1760 * avoid deadlock.
1761 */
1762 if (ndi_devi_tryenter((dev_info_t *)tdip) == 0)
1763 break;
1764
1765 for (minordata = tdip->devi_minor; minordata != NULL;
1766 minordata = minordata->next) {
1767 if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
1768 strlen(DDI_NT_INTRCTL)) == 0) {
1769 pdip = minordata->dip;
1770 (void) ddi_pathname(pdip, pathname);
1771 (void) snprintf(dln->nexus_path, MAXPATHLEN,
1772 "/devices%s:intr", pathname);
1773 (void) ddi_pathname_minor(minordata, pathname);
1774 ndi_devi_exit((dev_info_t *)tdip);
1775 return (pdip);
1776 }
1777 }
1778 ndi_devi_exit((dev_info_t *)tdip);
1779 tdip = tdip->devi_parent;
1780 }
1781 return (NULL);
1782 }
1783
1784 /*
1785 * For a primary MAC client, if the user has set a list or CPUs or
1786 * we have obtained it implicitly, we try to retarget the interrupt
1787 * for that device on one of the CPUs in the list.
1788 * We assign the interrupt to the same CPU as the poll thread.
1789 */
1790 static boolean_t
mac_check_interrupt_binding(dev_info_t * mdip,int32_t cpuid)1791 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
1792 {
1793 ldi_handle_t lh = NULL;
1794 ldi_ident_t li = NULL;
1795 int err;
1796 int ret;
1797 mac_dladm_intr_t dln;
1798 dev_info_t *dip;
1799 struct ddi_minor_data *minordata;
1800
1801 dln.nexus_path[0] = '\0';
1802 dln.driver_path[0] = '\0';
1803
1804 minordata = ((struct dev_info *)mdip)->devi_minor;
1805 while (minordata != NULL) {
1806 if (minordata->type == DDM_MINOR)
1807 break;
1808 minordata = minordata->next;
1809 }
1810 if (minordata == NULL)
1811 return (B_FALSE);
1812
1813 (void) ddi_pathname_minor(minordata, dln.driver_path);
1814
1815 dip = mac_get_nexus_node(mdip, &dln);
1816 /* defensive */
1817 if (dip == NULL)
1818 return (B_FALSE);
1819
1820 err = ldi_ident_from_major(ddi_driver_major(dip), &li);
1821 if (err != 0)
1822 return (B_FALSE);
1823
1824 err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
1825 if (err != 0)
1826 return (B_FALSE);
1827
1828 ret = mac_validate_intr(lh, &dln, cpuid);
1829 if (ret < 0) {
1830 (void) ldi_close(lh, FREAD|FWRITE, kcred);
1831 return (B_FALSE);
1832 }
1833 /* cmn_note? */
1834 if (ret != 0)
1835 if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
1836 != 0) {
1837 (void) ldi_close(lh, FREAD|FWRITE, kcred);
1838 return (B_FALSE);
1839 }
1840 (void) ldi_close(lh, FREAD|FWRITE, kcred);
1841 return (B_TRUE);
1842 }
1843
1844 void
mac_client_set_intr_cpu(void * arg,mac_client_handle_t mch,int32_t cpuid)1845 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
1846 {
1847 dev_info_t *mdip = (dev_info_t *)arg;
1848 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1849 mac_resource_props_t *mrp;
1850 mac_perim_handle_t mph;
1851 flow_entry_t *flent = mcip->mci_flent;
1852 mac_soft_ring_set_t *rx_srs;
1853 mac_cpus_t *srs_cpu;
1854
1855 if (!mac_check_interrupt_binding(mdip, cpuid))
1856 cpuid = -1;
1857 mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
1858 mrp = MCIP_RESOURCE_PROPS(mcip);
1859 mrp->mrp_rx_intr_cpu = cpuid;
1860 if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
1861 rx_srs = flent->fe_rx_srs[1];
1862 srs_cpu = &rx_srs->srs_cpu;
1863 srs_cpu->mc_rx_intr_cpu = cpuid;
1864 }
1865 mac_perim_exit(mph);
1866 }
1867
1868 int32_t
mac_client_intr_cpu(mac_client_handle_t mch)1869 mac_client_intr_cpu(mac_client_handle_t mch)
1870 {
1871 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1872 mac_cpus_t *srs_cpu;
1873 mac_soft_ring_set_t *rx_srs;
1874 flow_entry_t *flent = mcip->mci_flent;
1875 mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
1876 mac_ring_t *ring;
1877 mac_intr_t *mintr;
1878
1879 /*
1880 * Check if we need to retarget the interrupt. We do this only
1881 * for the primary MAC client. We do this if we have the only
1882 * exclusive ring in the group.
1883 */
1884 if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
1885 rx_srs = flent->fe_rx_srs[1];
1886 srs_cpu = &rx_srs->srs_cpu;
1887 ring = rx_srs->srs_ring;
1888 mintr = &ring->mr_info.mri_intr;
1889 /*
1890 * If ddi_handle is present or the poll CPU is
1891 * already bound to the interrupt CPU, return -1.
1892 */
1893 if (mintr->mi_ddi_handle != NULL ||
1894 ((mrp->mrp_ncpus != 0) &&
1895 (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
1896 return (-1);
1897 }
1898 return (srs_cpu->mc_rx_pollid);
1899 }
1900 return (-1);
1901 }
1902
1903 void *
mac_get_devinfo(mac_handle_t mh)1904 mac_get_devinfo(mac_handle_t mh)
1905 {
1906 mac_impl_t *mip = (mac_impl_t *)mh;
1907
1908 return ((void *)mip->mi_dip);
1909 }
1910
1911 #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
1912 #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
1913 #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
1914
1915 uint64_t
mac_pkt_hash(uint_t media,mblk_t * mp,uint8_t policy,boolean_t is_outbound)1916 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
1917 {
1918 struct ether_header *ehp;
1919 uint64_t hash = 0;
1920 uint16_t sap;
1921 uint_t skip_len;
1922 uint8_t proto;
1923 boolean_t ip_fragmented;
1924
1925 /*
1926 * We may want to have one of these per MAC type plugin in the
1927 * future. For now supports only ethernet.
1928 */
1929 if (media != DL_ETHER)
1930 return (0L);
1931
1932 /* for now we support only outbound packets */
1933 ASSERT(is_outbound);
1934 ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
1935 ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
1936
1937 /* compute L2 hash */
1938
1939 ehp = (struct ether_header *)mp->b_rptr;
1940
1941 if ((policy & MAC_PKT_HASH_L2) != 0) {
1942 uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
1943 uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
1944 hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
1945 policy &= ~MAC_PKT_HASH_L2;
1946 }
1947
1948 if (policy == 0)
1949 goto done;
1950
1951 /* skip ethernet header */
1952
1953 sap = ntohs(ehp->ether_type);
1954 if (sap == ETHERTYPE_VLAN) {
1955 struct ether_vlan_header *evhp;
1956 mblk_t *newmp = NULL;
1957
1958 skip_len = sizeof (struct ether_vlan_header);
1959 if (MBLKL(mp) < skip_len) {
1960 /* the vlan tag is the payload, pull up first */
1961 newmp = msgpullup(mp, -1);
1962 if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
1963 goto done;
1964 }
1965 evhp = (struct ether_vlan_header *)newmp->b_rptr;
1966 } else {
1967 evhp = (struct ether_vlan_header *)mp->b_rptr;
1968 }
1969
1970 sap = ntohs(evhp->ether_type);
1971 freemsg(newmp);
1972 } else {
1973 skip_len = sizeof (struct ether_header);
1974 }
1975
1976 /* if ethernet header is in its own mblk, skip it */
1977 if (MBLKL(mp) <= skip_len) {
1978 skip_len -= MBLKL(mp);
1979 mp = mp->b_cont;
1980 if (mp == NULL)
1981 goto done;
1982 }
1983
1984 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
1985
1986 /* compute IP src/dst addresses hash and skip IPv{4,6} header */
1987
1988 switch (sap) {
1989 case ETHERTYPE_IP: {
1990 ipha_t *iphp;
1991
1992 /*
1993 * If the header is not aligned or the header doesn't fit
1994 * in the mblk, bail now. Note that this may cause packets
1995 * reordering.
1996 */
1997 iphp = (ipha_t *)(mp->b_rptr + skip_len);
1998 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
1999 !OK_32PTR((char *)iphp))
2000 goto done;
2001
2002 proto = iphp->ipha_protocol;
2003 skip_len += IPH_HDR_LENGTH(iphp);
2004
2005 /* Check if the packet is fragmented. */
2006 ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
2007 IPH_OFFSET;
2008
2009 /*
2010 * For fragmented packets, use addresses in addition to
2011 * the frag_id to generate the hash inorder to get
2012 * better distribution.
2013 */
2014 if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
2015 uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
2016 uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
2017
2018 hash ^= (PKT_HASH_4BYTES(ip_src) ^
2019 PKT_HASH_4BYTES(ip_dst));
2020 policy &= ~MAC_PKT_HASH_L3;
2021 }
2022
2023 if (ip_fragmented) {
2024 uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
2025 hash ^= PKT_HASH_2BYTES(identp);
2026 goto done;
2027 }
2028 break;
2029 }
2030 case ETHERTYPE_IPV6: {
2031 ip6_t *ip6hp;
2032 ip6_frag_t *frag = NULL;
2033 uint16_t hdr_length;
2034
2035 /*
2036 * If the header is not aligned or the header doesn't fit
2037 * in the mblk, bail now. Note that this may cause packets
2038 * reordering.
2039 */
2040
2041 ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
2042 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
2043 !OK_32PTR((char *)ip6hp))
2044 goto done;
2045
2046 if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
2047 &proto, &frag))
2048 goto done;
2049 skip_len += hdr_length;
2050
2051 /*
2052 * For fragmented packets, use addresses in addition to
2053 * the frag_id to generate the hash inorder to get
2054 * better distribution.
2055 */
2056 if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
2057 uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
2058 uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
2059
2060 hash ^= (PKT_HASH_4BYTES(ip_src) ^
2061 PKT_HASH_4BYTES(ip_dst));
2062 policy &= ~MAC_PKT_HASH_L3;
2063 }
2064
2065 if (frag != NULL) {
2066 uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
2067 hash ^= PKT_HASH_4BYTES(identp);
2068 goto done;
2069 }
2070 break;
2071 }
2072 default:
2073 goto done;
2074 }
2075
2076 if (policy == 0)
2077 goto done;
2078
2079 /* if ip header is in its own mblk, skip it */
2080 if (MBLKL(mp) <= skip_len) {
2081 skip_len -= MBLKL(mp);
2082 mp = mp->b_cont;
2083 if (mp == NULL)
2084 goto done;
2085 }
2086
2087 /* parse ULP header */
2088 again:
2089 switch (proto) {
2090 case IPPROTO_TCP:
2091 case IPPROTO_UDP:
2092 case IPPROTO_ESP:
2093 case IPPROTO_SCTP:
2094 /*
2095 * These Internet Protocols are intentionally designed
2096 * for hashing from the git-go. Port numbers are in the first
2097 * word for transports, SPI is first for ESP.
2098 */
2099 if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
2100 goto done;
2101 hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
2102 break;
2103
2104 case IPPROTO_AH: {
2105 ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
2106 uint_t ah_length = AH_TOTAL_LEN(ah);
2107
2108 if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
2109 goto done;
2110
2111 proto = ah->ah_nexthdr;
2112 skip_len += ah_length;
2113
2114 /* if AH header is in its own mblk, skip it */
2115 if (MBLKL(mp) <= skip_len) {
2116 skip_len -= MBLKL(mp);
2117 mp = mp->b_cont;
2118 if (mp == NULL)
2119 goto done;
2120 }
2121
2122 goto again;
2123 }
2124 }
2125
2126 done:
2127 return (hash);
2128 }
2129