1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2019 Joyent, Inc.
24 * Copyright 2023 Oxide Computer Company
25 */
26
27 /*
28 * MAC Services Module - misc utilities
29 */
30
31 #include <sys/types.h>
32 #include <sys/mac.h>
33 #include <sys/mac_impl.h>
34 #include <sys/mac_client_priv.h>
35 #include <sys/mac_client_impl.h>
36 #include <sys/mac_soft_ring.h>
37 #include <sys/strsubr.h>
38 #include <sys/strsun.h>
39 #include <sys/vlan.h>
40 #include <sys/pattr.h>
41 #include <sys/pci_tools.h>
42 #include <inet/ip.h>
43 #include <inet/ip_impl.h>
44 #include <inet/ip6.h>
45 #include <sys/vtrace.h>
46 #include <sys/dlpi.h>
47 #include <sys/sunndi.h>
48 #include <inet/ipsec_impl.h>
49 #include <inet/sadb.h>
50 #include <inet/ipsecesp.h>
51 #include <inet/ipsecah.h>
52 #include <inet/tcp.h>
53 #include <inet/udp_impl.h>
54 #include <inet/sctp_ip.h>
55
56 /*
57 * The next two functions are used for dropping packets or chains of
58 * packets, respectively. We could use one function for both but
59 * separating the use cases allows us to specify intent and prevent
60 * dropping more data than intended.
61 *
62 * The purpose of these functions is to aid the debugging effort,
63 * especially in production. Rather than use freemsg()/freemsgchain(),
64 * it's preferable to use these functions when dropping a packet in
65 * the MAC layer. These functions should only be used during
66 * unexpected conditions. That is, any time a packet is dropped
67 * outside of the regular, successful datapath. Consolidating all
68 * drops on these functions allows the user to trace one location and
69 * determine why the packet was dropped based on the msg. It also
70 * allows the user to inspect the packet before it is freed. Finally,
71 * it allows the user to avoid tracing freemsg()/freemsgchain() thus
72 * keeping the hot path running as efficiently as possible.
73 *
74 * NOTE: At this time not all MAC drops are aggregated on these
75 * functions; but that is the plan. This comment should be erased once
76 * completed.
77 */
78
79 /*PRINTFLIKE2*/
80 void
mac_drop_pkt(mblk_t * mp,const char * fmt,...)81 mac_drop_pkt(mblk_t *mp, const char *fmt, ...)
82 {
83 va_list adx;
84 char msg[128];
85 char *msgp = msg;
86
87 ASSERT3P(mp->b_next, ==, NULL);
88
89 va_start(adx, fmt);
90 (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
91 va_end(adx);
92
93 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
94 freemsg(mp);
95 }
96
97 /*PRINTFLIKE2*/
98 void
mac_drop_chain(mblk_t * chain,const char * fmt,...)99 mac_drop_chain(mblk_t *chain, const char *fmt, ...)
100 {
101 va_list adx;
102 char msg[128];
103 char *msgp = msg;
104
105 va_start(adx, fmt);
106 (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
107 va_end(adx);
108
109 /*
110 * We could use freemsgchain() for the actual freeing but
111 * since we are already walking the chain to fire the dtrace
112 * probe we might as well free the msg here too.
113 */
114 for (mblk_t *mp = chain, *next; mp != NULL; ) {
115 next = mp->b_next;
116 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
117 mp->b_next = NULL;
118 freemsg(mp);
119 mp = next;
120 }
121 }
122
123 /*
124 * Copy an mblk, preserving its hardware checksum flags.
125 */
126 static mblk_t *
mac_copymsg_cksum(mblk_t * mp)127 mac_copymsg_cksum(mblk_t *mp)
128 {
129 mblk_t *mp1;
130
131 mp1 = copymsg(mp);
132 if (mp1 == NULL)
133 return (NULL);
134
135 mac_hcksum_clone(mp, mp1);
136
137 return (mp1);
138 }
139
140 /*
141 * Copy an mblk chain, presenting the hardware checksum flags of the
142 * individual mblks.
143 */
144 mblk_t *
mac_copymsgchain_cksum(mblk_t * mp)145 mac_copymsgchain_cksum(mblk_t *mp)
146 {
147 mblk_t *nmp = NULL;
148 mblk_t **nmpp = &nmp;
149
150 for (; mp != NULL; mp = mp->b_next) {
151 if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
152 freemsgchain(nmp);
153 return (NULL);
154 }
155
156 nmpp = &((*nmpp)->b_next);
157 }
158
159 return (nmp);
160 }
161
/*
 * Calculate the ULP (TCP, UDP, or SCTP) checksum for IPv4 and store
 * it directly in the packet's checksum field. Return true if the
 * calculation was successful, or false if an error occurred. If the
 * latter, place an error message into '*err'; the packet itself is
 * left untouched for the caller to drop.
 *
 * 'ip_hdr_offset' is the offset of the IP header from the start of
 * the L2 header; 'ipha' must point at the IPv4 header within mp's
 * first mblk.
 */
static boolean_t
mac_sw_cksum_ipv4(mblk_t *mp, uint32_t ip_hdr_offset, ipha_t *ipha,
    const char **err)
{
	const uint8_t proto = ipha->ipha_protocol;
	size_t len;
	const uint32_t ip_hdr_sz = IPH_HDR_LENGTH(ipha);
	/* ULP offset from start of L2. */
	const uint32_t ulp_offset = ip_hdr_offset + ip_hdr_sz;
	ipaddr_t src, dst;
	uint32_t cksum;
	uint16_t *up;

	/*
	 * We need a pointer to the ULP checksum. We're assuming the
	 * ULP checksum pointer resides in the first mblk. Our native
	 * TCP stack should always put the headers in the first mblk,
	 * but currently we have no way to guarantee that other
	 * clients don't spread headers (or even header fields) across
	 * mblks.
	 *
	 * The ASSERT/if pairs below are deliberate: the ASSERT
	 * catches misbehaving clients in DEBUG builds, while the
	 * runtime check lets production builds drop the packet
	 * gracefully instead.
	 */
	switch (proto) {
	case IPPROTO_TCP:
		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) {
			*err = "mblk doesn't contain TCP header";
			goto bail;
		}

		up = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_sz);
		cksum = IP_TCP_CSUM_COMP;
		break;

	case IPPROTO_UDP:
		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) {
			*err = "mblk doesn't contain UDP header";
			goto bail;
		}

		up = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_sz);
		cksum = IP_UDP_CSUM_COMP;
		break;

	case IPPROTO_SCTP: {
		sctp_hdr_t *sctph;

		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) {
			*err = "mblk doesn't contain SCTP header";
			goto bail;
		}

		/*
		 * SCTP's checksum is computed entirely by
		 * sctp_cksum(); there is no pseudo-header
		 * contribution, so we return early here.
		 */
		sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset);
		sctph->sh_chksum = 0;
		sctph->sh_chksum = sctp_cksum(mp, ulp_offset);
		return (B_TRUE);
	}

	default:
		*err = "unexpected protocol";
		goto bail;

	}

	/* Pseudo-header checksum: src addr + dst addr + ULP length. */
	src = ipha->ipha_src;
	dst = ipha->ipha_dst;
	len = ntohs(ipha->ipha_length) - ip_hdr_sz;

	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
	cksum += htons(len);

	/*
	 * We have already accounted for the pseudo checksum above.
	 * Make sure the ULP checksum field is zero before computing
	 * the rest.
	 */
	*up = 0;
	cksum = IP_CSUM(mp, ulp_offset, cksum);
	/* A computed checksum of zero is transmitted as all-ones. */
	*up = (uint16_t)(cksum ? cksum : ~cksum);

	return (B_TRUE);

bail:
	return (B_FALSE);
}
254
/*
 * Calculate the ULP (TCP, UDP, or SCTP) checksum for IPv6 and store
 * it directly in the packet's checksum field. Return true if the
 * calculation was successful, or false if an error occurred. If the
 * latter, place an error message into '*err'; the packet itself is
 * left untouched for the caller to drop.
 *
 * NOTE(review): 'proto' is taken from ip6_nxt directly, so a packet
 * whose first next-header is an extension header (rather than the
 * ULP itself) will land in the default case and be rejected — confirm
 * this matches what callers expect.
 */
static boolean_t
mac_sw_cksum_ipv6(mblk_t *mp, uint32_t ip_hdr_offset, const char **err)
{
	ip6_t *ip6h = (ip6_t *)(mp->b_rptr + ip_hdr_offset);
	const uint8_t proto = ip6h->ip6_nxt;
	const uint16_t *iphs = (uint16_t *)ip6h;
	/* ULP offset from start of L2. */
	uint32_t ulp_offset;
	size_t len;
	uint32_t cksum;
	uint16_t *up;
	uint16_t ip_hdr_sz;

	/*
	 * ip_hdr_sz covers the fixed IPv6 header plus any extension
	 * headers (see the payload-length adjustment below); this
	 * call also validates the header chain.
	 */
	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_sz, NULL)) {
		*err = "malformed IPv6 header";
		goto bail;
	}

	ulp_offset = ip_hdr_offset + ip_hdr_sz;

	/*
	 * We need a pointer to the ULP checksum. We're assuming the
	 * ULP checksum pointer resides in the first mblk. Our native
	 * TCP stack should always put the headers in the first mblk,
	 * but currently we have no way to guarantee that other
	 * clients don't spread headers (or even header fields) across
	 * mblks.
	 *
	 * As in the IPv4 case, the ASSERT/if pairs below panic DEBUG
	 * builds but let production builds drop the packet gracefully.
	 */
	switch (proto) {
	case IPPROTO_TCP:
		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) {
			*err = "mblk doesn't contain TCP header";
			goto bail;
		}

		up = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_sz);
		cksum = IP_TCP_CSUM_COMP;
		break;

	case IPPROTO_UDP:
		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) {
			*err = "mblk doesn't contain UDP header";
			goto bail;
		}

		up = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_sz);
		cksum = IP_UDP_CSUM_COMP;
		break;

	case IPPROTO_SCTP: {
		sctp_hdr_t *sctph;

		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) {
			*err = "mblk doesn't contain SCTP header";
			goto bail;
		}

		sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset);
		/*
		 * Zero out the checksum field to ensure proper
		 * checksum calculation. There is no pseudo-header
		 * contribution for SCTP, so we return early.
		 */
		sctph->sh_chksum = 0;
		sctph->sh_chksum = sctp_cksum(mp, ulp_offset);
		return (B_TRUE);
	}

	default:
		*err = "unexpected protocol";
		goto bail;
	}

	/*
	 * The payload length includes the payload and the IPv6
	 * extension headers; the idea is to subtract the extension
	 * header length to get the real payload length.
	 */
	len = ntohs(ip6h->ip6_plen) - (ip_hdr_sz - IPV6_HDR_LEN);
	cksum += len;

	/*
	 * We accumulate the pseudo header checksum in cksum; then we
	 * call IP_CSUM to compute the checksum over the payload.
	 * Words 4-19 of the IPv6 header are the 32 bytes of source
	 * and destination address.
	 */
	cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + iphs[8] + iphs[9] +
	    iphs[10] + iphs[11] + iphs[12] + iphs[13] + iphs[14] + iphs[15] +
	    iphs[16] + iphs[17] + iphs[18] + iphs[19];
	cksum = IP_CSUM(mp, ulp_offset, cksum);

	/* For UDP/IPv6 a zero UDP checksum is not allowed. Change to 0xffff */
	if (proto == IPPROTO_UDP && cksum == 0)
		cksum = ~cksum;

	*up = (uint16_t)cksum;

	return (B_TRUE);

bail:
	return (B_FALSE);
}
363
/*
 * Perform software checksum on a single message, if needed. The
 * emulation performed is determined by an intersection of the mblk's
 * flags and the emul flags requested. The emul flags are documented
 * in mac.h.
 *
 * Returns the message (possibly replaced via copyb() when the header
 * mblk was shared) on success. On failure the message is freed via
 * mac_drop_pkt() and NULL is returned.
 */
static mblk_t *
mac_sw_cksum(mblk_t *mp, mac_emul_t emul)
{
	mblk_t *skipped_hdr = NULL;
	uint32_t flags, start, stuff, end, value;
	uint32_t ip_hdr_offset;
	uint16_t etype;
	size_t ip_hdr_sz;
	struct ether_header *ehp;
	const char *err = "";

	/*
	 * This function should only be called from mac_hw_emul()
	 * which handles mblk chains and the shared ref case.
	 */
	ASSERT3P(mp->b_next, ==, NULL);

	mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL);

	flags = DB_CKSUMFLAGS(mp);

	/* Why call this if checksum emulation isn't needed? */
	ASSERT3U(flags & (HCK_FLAGS), !=, 0);

	/*
	 * Ethernet, and optionally VLAN header. mac_hw_emul() has
	 * already verified we have enough data to read the L2 header.
	 */
	ehp = (struct ether_header *)mp->b_rptr;
	if (ntohs(ehp->ether_type) == VLAN_TPID) {
		struct ether_vlan_header *evhp;

		evhp = (struct ether_vlan_header *)mp->b_rptr;
		etype = ntohs(evhp->ether_type);
		ip_hdr_offset = sizeof (struct ether_vlan_header);
	} else {
		etype = ntohs(ehp->ether_type);
		ip_hdr_offset = sizeof (struct ether_header);
	}

	/*
	 * If this packet isn't IP, then leave it alone. We don't want
	 * to affect non-IP traffic like ARP. Assume the IP header
	 * doesn't include any options, for now. We will use the
	 * correct size later after we know there are enough bytes to
	 * at least fill out the basic header.
	 */
	switch (etype) {
	case ETHERTYPE_IP:
		ip_hdr_sz = sizeof (ipha_t);
		break;
	case ETHERTYPE_IPV6:
		ip_hdr_sz = sizeof (ip6_t);
		break;
	default:
		return (mp);
	}

	ASSERT3U(MBLKL(mp), >=, ip_hdr_offset);

	/*
	 * If the first mblk of this packet contains only the ethernet
	 * header, skip past it for now. Packets with their data
	 * contained in only a single mblk can then use the fastpaths
	 * tuned to that possibility.
	 *
	 * After this, ip_hdr_offset is relative to the new (data
	 * bearing) mp, i.e. zero.
	 */
	if (MBLKL(mp) == ip_hdr_offset) {
		ip_hdr_offset -= MBLKL(mp);
		/* This is guaranteed by mac_hw_emul(). */
		ASSERT3P(mp->b_cont, !=, NULL);
		skipped_hdr = mp;
		mp = mp->b_cont;
	}

	/*
	 * Both full and partial checksum rely on finding the IP
	 * header in the current mblk. Our native TCP stack honors
	 * this assumption but it's prudent to guard our future
	 * clients that might not honor this contract.
	 */
	ASSERT3U(MBLKL(mp), >=, ip_hdr_offset + ip_hdr_sz);
	if (MBLKL(mp) < (ip_hdr_offset + ip_hdr_sz)) {
		err = "mblk doesn't contain IP header";
		goto bail;
	}

	/*
	 * We are about to modify the header mblk; make sure we are
	 * modifying our own copy. The code that follows assumes that
	 * the IP/ULP headers exist in this mblk (and drops the
	 * message if they don't).
	 */
	if (DB_REF(mp) > 1) {
		mblk_t *tmp = copyb(mp);

		if (tmp == NULL) {
			err = "copyb failed";
			goto bail;
		}

		/* Splice the private copy in place of the shared mblk. */
		if (skipped_hdr != NULL) {
			ASSERT3P(skipped_hdr->b_cont, ==, mp);
			skipped_hdr->b_cont = tmp;
		}

		tmp->b_cont = mp->b_cont;
		freeb(mp);
		mp = tmp;
	}

	if (etype == ETHERTYPE_IP) {
		ipha_t *ipha = (ipha_t *)(mp->b_rptr + ip_hdr_offset);

		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
			if (!mac_sw_cksum_ipv4(mp, ip_hdr_offset, ipha, &err))
				goto bail;
		}

		/* We always update the ULP checksum flags. */
		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
			flags &= ~HCK_FULLCKSUM;
			flags |= HCK_FULLCKSUM_OK;
			value = 0;
		}

		/*
		 * While unlikely, it's possible to write code that
		 * might end up calling mac_sw_cksum() twice on the
		 * same mblk (performing both LSO and checksum
		 * emulation in a single mblk chain loop -- the LSO
		 * emulation inserts a new chain into the existing
		 * chain and then the loop iterates back over the new
		 * segments and emulates the checksum a second time).
		 * Normally this wouldn't be a problem, because the
		 * HCK_*_OK flags are supposed to indicate that we
		 * don't need to perform the work. But
		 * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the
		 * same value; so we cannot use these flags to
		 * determine if the IP header checksum has already
		 * been calculated or not. For this reason, we zero
		 * out the checksum first. In the future, we
		 * should fix the HCK_* flags.
		 */
		if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
			ipha->ipha_hdr_checksum = 0;
			ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
			flags &= ~HCK_IPV4_HDRCKSUM;
			flags |= HCK_IPV4_HDRCKSUM_OK;
		}
	} else if (etype == ETHERTYPE_IPV6) {
		/* There is no IP header checksum for IPv6. */
		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
			if (!mac_sw_cksum_ipv6(mp, ip_hdr_offset, &err))
				goto bail;
			flags &= ~HCK_FULLCKSUM;
			flags |= HCK_FULLCKSUM_OK;
			value = 0;
		}
	}

	/*
	 * Partial checksum is the same for both IPv4 and IPv6.
	 */
	if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
		uint16_t *up, partial, cksum;
		uchar_t *ipp; /* ptr to beginning of IP header */

		ipp = mp->b_rptr + ip_hdr_offset;
		/* 'stuff' is the caller-provided checksum-field offset. */
		up = (uint16_t *)((uchar_t *)ipp + stuff);
		partial = *up;
		*up = 0;

		ASSERT3S(end, >, start);
		cksum = ~IP_CSUM_PARTIAL(mp, ip_hdr_offset + start, partial);
		/* A computed checksum of zero is transmitted as all-ones. */
		*up = cksum != 0 ? cksum : ~cksum;
	}

	/* We always update the ULP checksum flags. */
	if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
		flags &= ~HCK_PARTIALCKSUM;
		flags |= HCK_FULLCKSUM_OK;
		value = 0;
	}

	mac_hcksum_set(mp, start, stuff, end, value, flags);

	/* Don't forget to reattach the header. */
	if (skipped_hdr != NULL) {
		ASSERT3P(skipped_hdr->b_cont, ==, mp);

		/*
		 * Duplicate the HCKSUM data into the header mblk.
		 * This mimics mac_add_vlan_tag which ensures that
		 * both the first mblk _and_ the first data bearing
		 * mblk possess the HCKSUM information. Consumers like
		 * IP will end up discarding the ether_header mblk, so
		 * for now, it is important that the data be available
		 * in both places.
		 */
		mac_hcksum_clone(mp, skipped_hdr);
		mp = skipped_hdr;
	}

	return (mp);

bail:
	/* Reattach the skipped L2 header so the whole packet is freed. */
	if (skipped_hdr != NULL) {
		ASSERT3P(skipped_hdr->b_cont, ==, mp);
		mp = skipped_hdr;
	}

	mac_drop_pkt(mp, err);
	return (NULL);
}
584
585 /*
586 * Build a single data segment from an LSO packet. The mblk chain
587 * returned, seg_head, represents the data segment and is always
588 * exactly seg_len bytes long. The lso_mp and offset input/output
589 * parameters track our position in the LSO packet. This function
590 * exists solely as a helper to mac_sw_lso().
591 *
592 * Case A
593 *
594 * The current lso_mp is larger than the requested seg_len. The
595 * beginning of seg_head may start at the beginning of lso_mp or
596 * offset into it. In either case, a single mblk is returned, and
597 * *offset is updated to reflect our new position in the current
598 * lso_mp.
599 *
600 * +----------------------------+
601 * | in *lso_mp / out *lso_mp |
602 * +----------------------------+
603 * ^ ^
604 * | |
605 * | |
606 * | |
607 * +------------------------+
608 * | seg_head |
609 * +------------------------+
610 * ^ ^
611 * | |
612 * in *offset = 0 out *offset = seg_len
613 *
614 * |------ seg_len ----|
615 *
616 *
617 * +------------------------------+
618 * | in *lso_mp / out *lso_mp |
619 * +------------------------------+
620 * ^ ^
621 * | |
622 * | |
623 * | |
624 * +------------------------+
625 * | seg_head |
626 * +------------------------+
627 * ^ ^
628 * | |
629 * in *offset = N out *offset = N + seg_len
630 *
631 * |------ seg_len ----|
632 *
633 *
634 *
635 * Case B
636 *
637 * The requested seg_len consumes exactly the rest of the lso_mp.
638 * I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr.
639 * The seg_head may start at the beginning of the lso_mp or at some
640 * offset into it. In either case we return a single mblk, reset
641 * *offset to zero, and walk to the next lso_mp.
642 *
643 * +------------------------+ +------------------------+
644 * | in *lso_mp |---------->| out *lso_mp |
645 * +------------------------+ +------------------------+
646 * ^ ^ ^
647 * | | |
648 * | | out *offset = 0
649 * | |
650 * +------------------------+
651 * | seg_head |
652 * +------------------------+
653 * ^
654 * |
655 * in *offset = 0
656 *
657 * |------ seg_len ----|
658 *
659 *
660 *
661 * +----------------------------+ +------------------------+
662 * | in *lso_mp |---------->| out *lso_mp |
663 * +----------------------------+ +------------------------+
664 * ^ ^ ^
665 * | | |
666 * | | out *offset = 0
667 * | |
668 * +------------------------+
669 * | seg_head |
670 * +------------------------+
671 * ^
672 * |
673 * in *offset = N
674 *
675 * |------ seg_len ----|
676 *
677 *
678 * Case C
679 *
680 * The requested seg_len is greater than the current lso_mp. In
681 * this case we must consume LSO mblks until we have enough data to
682 * satisfy either case (A) or (B) above. We will return multiple
683 * mblks linked via b_cont, offset will be set based on the cases
684 * above, and lso_mp will walk forward at least one mblk, but maybe
685 * more.
686 *
 * N.B. This diagram is not exhaustive. The seg_head may start on
688 * the beginning of an lso_mp. The seg_tail may end exactly on the
689 * boundary of an lso_mp. And there may be two (in this case the
690 * middle block wouldn't exist), three, or more mblks in the
691 * seg_head chain. This is meant as one example of what might
692 * happen. The main thing to remember is that the seg_tail mblk
693 * must be one of case (A) or (B) above.
694 *
695 * +------------------+ +----------------+ +------------------+
696 * | in *lso_mp |--->| *lso_mp |--->| out *lso_mp |
697 * +------------------+ +----------------+ +------------------+
698 * ^ ^ ^ ^ ^ ^
699 * | | | | | |
700 * | | | | | |
701 * | | | | | |
702 * | | | | | |
703 * +------------+ +----------------+ +------------+
704 * | seg_head |--->| |--->| seg_tail |
705 * +------------+ +----------------+ +------------+
706 * ^ ^
707 * | |
708 * in *offset = N out *offset = MBLKL(seg_tail)
709 *
710 * |------------------- seg_len -------------------|
711 *
712 */
static mblk_t *
build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len)
{
	mblk_t *seg_head, *seg_tail, *seg_mp;

	ASSERT3P(*lso_mp, !=, NULL);
	/* The current read position must lie strictly inside *lso_mp. */
	ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr);

	/*
	 * dupb() shares the underlying data buffer; we only adjust
	 * the copy's read/write pointers to carve out our slice.
	 */
	seg_mp = dupb(*lso_mp);
	if (seg_mp == NULL)
		return (NULL);

	seg_head = seg_mp;
	seg_tail = seg_mp;

	/* Continue where we left off from in the lso_mp. */
	seg_mp->b_rptr += *offset;

last_mblk:
	/* Case (A): segment ends strictly inside the current mblk. */
	if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) {
		*offset += seg_len;
		seg_mp->b_wptr = seg_mp->b_rptr + seg_len;
		return (seg_head);
	}

	/* Case (B): segment consumes exactly the rest of the mblk. */
	if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) {
		*offset = 0;
		*lso_mp = (*lso_mp)->b_cont;
		return (seg_head);
	}

	/* Case (C): segment spans into subsequent mblks. */
	ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr);

	/*
	 * The current LSO mblk doesn't have enough data to satisfy
	 * seg_len -- continue peeling off LSO mblks to build the new
	 * segment message. If allocation fails we free the previously
	 * allocated segment mblks and return NULL.
	 *
	 * NOTE(review): this assumes the LSO chain still holds at
	 * least seg_len bytes — advancing past the end would hand
	 * dupb() a NULL b_cont. Callers appear to guarantee this via
	 * their data-length accounting; confirm at call sites.
	 */
	while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) {
		ASSERT3U(MBLKL(seg_mp), <=, seg_len);
		seg_len -= MBLKL(seg_mp);
		*offset = 0;
		*lso_mp = (*lso_mp)->b_cont;
		seg_mp = dupb(*lso_mp);

		if (seg_mp == NULL) {
			freemsgchain(seg_head);
			return (NULL);
		}

		seg_tail->b_cont = seg_mp;
		seg_tail = seg_mp;
	}

	/*
	 * We've walked enough LSO mblks that we can now satisfy the
	 * remaining seg_len. At this point we need to jump back to
	 * determine if we have arrived at case (A) or (B).
	 */

	/* Just to be paranoid that we didn't underflow. */
	ASSERT3U(seg_len, <, IP_MAXPACKET);
	ASSERT3U(seg_len, >, 0);
	goto last_mblk;
}
782
783 /*
784 * Perform software segmentation of a single LSO message. Take an LSO
785 * message as input and return head/tail pointers as output. This
786 * function should not be invoked directly but instead through
787 * mac_hw_emul().
788 *
789 * The resulting chain is comprised of multiple (nsegs) MSS sized
790 * segments. Each segment will consist of two or more mblks joined by
791 * b_cont: a header and one or more data mblks. The header mblk is
792 * allocated anew for each message. The first segment's header is used
793 * as a template for the rest with adjustments made for things such as
794 * ID, sequence, length, TCP flags, etc. The data mblks reference into
795 * the existing LSO mblk (passed in as omp) by way of dupb(). Their
796 * b_rptr/b_wptr values are adjusted to reference only the fraction of
797 * the LSO message they are responsible for. At the successful
798 * completion of this function the original mblk (omp) is freed,
 * leaving the newly created segment chain as the only remaining
800 * reference to the data.
801 */
802 static void
mac_sw_lso(mblk_t * omp,mac_emul_t emul,mblk_t ** head,mblk_t ** tail,uint_t * count)803 mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail,
804 uint_t *count)
805 {
806 uint32_t ocsum_flags, ocsum_start, ocsum_stuff;
807 uint32_t mss;
808 uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen;
809 uint32_t oleft;
810 uint_t nsegs, seg;
811 int len;
812
813 struct ether_vlan_header *oevh;
814 const ipha_t *oiph;
815 const tcph_t *otcph;
816 ipha_t *niph;
817 tcph_t *ntcph;
818 uint16_t ip_id;
819 uint32_t tcp_seq, tcp_sum, otcp_sum;
820
821 uint32_t offset;
822 mblk_t *odatamp;
823 mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp;
824 mblk_t *tmptail;
825
826 ASSERT3P(head, !=, NULL);
827 ASSERT3P(tail, !=, NULL);
828 ASSERT3P(count, !=, NULL);
829 ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0);
830
831 /* Assume we are dealing with a single LSO message. */
832 ASSERT3P(omp->b_next, ==, NULL);
833
834 /*
835 * XXX: This is a hack to deal with mac_add_vlan_tag().
836 *
837 * When VLANs are in play, mac_add_vlan_tag() creates a new
838 * mblk with just the ether_vlan_header and tacks it onto the
839 * front of 'omp'. This breaks the assumptions made below;
840 * namely that the TCP/IP headers are in the first mblk. In
841 * this case, since we already have to pay the cost of LSO
842 * emulation, we simply pull up everything. While this might
843 * seem irksome, keep in mind this will only apply in a couple
844 * of scenarios: a) an LSO-capable VLAN client sending to a
845 * non-LSO-capable client over the "MAC/bridge loopback"
846 * datapath or b) an LSO-capable VLAN client is sending to a
847 * client that, for whatever reason, doesn't have DLS-bypass
848 * enabled. Finally, we have to check for both a tagged and
849 * untagged sized mblk depending on if the mblk came via
850 * mac_promisc_dispatch() or mac_rx_deliver().
851 *
852 * In the future, two things should be done:
853 *
854 * 1. This function should make use of some yet to be
855 * implemented "mblk helpers". These helper functions would
856 * perform all the b_cont walking for us and guarantee safe
857 * access to the mblk data.
858 *
859 * 2. We should add some slop to the mblks so that
860 * mac_add_vlan_tag() can just edit the first mblk instead
861 * of allocating on the hot path.
862 */
863 if (MBLKL(omp) == sizeof (struct ether_vlan_header) ||
864 MBLKL(omp) == sizeof (struct ether_header)) {
865 mblk_t *tmp = msgpullup(omp, -1);
866
867 if (tmp == NULL) {
868 mac_drop_pkt(omp, "failed to pull up");
869 goto fail;
870 }
871
872 mac_hcksum_clone(omp, tmp);
873 freemsg(omp);
874 omp = tmp;
875 }
876
877 mss = DB_LSOMSS(omp);
878 ASSERT3U(msgsize(omp), <=, IP_MAXPACKET +
879 sizeof (struct ether_vlan_header));
880 opktlen = msgsize(omp);
881
882 /*
883 * First, get references to the IP and TCP headers and
884 * determine the total TCP length (header + data).
885 *
886 * Thanks to mac_hw_emul() we know that the first mblk must
887 * contain (at minimum) the full L2 header. However, this
888 * function assumes more than that. It assumes the L2/L3/L4
889 * headers are all contained in the first mblk of a message
890 * (i.e., no b_cont walking for headers). While this is a
891 * current reality (our native TCP stack and viona both
892 * enforce this) things may become more nuanced in the future
893 * (e.g. when introducing encap support or adding new
894 * clients). For now we guard against this case by dropping
895 * the packet.
896 */
897 oevh = (struct ether_vlan_header *)omp->b_rptr;
898 if (oevh->ether_tpid == htons(ETHERTYPE_VLAN))
899 oehlen = sizeof (struct ether_vlan_header);
900 else
901 oehlen = sizeof (struct ether_header);
902
903 ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t)));
904 if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) {
905 mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers");
906 goto fail;
907 }
908
909 oiph = (ipha_t *)(omp->b_rptr + oehlen);
910 oiphlen = IPH_HDR_LENGTH(oiph);
911 otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen);
912 otcphlen = TCP_HDR_LENGTH(otcph);
913
914 /*
915 * Currently we only support LSO for TCP/IPv4.
916 */
917 if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) {
918 mac_drop_pkt(omp, "LSO unsupported IP version: %uhh",
919 IPH_HDR_VERSION(oiph));
920 goto fail;
921 }
922
923 if (oiph->ipha_protocol != IPPROTO_TCP) {
924 mac_drop_pkt(omp, "LSO unsupported protocol: %uhh",
925 oiph->ipha_protocol);
926 goto fail;
927 }
928
929 if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) {
930 mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set");
931 goto fail;
932 }
933
934 ohdrslen = oehlen + oiphlen + otcphlen;
935 if ((len = MBLKL(omp)) < ohdrslen) {
936 mac_drop_pkt(omp, "LSO packet too short: %d < %u", len,
937 ohdrslen);
938 goto fail;
939 }
940
941 /*
942 * Either we have data in the first mblk or it's just the
943 * header. In either case, we need to set rptr to the start of
944 * the TCP data.
945 */
946 if (len > ohdrslen) {
947 odatamp = omp;
948 offset = ohdrslen;
949 } else {
950 ASSERT3U(len, ==, ohdrslen);
951 odatamp = omp->b_cont;
952 offset = 0;
953 }
954
955 /* Make sure we still have enough data. */
956 ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen);
957
958 /*
959 * If a MAC negotiated LSO then it must negotioate both
960 * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or
961 * HCKSUM_INET_PARTIAL; because both the IP and TCP headers
962 * change during LSO segmentation (only the 3 fields of the
963 * pseudo header checksum don't change: src, dst, proto). Thus
964 * we would expect these flags (HCK_IPV4_HDRCKSUM |
965 * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this
966 * function to emulate those checksums in software. However,
967 * that assumes a world where we only expose LSO if the
968 * underlying hardware exposes LSO. Moving forward the plan is
969 * to assume LSO in the upper layers and have MAC perform
970 * software LSO when the underlying provider doesn't support
971 * it. In such a world, if the provider doesn't support LSO
972 * but does support hardware checksum offload, then we could
973 * simply perform the segmentation and allow the hardware to
974 * calculate the checksums. To the hardware it's just another
975 * chain of non-LSO packets.
976 */
977 ASSERT3S(DB_TYPE(omp), ==, M_DATA);
978 ocsum_flags = DB_CKSUMFLAGS(omp);
979 ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0);
980 ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0);
981
982 /*
983 * If hardware only provides partial checksum then software
984 * must supply the pseudo-header checksum. In the case of LSO
985 * we leave the TCP length at zero to be filled in by
986 * hardware. This function must handle two scenarios.
987 *
988 * 1. Being called by a MAC client on the Rx path to segment
989 * an LSO packet and calculate the checksum.
990 *
991 * 2. Being called by a MAC provider to segment an LSO packet.
992 * In this case the LSO segmentation is performed in
993 * software (by this routine) but the MAC provider should
994 * still calculate the TCP/IP checksums in hardware.
995 *
996 * To elaborate on the second case: we cannot have the
997 * scenario where IP sends LSO packets but the underlying HW
998 * doesn't support checksum offload -- because in that case
999 * TCP/IP would calculate the checksum in software (for the
1000 * LSO packet) but then MAC would segment the packet and have
1001 * to redo all the checksum work. So IP should never do LSO
1002 * if HW doesn't support both IP and TCP checksum.
1003 */
1004 if (ocsum_flags & HCK_PARTIALCKSUM) {
1005 ocsum_start = (uint32_t)DB_CKSUMSTART(omp);
1006 ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp);
1007 }
1008
1009 odatalen = opktlen - ohdrslen;
1010
	/*
	 * Subtract one to account for the case where the data length
	 * is evenly divisible by the MSS. Add one to account for the
	 * fact that the division will always result in one less
	 * segment than needed.
	 */
1017 nsegs = ((odatalen - 1) / mss) + 1;
1018 if (nsegs < 2) {
1019 mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs);
1020 goto fail;
1021 }
1022
1023 DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph,
1024 __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t,
1025 nsegs);
1026
1027 seg_chain = NULL;
1028 tmptail = seg_chain;
1029 oleft = odatalen;
1030
1031 for (uint_t i = 0; i < nsegs; i++) {
1032 boolean_t last_seg = ((i + 1) == nsegs);
1033 uint32_t seg_len;
1034
1035 /*
1036 * If we fail to allocate, then drop the partially
1037 * allocated chain as well as the LSO packet. Let the
1038 * sender deal with the fallout.
1039 */
1040 if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) {
1041 freemsgchain(seg_chain);
1042 mac_drop_pkt(omp, "failed to alloc segment header");
1043 goto fail;
1044 }
1045 ASSERT3P(nhdrmp->b_cont, ==, NULL);
1046
1047 if (seg_chain == NULL) {
1048 seg_chain = nhdrmp;
1049 } else {
1050 ASSERT3P(tmptail, !=, NULL);
1051 tmptail->b_next = nhdrmp;
1052 }
1053
1054 tmptail = nhdrmp;
1055
		/*
		 * Calculate this segment's length. It's either the MSS
		 * or whatever remains for the last segment.
		 */
1060 seg_len = last_seg ? oleft : mss;
1061 ASSERT3U(seg_len, <=, mss);
1062 ndatamp = build_data_seg(&odatamp, &offset, seg_len);
1063
1064 if (ndatamp == NULL) {
1065 freemsgchain(seg_chain);
1066 mac_drop_pkt(omp, "LSO failed to segment data");
1067 goto fail;
1068 }
1069
1070 /* Attach data mblk to header mblk. */
1071 nhdrmp->b_cont = ndatamp;
1072 DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO;
1073 ASSERT3U(seg_len, <=, oleft);
1074 oleft -= seg_len;
1075 }
1076
1077 /* We should have consumed entire LSO msg. */
1078 ASSERT3S(oleft, ==, 0);
1079 ASSERT3P(odatamp, ==, NULL);
1080
1081 /*
1082 * All seg data mblks are referenced by the header mblks, null
1083 * out this pointer to catch any bad derefs.
1084 */
1085 ndatamp = NULL;
1086
1087 /*
1088 * Set headers and checksum for first segment.
1089 */
1090 nhdrmp = seg_chain;
1091 bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen);
1092 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1093 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1094 ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);
1095 niph->ipha_length = htons(oiphlen + otcphlen + mss);
1096 niph->ipha_hdr_checksum = 0;
1097 ip_id = ntohs(niph->ipha_ident);
1098 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1099 tcp_seq = BE32_TO_U32(ntcph->th_seq);
1100 tcp_seq += mss;
1101
1102 /*
1103 * The first segment shouldn't:
1104 *
1105 * o indicate end of data transmission (FIN),
1106 * o indicate immediate handling of the data (PUSH).
1107 */
1108 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
1109 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1110
1111 /*
1112 * If the underlying HW provides partial checksum, then make
1113 * sure to correct the pseudo header checksum before calling
1114 * mac_sw_cksum(). The native TCP stack doesn't include the
1115 * length field in the pseudo header when LSO is in play -- so
1116 * we need to calculate it here.
1117 */
1118 if (ocsum_flags & HCK_PARTIALCKSUM) {
1119 DB_CKSUMSTART(nhdrmp) = ocsum_start;
1120 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1121 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1122 tcp_sum = BE16_TO_U16(ntcph->th_sum);
1123 otcp_sum = tcp_sum;
1124 tcp_sum += mss + otcphlen;
1125 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1126 U16_TO_BE16(tcp_sum, ntcph->th_sum);
1127 }
1128
1129 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1130 (emul & MAC_HWCKSUM_EMULS)) {
1131 next_nhdrmp = nhdrmp->b_next;
1132 nhdrmp->b_next = NULL;
1133 nhdrmp = mac_sw_cksum(nhdrmp, emul);
1134 nhdrmp->b_next = next_nhdrmp;
1135 next_nhdrmp = NULL;
1136
1137 /*
1138 * We may have freed the nhdrmp argument during
1139 * checksum emulation, make sure that seg_chain
1140 * references a valid mblk.
1141 */
1142 seg_chain = nhdrmp;
1143 }
1144
1145 ASSERT3P(nhdrmp, !=, NULL);
1146
1147 seg = 1;
1148 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1149 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1150 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss,
1151 uint_t, seg);
1152 seg++;
1153
1154 /* There better be at least 2 segs. */
1155 ASSERT3P(nhdrmp->b_next, !=, NULL);
1156 prev_nhdrmp = nhdrmp;
1157 nhdrmp = nhdrmp->b_next;
1158
1159 /*
1160 * Now adjust the headers of the middle segments. For each
1161 * header we need to adjust the following.
1162 *
1163 * o IP ID
1164 * o IP length
1165 * o TCP sequence
1166 * o TCP flags
1167 * o cksum flags
1168 * o cksum values (if MAC_HWCKSUM_EMUL is set)
1169 */
1170 for (; seg < nsegs; seg++) {
1171 /*
1172 * We use seg_chain as a reference to the first seg
1173 * header mblk -- this first header is a template for
1174 * the rest of the segments. This copy will include
1175 * the now updated checksum values from the first
1176 * header. We must reset these checksum values to
1177 * their original to make sure we produce the correct
1178 * value.
1179 */
1180 bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
1181 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1182 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1183 niph->ipha_ident = htons(++ip_id);
1184 ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss);
1185 niph->ipha_length = htons(oiphlen + otcphlen + mss);
1186 niph->ipha_hdr_checksum = 0;
1187 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1188 U32_TO_BE32(tcp_seq, ntcph->th_seq);
1189 tcp_seq += mss;
1190 /*
1191 * Just like the first segment, the middle segments
1192 * shouldn't have these flags set.
1193 */
1194 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
1195 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1196
1197 if (ocsum_flags & HCK_PARTIALCKSUM) {
1198 /*
1199 * First and middle segs have same
1200 * pseudo-header checksum.
1201 */
1202 U16_TO_BE16(tcp_sum, ntcph->th_sum);
1203 DB_CKSUMSTART(nhdrmp) = ocsum_start;
1204 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1205 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1206 }
1207
1208 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1209 (emul & MAC_HWCKSUM_EMULS)) {
1210 next_nhdrmp = nhdrmp->b_next;
1211 nhdrmp->b_next = NULL;
1212 nhdrmp = mac_sw_cksum(nhdrmp, emul);
1213 nhdrmp->b_next = next_nhdrmp;
1214 next_nhdrmp = NULL;
1215 /* We may have freed the original nhdrmp. */
1216 prev_nhdrmp->b_next = nhdrmp;
1217 }
1218
1219 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1220 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1221 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen),
1222 uint_t, mss, uint_t, seg);
1223
1224 ASSERT3P(nhdrmp->b_next, !=, NULL);
1225 prev_nhdrmp = nhdrmp;
1226 nhdrmp = nhdrmp->b_next;
1227 }
1228
1229 /* Make sure we are on the last segment. */
1230 ASSERT3U(seg, ==, nsegs);
1231 ASSERT3P(nhdrmp->b_next, ==, NULL);
1232
1233 /*
1234 * Now we set the last segment header. The difference being
1235 * that FIN/PSH/RST flags are allowed.
1236 */
1237 bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
1238 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1239 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1240 niph->ipha_ident = htons(++ip_id);
1241 len = msgsize(nhdrmp->b_cont);
1242 ASSERT3S(len, >, 0);
1243 niph->ipha_length = htons(oiphlen + otcphlen + len);
1244 niph->ipha_hdr_checksum = 0;
1245 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1246 U32_TO_BE32(tcp_seq, ntcph->th_seq);
1247
1248 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1249 if (ocsum_flags & HCK_PARTIALCKSUM) {
1250 DB_CKSUMSTART(nhdrmp) = ocsum_start;
1251 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1252 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1253 tcp_sum = otcp_sum;
1254 tcp_sum += len + otcphlen;
1255 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1256 U16_TO_BE16(tcp_sum, ntcph->th_sum);
1257 }
1258
1259 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1260 (emul & MAC_HWCKSUM_EMULS)) {
1261 /* This should be the last mblk. */
1262 ASSERT3P(nhdrmp->b_next, ==, NULL);
1263 nhdrmp = mac_sw_cksum(nhdrmp, emul);
1264 prev_nhdrmp->b_next = nhdrmp;
1265 }
1266
1267 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1268 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1269 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len,
1270 uint_t, seg);
1271
	/*
	 * Free the reference to the original LSO message as it is
	 * being replaced by seg_chain.
	 */
1276 freemsg(omp);
1277 *head = seg_chain;
1278 *tail = nhdrmp;
1279 *count = nsegs;
1280 return;
1281
1282 fail:
1283 *head = NULL;
1284 *tail = NULL;
1285 *count = 0;
1286 }
1287
1288 #define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM)
1289
1290 /*
1291 * Emulate various hardware offload features in software. Take a chain
1292 * of packets as input and emulate the hardware features specified in
1293 * 'emul'. The resulting chain's head pointer replaces the 'mp_chain'
1294 * pointer given as input, and its tail pointer is written to
1295 * '*otail'. The number of packets in the new chain is written to
1296 * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus
1297 * may be NULL. The 'mp_chain' argument may point to a NULL chain; in
1298 * which case 'mp_chain' will simply stay a NULL chain.
1299 *
1300 * While unlikely, it is technically possible that this function could
1301 * receive a non-NULL chain as input and return a NULL chain as output
1302 * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be
1303 * zero). This could happen if all the packets in the chain are
1304 * dropped or if we fail to allocate new mblks. In this case, there is
1305 * nothing for the caller to free. In any event, the caller shouldn't
1306 * assume that '*mp_chain' is non-NULL on return.
1307 *
1308 * This function was written with three main use cases in mind.
1309 *
1310 * 1. To emulate hardware offloads when traveling mac-loopback (two
1311 * clients on the same mac). This is wired up in mac_tx_send().
1312 *
1313 * 2. To provide hardware offloads to the client when the underlying
1314 * provider cannot. This is currently wired up in mac_tx() but we
1315 * still only negotiate offloads when the underlying provider
1316 * supports them.
1317 *
1318 * 3. To emulate real hardware in simnet.
1319 */
void
mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul)
{
	mblk_t *head = NULL, *tail = NULL;
	uint_t count = 0;

	/* Only the checksum and LSO emulation bits may be requested. */
	ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0);
	ASSERT3P(mp_chain, !=, NULL);

	for (mblk_t *mp = *mp_chain; mp != NULL; ) {
		mblk_t *tmp, *next, *tmphead, *tmptail;
		struct ether_header *ehp;
		uint32_t flags;
		uint_t len = MBLKL(mp), l2len;

		/* Perform LSO/cksum one message at a time. */
		next = mp->b_next;
		mp->b_next = NULL;

		/*
		 * For our sanity the first mblk should contain at
		 * least the full L2 header.
		 */
		if (len < sizeof (struct ether_header)) {
			mac_drop_pkt(mp, "packet too short (A): %u", len);
			mp = next;
			continue;
		}

		ehp = (struct ether_header *)mp->b_rptr;
		if (ntohs(ehp->ether_type) == VLAN_TPID)
			l2len = sizeof (struct ether_vlan_header);
		else
			l2len = sizeof (struct ether_header);

		/*
		 * If the first mblk is solely the L2 header, then
		 * there better be more data.
		 */
		if (len < l2len || (len == l2len && mp->b_cont == NULL)) {
			mac_drop_pkt(mp, "packet too short (C): %u", len);
			mp = next;
			continue;
		}

		DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul);

		/*
		 * We use DB_CKSUMFLAGS (instead of mac_hcksum_get())
		 * because we don't want to mask-out the LSO flag.
		 */
		flags = DB_CKSUMFLAGS(mp);

		if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) {
			uint_t tmpcount = 0;

			/*
			 * LSO fix-up handles checksum emulation
			 * inline (if requested). It also frees mp.
			 */
			mac_sw_lso(mp, emul, &tmphead, &tmptail,
			    &tmpcount);
			if (tmphead == NULL) {
				/* mac_sw_lso() freed the mp. */
				mp = next;
				continue;
			}
			count += tmpcount;
		} else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) {
			tmp = mac_sw_cksum(mp, emul);
			if (tmp == NULL) {
				/* mac_sw_cksum() freed the mp. */
				mp = next;
				continue;
			}
			tmphead = tmp;
			tmptail = tmp;
			count++;
		} else {
			/* There is nothing to emulate. */
			tmp = mp;
			tmphead = tmp;
			tmptail = tmp;
			count++;
		}

		/*
		 * The tmp mblk chain is either the start of the new
		 * chain or added to the tail of the new chain.
		 */
		if (head == NULL) {
			head = tmphead;
			tail = tmptail;
		} else {
			/* Attach the new mblk to the end of the new chain. */
			tail->b_next = tmphead;
			tail = tmptail;
		}

		mp = next;
	}

	/* Publish the rebuilt chain; head may legitimately be NULL. */
	*mp_chain = head;

	if (otail != NULL)
		*otail = tail;

	if (ocount != NULL)
		*ocount = count;
}
1430
1431 /*
1432 * Add VLAN tag to the specified mblk.
1433 */
1434 mblk_t *
mac_add_vlan_tag(mblk_t * mp,uint_t pri,uint16_t vid)1435 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
1436 {
1437 mblk_t *hmp;
1438 struct ether_vlan_header *evhp;
1439 struct ether_header *ehp;
1440
1441 ASSERT(pri != 0 || vid != 0);
1442
1443 /*
1444 * Allocate an mblk for the new tagged ethernet header,
1445 * and copy the MAC addresses and ethertype from the
1446 * original header.
1447 */
1448
1449 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
1450 if (hmp == NULL) {
1451 freemsg(mp);
1452 return (NULL);
1453 }
1454
1455 evhp = (struct ether_vlan_header *)hmp->b_rptr;
1456 ehp = (struct ether_header *)mp->b_rptr;
1457
1458 bcopy(ehp, evhp, (ETHERADDRL * 2));
1459 evhp->ether_type = ehp->ether_type;
1460 evhp->ether_tpid = htons(ETHERTYPE_VLAN);
1461
1462 hmp->b_wptr += sizeof (struct ether_vlan_header);
1463 mp->b_rptr += sizeof (struct ether_header);
1464
1465 /*
1466 * Free the original message if it's now empty. Link the
1467 * rest of messages to the header message.
1468 */
1469 mac_hcksum_clone(mp, hmp);
1470 if (MBLKL(mp) == 0) {
1471 hmp->b_cont = mp->b_cont;
1472 freeb(mp);
1473 } else {
1474 hmp->b_cont = mp;
1475 }
1476 ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
1477
1478 /*
1479 * Initialize the new TCI (Tag Control Information).
1480 */
1481 evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
1482
1483 return (hmp);
1484 }
1485
1486 /*
1487 * Adds a VLAN tag with the specified VID and priority to each mblk of
1488 * the specified chain.
1489 */
1490 mblk_t *
mac_add_vlan_tag_chain(mblk_t * mp_chain,uint_t pri,uint16_t vid)1491 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
1492 {
1493 mblk_t *next_mp, **prev, *mp;
1494
1495 mp = mp_chain;
1496 prev = &mp_chain;
1497
1498 while (mp != NULL) {
1499 next_mp = mp->b_next;
1500 mp->b_next = NULL;
1501 if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
1502 freemsgchain(next_mp);
1503 break;
1504 }
1505 *prev = mp;
1506 prev = &mp->b_next;
1507 mp = mp->b_next = next_mp;
1508 }
1509
1510 return (mp_chain);
1511 }
1512
1513 /*
1514 * Strip VLAN tag
1515 */
1516 mblk_t *
mac_strip_vlan_tag(mblk_t * mp)1517 mac_strip_vlan_tag(mblk_t *mp)
1518 {
1519 mblk_t *newmp;
1520 struct ether_vlan_header *evhp;
1521
1522 evhp = (struct ether_vlan_header *)mp->b_rptr;
1523 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
1524 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1525
1526 if (DB_REF(mp) > 1) {
1527 newmp = copymsg(mp);
1528 if (newmp == NULL)
1529 return (NULL);
1530 freemsg(mp);
1531 mp = newmp;
1532 }
1533
1534 evhp = (struct ether_vlan_header *)mp->b_rptr;
1535
1536 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1537 mp->b_rptr += VLAN_TAGSZ;
1538 }
1539 return (mp);
1540 }
1541
1542 /*
1543 * Strip VLAN tag from each mblk of the chain.
1544 */
1545 mblk_t *
mac_strip_vlan_tag_chain(mblk_t * mp_chain)1546 mac_strip_vlan_tag_chain(mblk_t *mp_chain)
1547 {
1548 mblk_t *mp, *next_mp, **prev;
1549
1550 mp = mp_chain;
1551 prev = &mp_chain;
1552
1553 while (mp != NULL) {
1554 next_mp = mp->b_next;
1555 mp->b_next = NULL;
1556 if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
1557 freemsgchain(next_mp);
1558 break;
1559 }
1560 *prev = mp;
1561 prev = &mp->b_next;
1562 mp = mp->b_next = next_mp;
1563 }
1564
1565 return (mp_chain);
1566 }
1567
1568 /*
1569 * Default callback function. Used when the datapath is not yet initialized.
1570 */
1571 /* ARGSUSED */
1572 void
mac_rx_def(void * arg,mac_resource_handle_t resource,mblk_t * mp_chain,boolean_t loopback)1573 mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain,
1574 boolean_t loopback)
1575 {
1576 freemsgchain(mp_chain);
1577 }
1578
1579 /*
1580 * Determines the IPv6 header length accounting for all the optional IPv6
1581 * headers (hop-by-hop, destination, routing and fragment). The header length
1582 * and next header value (a transport header) is captured.
1583 *
1584 * Returns B_FALSE if all the IP headers are not in the same mblk otherwise
1585 * returns B_TRUE.
1586 */
boolean_t
mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
    uint8_t *next_hdr, ip6_frag_t **fragp)
{
	uint16_t length;
	uint_t ehdrlen;
	uint8_t *whereptr;
	uint8_t *nexthdrp;
	ip6_dest_t *desthdr;
	ip6_rthdr_t *rthdr;
	ip6_frag_t *fraghdr;

	/* The fixed IPv6 header must lie entirely before endptr. */
	if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
		return (B_FALSE);
	ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
	length = IPV6_HDR_LEN;
	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */

	if (fragp != NULL)
		*fragp = NULL;

	/* Walk the extension-header chain starting at ip6_nxt. */
	nexthdrp = &ip6h->ip6_nxt;
	while (whereptr < endptr) {
		/* Is there enough left for len + nexthdr? */
		if (whereptr + MIN_EHDR_LEN > endptr)
			break;

		switch (*nexthdrp) {
		case IPPROTO_HOPOPTS:
		case IPPROTO_DSTOPTS:
			/* Assumes the headers are identical for hbh and dst */
			desthdr = (ip6_dest_t *)whereptr;
			/* ip6d_len counts 8-byte units beyond the first. */
			ehdrlen = 8 * (desthdr->ip6d_len + 1);
			if ((uchar_t *)desthdr + ehdrlen > endptr)
				return (B_FALSE);
			nexthdrp = &desthdr->ip6d_nxt;
			break;
		case IPPROTO_ROUTING:
			rthdr = (ip6_rthdr_t *)whereptr;
			ehdrlen = 8 * (rthdr->ip6r_len + 1);
			if ((uchar_t *)rthdr + ehdrlen > endptr)
				return (B_FALSE);
			nexthdrp = &rthdr->ip6r_nxt;
			break;
		case IPPROTO_FRAGMENT:
			fraghdr = (ip6_frag_t *)whereptr;
			ehdrlen = sizeof (ip6_frag_t);
			if ((uchar_t *)&fraghdr[1] > endptr)
				return (B_FALSE);
			nexthdrp = &fraghdr->ip6f_nxt;
			/* Report the fragment header if the caller asked. */
			if (fragp != NULL)
				*fragp = fraghdr;
			break;
		case IPPROTO_NONE:
			/* No next header means we're finished */
		default:
			/* A transport (non-extension) header: done. */
			*hdr_length = length;
			*next_hdr = *nexthdrp;
			return (B_TRUE);
		}
		length += ehdrlen;
		whereptr += ehdrlen;
		*hdr_length = length;
		*next_hdr = *nexthdrp;
	}
	switch (*nexthdrp) {
	case IPPROTO_HOPOPTS:
	case IPPROTO_DSTOPTS:
	case IPPROTO_ROUTING:
	case IPPROTO_FRAGMENT:
		/*
		 * If any known extension headers are still to be processed,
		 * the packet's malformed (or at least all the IP header(s) are
		 * not in the same mblk - and that should never happen.
		 */
		return (B_FALSE);

	default:
		/*
		 * If we get here, we know that all of the IP headers were in
		 * the same mblk, even if the ULP header is in the next mblk.
		 */
		*hdr_length = length;
		*next_hdr = *nexthdrp;
		return (B_TRUE);
	}
}
1674
1675 /*
1676 * The following set of routines are there to take care of interrupt
1677 * re-targeting for legacy (fixed) interrupts. Some older versions
1678 * of the popular NICs like e1000g do not support MSI-X interrupts
1679 * and they reserve fixed interrupts for RX/TX rings. To re-target
1680 * these interrupts, PCITOOL ioctls need to be used.
1681 */
1682 typedef struct mac_dladm_intr {
1683 int ino;
1684 int cpu_id;
1685 char driver_path[MAXPATHLEN];
1686 char nexus_path[MAXPATHLEN];
1687 } mac_dladm_intr_t;
1688
1689 /* Bind the interrupt to cpu_num */
1690 static int
mac_set_intr(ldi_handle_t lh,processorid_t cpu_num,int oldcpuid,int ino)1691 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
1692 {
1693 pcitool_intr_set_t iset;
1694 int err;
1695
1696 iset.old_cpu = oldcpuid;
1697 iset.ino = ino;
1698 iset.cpu_id = cpu_num;
1699 iset.user_version = PCITOOL_VERSION;
1700 err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
1701 kcred, NULL);
1702
1703 return (err);
1704 }
1705
1706 /*
1707 * Search interrupt information. iget is filled in with the info to search
1708 */
1709 static boolean_t
mac_search_intrinfo(pcitool_intr_get_t * iget_p,mac_dladm_intr_t * dln)1710 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
1711 {
1712 int i;
1713 char driver_path[2 * MAXPATHLEN];
1714
1715 for (i = 0; i < iget_p->num_devs; i++) {
1716 (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
1717 (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
1718 ":%s%d", iget_p->dev[i].driver_name,
1719 iget_p->dev[i].dev_inst);
1720 /* Match the device path for the device path */
1721 if (strcmp(driver_path, dln->driver_path) == 0) {
1722 dln->ino = iget_p->ino;
1723 dln->cpu_id = iget_p->cpu_id;
1724 return (B_TRUE);
1725 }
1726 }
1727 return (B_FALSE);
1728 }
1729
/*
 * Query the nexus for the devices attached to interrupt 'ino' on CPU
 * 'oldcpuid' and check whether one of them is our device (matched by
 * dln->driver_path). On success, dln is filled in with the ino and the
 * CPU the interrupt is currently bound to. Uses a two-pass PCITOOL
 * ioctl protocol: the first request is sized for zero devices, and if
 * the nexus reports more, the buffer is reallocated to fit and the
 * request reissued.
 */
static boolean_t
mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
    mac_dladm_intr_t *dln)
{
	pcitool_intr_get_t *iget_p;
	int ipsz;
	int nipsz;
	int err;
	uint8_t inum;	/* device count carried to the second request */

	/*
	 * Check if SLEEP is OK, i.e if could come here in response to
	 * changing the fanout due to some callback from the driver, say
	 * link speed changes.
	 */
	ipsz = PCITOOL_IGET_SIZE(0);
	iget_p = kmem_zalloc(ipsz, KM_SLEEP);

	iget_p->num_devs_ret = 0;
	iget_p->user_version = PCITOOL_VERSION;
	iget_p->cpu_id = oldcpuid;
	iget_p->ino = ino;

	/* First pass: learn how many devices share this vector. */
	err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
	    FKIOCTL, kcred, NULL);
	if (err != 0) {
		kmem_free(iget_p, ipsz);
		return (B_FALSE);
	}
	if (iget_p->num_devs == 0) {
		/* Vector exists but no devices; not ours. */
		kmem_free(iget_p, ipsz);
		return (B_FALSE);
	}
	inum = iget_p->num_devs;
	if (iget_p->num_devs_ret < iget_p->num_devs) {
		/* Reallocate */
		nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);

		kmem_free(iget_p, ipsz);
		ipsz = nipsz;
		iget_p = kmem_zalloc(ipsz, KM_SLEEP);

		/* Second pass with room for all reported devices. */
		iget_p->num_devs_ret = inum;
		iget_p->cpu_id = oldcpuid;
		iget_p->ino = ino;
		iget_p->user_version = PCITOOL_VERSION;
		err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
		    FKIOCTL, kcred, NULL);
		if (err != 0) {
			kmem_free(iget_p, ipsz);
			return (B_FALSE);
		}
		/* defensive */
		if (iget_p->num_devs != iget_p->num_devs_ret) {
			kmem_free(iget_p, ipsz);
			return (B_FALSE);
		}
	}

	if (mac_search_intrinfo(iget_p, dln)) {
		kmem_free(iget_p, ipsz);
		return (B_TRUE);
	}
	kmem_free(iget_p, ipsz);
	return (B_FALSE);
}
1800
1801 /*
1802 * Get the interrupts and check each one to see if it is for our device.
1803 */
1804 static int
mac_validate_intr(ldi_handle_t lh,mac_dladm_intr_t * dln,processorid_t cpuid)1805 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
1806 {
1807 pcitool_intr_info_t intr_info;
1808 int err;
1809 int ino;
1810 int oldcpuid;
1811
1812 err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
1813 FKIOCTL, kcred, NULL);
1814 if (err != 0)
1815 return (-1);
1816
1817 for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
1818 for (ino = 0; ino < intr_info.num_intr; ino++) {
1819 if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
1820 if (dln->cpu_id == cpuid)
1821 return (0);
1822 return (1);
1823 }
1824 }
1825 }
1826 return (-1);
1827 }
1828
1829 /*
1830 * Obtain the nexus parent node info. for mdip.
1831 */
static dev_info_t *
mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
{
	struct dev_info *tdip = (struct dev_info *)mdip;
	struct ddi_minor_data *minordata;
	dev_info_t *pdip;
	char pathname[MAXPATHLEN];

	/*
	 * Walk from mdip up toward the root of the device tree looking
	 * for an ancestor that exposes an interrupt-control
	 * (DDI_NT_INTRCTL) minor node; that ancestor is the interrupt
	 * nexus for this device. Its ":intr" minor path is recorded in
	 * dln->nexus_path for a later ldi_open_by_name().
	 */
	while (tdip != NULL) {
		/*
		 * The netboot code could call this function while walking the
		 * device tree so we need to use ndi_devi_tryenter() here to
		 * avoid deadlock.
		 */
		if (ndi_devi_tryenter((dev_info_t *)tdip) == 0)
			break;

		for (minordata = tdip->devi_minor; minordata != NULL;
		    minordata = minordata->next) {
			if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
			    strlen(DDI_NT_INTRCTL)) == 0) {
				pdip = minordata->dip;
				(void) ddi_pathname(pdip, pathname);
				(void) snprintf(dln->nexus_path, MAXPATHLEN,
				    "/devices%s:intr", pathname);
				(void) ddi_pathname_minor(minordata, pathname);
				ndi_devi_exit((dev_info_t *)tdip);
				return (pdip);
			}
		}
		ndi_devi_exit((dev_info_t *)tdip);
		tdip = tdip->devi_parent;
	}
	/* No interrupt nexus found (or a node was busy). */
	return (NULL);
}
1867
1868 /*
1869 * For a primary MAC client, if the user has set a list or CPUs or
1870 * we have obtained it implicitly, we try to retarget the interrupt
1871 * for that device on one of the CPUs in the list.
1872 * We assign the interrupt to the same CPU as the poll thread.
1873 */
1874 static boolean_t
mac_check_interrupt_binding(dev_info_t * mdip,int32_t cpuid)1875 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
1876 {
1877 ldi_handle_t lh = NULL;
1878 ldi_ident_t li = NULL;
1879 int err;
1880 int ret;
1881 mac_dladm_intr_t dln;
1882 dev_info_t *dip;
1883 struct ddi_minor_data *minordata;
1884
1885 dln.nexus_path[0] = '\0';
1886 dln.driver_path[0] = '\0';
1887
1888 minordata = ((struct dev_info *)mdip)->devi_minor;
1889 while (minordata != NULL) {
1890 if (minordata->type == DDM_MINOR)
1891 break;
1892 minordata = minordata->next;
1893 }
1894 if (minordata == NULL)
1895 return (B_FALSE);
1896
1897 (void) ddi_pathname_minor(minordata, dln.driver_path);
1898
1899 dip = mac_get_nexus_node(mdip, &dln);
1900 /* defensive */
1901 if (dip == NULL)
1902 return (B_FALSE);
1903
1904 err = ldi_ident_from_major(ddi_driver_major(dip), &li);
1905 if (err != 0)
1906 return (B_FALSE);
1907
1908 err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
1909 if (err != 0)
1910 return (B_FALSE);
1911
1912 ret = mac_validate_intr(lh, &dln, cpuid);
1913 if (ret < 0) {
1914 (void) ldi_close(lh, FREAD|FWRITE, kcred);
1915 return (B_FALSE);
1916 }
1917 /* cmn_note? */
1918 if (ret != 0)
1919 if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
1920 != 0) {
1921 (void) ldi_close(lh, FREAD|FWRITE, kcred);
1922 return (B_FALSE);
1923 }
1924 (void) ldi_close(lh, FREAD|FWRITE, kcred);
1925 return (B_TRUE);
1926 }
1927
1928 void
mac_client_set_intr_cpu(void * arg,mac_client_handle_t mch,int32_t cpuid)1929 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
1930 {
1931 dev_info_t *mdip = (dev_info_t *)arg;
1932 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1933 mac_resource_props_t *mrp;
1934 mac_perim_handle_t mph;
1935 flow_entry_t *flent = mcip->mci_flent;
1936 mac_soft_ring_set_t *rx_srs;
1937 mac_cpus_t *srs_cpu;
1938
1939 if (!mac_check_interrupt_binding(mdip, cpuid))
1940 cpuid = -1;
1941 mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
1942 mrp = MCIP_RESOURCE_PROPS(mcip);
1943 mrp->mrp_rx_intr_cpu = cpuid;
1944 if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
1945 rx_srs = flent->fe_rx_srs[1];
1946 srs_cpu = &rx_srs->srs_cpu;
1947 srs_cpu->mc_rx_intr_cpu = cpuid;
1948 }
1949 mac_perim_exit(mph);
1950 }
1951
1952 int32_t
mac_client_intr_cpu(mac_client_handle_t mch)1953 mac_client_intr_cpu(mac_client_handle_t mch)
1954 {
1955 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1956 mac_cpus_t *srs_cpu;
1957 mac_soft_ring_set_t *rx_srs;
1958 flow_entry_t *flent = mcip->mci_flent;
1959 mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
1960 mac_ring_t *ring;
1961 mac_intr_t *mintr;
1962
1963 /*
1964 * Check if we need to retarget the interrupt. We do this only
1965 * for the primary MAC client. We do this if we have the only
1966 * exclusive ring in the group.
1967 */
1968 if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
1969 rx_srs = flent->fe_rx_srs[1];
1970 srs_cpu = &rx_srs->srs_cpu;
1971 ring = rx_srs->srs_ring;
1972 mintr = &ring->mr_info.mri_intr;
1973 /*
1974 * If ddi_handle is present or the poll CPU is
1975 * already bound to the interrupt CPU, return -1.
1976 */
1977 if (mintr->mi_ddi_handle != NULL ||
1978 ((mrp->mrp_ncpus != 0) &&
1979 (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
1980 return (-1);
1981 }
1982 return (srs_cpu->mc_rx_pollid);
1983 }
1984 return (-1);
1985 }
1986
1987 void *
mac_get_devinfo(mac_handle_t mh)1988 mac_get_devinfo(mac_handle_t mh)
1989 {
1990 mac_impl_t *mip = (mac_impl_t *)mh;
1991
1992 return ((void *)mip->mi_dip);
1993 }
1994
1995 #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
1996 #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
1997 #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
1998
1999 uint64_t
mac_pkt_hash(uint_t media,mblk_t * mp,uint8_t policy,boolean_t is_outbound)2000 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
2001 {
2002 struct ether_header *ehp;
2003 uint64_t hash = 0;
2004 uint16_t sap;
2005 uint_t skip_len;
2006 uint8_t proto;
2007 boolean_t ip_fragmented;
2008
2009 /*
2010 * We may want to have one of these per MAC type plugin in the
2011 * future. For now supports only ethernet.
2012 */
2013 if (media != DL_ETHER)
2014 return (0L);
2015
2016 /* for now we support only outbound packets */
2017 ASSERT(is_outbound);
2018 ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
2019 ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
2020
2021 /* compute L2 hash */
2022
2023 ehp = (struct ether_header *)mp->b_rptr;
2024
2025 if ((policy & MAC_PKT_HASH_L2) != 0) {
2026 uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
2027 uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
2028 hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
2029 policy &= ~MAC_PKT_HASH_L2;
2030 }
2031
2032 if (policy == 0)
2033 goto done;
2034
2035 /* skip ethernet header */
2036
2037 sap = ntohs(ehp->ether_type);
2038 if (sap == ETHERTYPE_VLAN) {
2039 struct ether_vlan_header *evhp;
2040 mblk_t *newmp = NULL;
2041
2042 skip_len = sizeof (struct ether_vlan_header);
2043 if (MBLKL(mp) < skip_len) {
2044 /* the vlan tag is the payload, pull up first */
2045 newmp = msgpullup(mp, -1);
2046 if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
2047 goto done;
2048 }
2049 evhp = (struct ether_vlan_header *)newmp->b_rptr;
2050 } else {
2051 evhp = (struct ether_vlan_header *)mp->b_rptr;
2052 }
2053
2054 sap = ntohs(evhp->ether_type);
2055 freemsg(newmp);
2056 } else {
2057 skip_len = sizeof (struct ether_header);
2058 }
2059
2060 /* if ethernet header is in its own mblk, skip it */
2061 if (MBLKL(mp) <= skip_len) {
2062 skip_len -= MBLKL(mp);
2063 mp = mp->b_cont;
2064 if (mp == NULL)
2065 goto done;
2066 }
2067
2068 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
2069
2070 /* compute IP src/dst addresses hash and skip IPv{4,6} header */
2071
2072 switch (sap) {
2073 case ETHERTYPE_IP: {
2074 ipha_t *iphp;
2075
2076 /*
2077 * If the header is not aligned or the header doesn't fit
2078 * in the mblk, bail now. Note that this may cause packets
2079 * reordering.
2080 */
2081 iphp = (ipha_t *)(mp->b_rptr + skip_len);
2082 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
2083 !OK_32PTR((char *)iphp))
2084 goto done;
2085
2086 proto = iphp->ipha_protocol;
2087 skip_len += IPH_HDR_LENGTH(iphp);
2088
2089 /* Check if the packet is fragmented. */
2090 ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
2091 IPH_OFFSET;
2092
2093 /*
2094 * For fragmented packets, use addresses in addition to
2095 * the frag_id to generate the hash inorder to get
2096 * better distribution.
2097 */
2098 if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
2099 uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
2100 uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
2101
2102 hash ^= (PKT_HASH_4BYTES(ip_src) ^
2103 PKT_HASH_4BYTES(ip_dst));
2104 policy &= ~MAC_PKT_HASH_L3;
2105 }
2106
2107 if (ip_fragmented) {
2108 uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
2109 hash ^= PKT_HASH_2BYTES(identp);
2110 goto done;
2111 }
2112 break;
2113 }
2114 case ETHERTYPE_IPV6: {
2115 ip6_t *ip6hp;
2116 ip6_frag_t *frag = NULL;
2117 uint16_t hdr_length;
2118
2119 /*
2120 * If the header is not aligned or the header doesn't fit
2121 * in the mblk, bail now. Note that this may cause packets
2122 * reordering.
2123 */
2124
2125 ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
2126 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
2127 !OK_32PTR((char *)ip6hp))
2128 goto done;
2129
2130 if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
2131 &proto, &frag))
2132 goto done;
2133 skip_len += hdr_length;
2134
2135 /*
2136 * For fragmented packets, use addresses in addition to
2137 * the frag_id to generate the hash inorder to get
2138 * better distribution.
2139 */
2140 if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
2141 uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
2142 uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
2143
2144 hash ^= (PKT_HASH_4BYTES(ip_src) ^
2145 PKT_HASH_4BYTES(ip_dst));
2146 policy &= ~MAC_PKT_HASH_L3;
2147 }
2148
2149 if (frag != NULL) {
2150 uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
2151 hash ^= PKT_HASH_4BYTES(identp);
2152 goto done;
2153 }
2154 break;
2155 }
2156 default:
2157 goto done;
2158 }
2159
2160 if (policy == 0)
2161 goto done;
2162
2163 /* if ip header is in its own mblk, skip it */
2164 if (MBLKL(mp) <= skip_len) {
2165 skip_len -= MBLKL(mp);
2166 mp = mp->b_cont;
2167 if (mp == NULL)
2168 goto done;
2169 }
2170
2171 /* parse ULP header */
2172 again:
2173 switch (proto) {
2174 case IPPROTO_TCP:
2175 case IPPROTO_UDP:
2176 case IPPROTO_ESP:
2177 case IPPROTO_SCTP:
2178 /*
2179 * These Internet Protocols are intentionally designed
2180 * for hashing from the git-go. Port numbers are in the first
2181 * word for transports, SPI is first for ESP.
2182 */
2183 if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
2184 goto done;
2185 hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
2186 break;
2187
2188 case IPPROTO_AH: {
2189 ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
2190 uint_t ah_length = AH_TOTAL_LEN(ah);
2191
2192 if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
2193 goto done;
2194
2195 proto = ah->ah_nexthdr;
2196 skip_len += ah_length;
2197
2198 /* if AH header is in its own mblk, skip it */
2199 if (MBLKL(mp) <= skip_len) {
2200 skip_len -= MBLKL(mp);
2201 mp = mp->b_cont;
2202 if (mp == NULL)
2203 goto done;
2204 }
2205
2206 goto again;
2207 }
2208 }
2209
2210 done:
2211 return (hash);
2212 }
2213