xref: /illumos-gate/usr/src/uts/common/io/mac/mac_util.c (revision 10597944279b73141546abca67a8e947810e5bb2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2019 Joyent, Inc.
24  * Copyright 2025 Oxide Computer Company
25  */
26 
27 /*
28  * MAC Services Module - misc utilities
29  */
30 
31 #include <sys/types.h>
32 #include <sys/mac.h>
33 #include <sys/mac_impl.h>
34 #include <sys/mac_client_priv.h>
35 #include <sys/mac_client_impl.h>
36 #include <sys/mac_soft_ring.h>
37 #include <sys/strsubr.h>
38 #include <sys/strsun.h>
39 #include <sys/vlan.h>
40 #include <sys/pattr.h>
41 #include <sys/pci_tools.h>
42 #include <inet/ip.h>
43 #include <inet/ip_impl.h>
44 #include <inet/ip6.h>
45 #include <sys/vtrace.h>
46 #include <sys/dlpi.h>
47 #include <sys/sunndi.h>
48 #include <inet/ipsec_impl.h>
49 #include <inet/sadb.h>
50 #include <inet/ipsecesp.h>
51 #include <inet/ipsecah.h>
52 #include <inet/tcp.h>
53 #include <inet/sctp_ip.h>
54 
55 /*
56  * The next two functions are used for dropping packets or chains of
57  * packets, respectively. We could use one function for both but
58  * separating the use cases allows us to specify intent and prevent
59  * dropping more data than intended.
60  *
61  * The purpose of these functions is to aid the debugging effort,
62  * especially in production. Rather than use freemsg()/freemsgchain(),
63  * it's preferable to use these functions when dropping a packet in
64  * the MAC layer. These functions should only be used during
65  * unexpected conditions. That is, any time a packet is dropped
66  * outside of the regular, successful datapath. Consolidating all
67  * drops on these functions allows the user to trace one location and
68  * determine why the packet was dropped based on the msg. It also
69  * allows the user to inspect the packet before it is freed. Finally,
70  * it allows the user to avoid tracing freemsg()/freemsgchain() thus
71  * keeping the hot path running as efficiently as possible.
72  *
73  * NOTE: At this time not all MAC drops are aggregated on these
74  * functions; but that is the plan. This comment should be erased once
75  * completed.
76  */
77 
78 /*PRINTFLIKE2*/
79 void
mac_drop_pkt(mblk_t * mp,const char * fmt,...)80 mac_drop_pkt(mblk_t *mp, const char *fmt, ...)
81 {
82 	va_list adx;
83 	char msg[128];
84 	char *msgp = msg;
85 
86 	ASSERT3P(mp->b_next, ==, NULL);
87 
88 	va_start(adx, fmt);
89 	(void) vsnprintf(msgp, sizeof (msg), fmt, adx);
90 	va_end(adx);
91 
92 	DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
93 	freemsg(mp);
94 }
95 
96 /*PRINTFLIKE2*/
97 void
mac_drop_chain(mblk_t * chain,const char * fmt,...)98 mac_drop_chain(mblk_t *chain, const char *fmt, ...)
99 {
100 	va_list adx;
101 	char msg[128];
102 	char *msgp = msg;
103 
104 	va_start(adx, fmt);
105 	(void) vsnprintf(msgp, sizeof (msg), fmt, adx);
106 	va_end(adx);
107 
108 	/*
109 	 * We could use freemsgchain() for the actual freeing but
110 	 * since we are already walking the chain to fire the dtrace
111 	 * probe we might as well free the msg here too.
112 	 */
113 	for (mblk_t *mp = chain, *next; mp != NULL; ) {
114 		next = mp->b_next;
115 		DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
116 		mp->b_next = NULL;
117 		freemsg(mp);
118 		mp = next;
119 	}
120 }
121 
122 /*
123  * Copy an mblk, preserving its hardware checksum flags.
124  */
125 static mblk_t *
mac_copymsg_cksum(mblk_t * mp)126 mac_copymsg_cksum(mblk_t *mp)
127 {
128 	mblk_t *mp1;
129 
130 	mp1 = copymsg(mp);
131 	if (mp1 == NULL)
132 		return (NULL);
133 
134 	mac_hcksum_clone(mp, mp1);
135 
136 	return (mp1);
137 }
138 
139 /*
140  * Copy an mblk chain, presenting the hardware checksum flags of the
141  * individual mblks.
142  */
143 mblk_t *
mac_copymsgchain_cksum(mblk_t * mp)144 mac_copymsgchain_cksum(mblk_t *mp)
145 {
146 	mblk_t *nmp = NULL;
147 	mblk_t **nmpp = &nmp;
148 
149 	for (; mp != NULL; mp = mp->b_next) {
150 		if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
151 			freemsgchain(nmp);
152 			return (NULL);
153 		}
154 
155 		nmpp = &((*nmpp)->b_next);
156 	}
157 
158 	return (nmp);
159 }
160 
/*
 * Perform software checksum on a single message, if needed. The emulation
 * performed is determined by an intersection of the mblk's flags and the emul
 * flags requested. The emul flags are documented in mac.h.
 *
 * Returns the (possibly replaced) mblk on success, or NULL after dropping
 * the packet when the requested emulation cannot be performed.
 */
static mblk_t *
mac_sw_cksum(mblk_t *mp, mac_emul_t emul)
{
	mac_ether_offload_info_t meoi = { 0 };
	const char *err = "";

	/*
	 * The only current caller is mac_hw_emul(), which handles any chaining
	 * of mblks prior to now.
	 */
	VERIFY3P(mp->b_next, ==, NULL);

	uint32_t flags = DB_CKSUMFLAGS(mp);

	/* Why call this if checksum emulation isn't needed? */
	ASSERT3U(flags & (HCK_FLAGS), !=, 0);
	/* But also, requesting both ULP cksum types is improper */
	if ((flags & HCK_FULLCKSUM) != 0 && (flags & HCK_PARTIALCKSUM) != 0) {
		err = "full and partial ULP cksum requested";
		goto bail;
	}

	/* Which emulations are both requested by the caller and the mblk? */
	const boolean_t do_v4_cksum = (emul & MAC_IPCKSUM_EMUL) != 0 &&
	    (flags & HCK_IPV4_HDRCKSUM) != 0;
	const boolean_t do_ulp_cksum = (emul & MAC_HWCKSUM_EMUL) != 0 &&
	    (flags & (HCK_FULLCKSUM | HCK_PARTIALCKSUM)) != 0;
	const boolean_t ulp_prefer_partial = (flags & HCK_PARTIALCKSUM) != 0;

	mac_ether_offload_info(mp, &meoi);
	if ((meoi.meoi_flags & MEOI_L2INFO_SET) == 0 ||
	    (meoi.meoi_l3proto != ETHERTYPE_IP &&
	    meoi.meoi_l3proto != ETHERTYPE_IPV6)) {
		/* Non-IP traffic (like ARP) is left alone */
		return (mp);
	}

	/*
	 * Ensure that requested checksum type(s) are supported by the
	 * protocols encoded in the packet headers.
	 */
	if (do_v4_cksum) {
		if (meoi.meoi_l3proto != ETHERTYPE_IP) {
			err = "IPv4 csum requested on non-IPv4 packet";
			goto bail;
		}
	}
	if (do_ulp_cksum) {
		if ((meoi.meoi_flags & MEOI_L4INFO_SET) == 0) {
			err = "missing ULP header";
			goto bail;
		}
		switch (meoi.meoi_l4proto) {
		case IPPROTO_TCP:
		case IPPROTO_UDP:
		case IPPROTO_ICMP:
		case IPPROTO_ICMPV6:
		case IPPROTO_SCTP:
			break;
		default:
			err = "unexpected ULP";
			goto bail;
		}
	}

	/*
	 * If the first mblk of this packet contains only the Ethernet header,
	 * skip past it for now. Packets with their data contained in only a
	 * single mblk can then use the fastpaths tuned to that possibility.
	 */
	mblk_t *skipped_hdr = NULL;
	if (MBLKL(mp) == meoi.meoi_l2hlen) {
		meoi.meoi_len -= meoi.meoi_l2hlen;
		meoi.meoi_l2hlen = 0;
		skipped_hdr = mp;
		mp = mp->b_cont;

		ASSERT(mp != NULL);
	}

	/*
	 * Ensure that all of the headers we need to access are:
	 * 1. Collected in the first mblk
	 * 2. Held in a data-block which is safe for us to modify
	 *    (It must have a refcount of 1)
	 */
	const size_t hdr_len_reqd = (meoi.meoi_l2hlen + meoi.meoi_l3hlen) +
	    (do_ulp_cksum ? meoi.meoi_l4hlen : 0);
	if (MBLKL(mp) < hdr_len_reqd || DB_REF(mp) > 1) {
		mblk_t *hdrmp = msgpullup(mp, hdr_len_reqd);

		if (hdrmp == NULL) {
			err = "could not pullup msg headers";
			goto bail;
		}

		/* msgpullup() does not carry over the cksum metadata. */
		mac_hcksum_clone(mp, hdrmp);
		if (skipped_hdr != NULL) {
			ASSERT3P(skipped_hdr->b_cont, ==, mp);
			skipped_hdr->b_cont = hdrmp;
		}
		freemsg(mp);
		mp = hdrmp;
	}

	/* Calculate IPv4 header checksum, if requested */
	if (do_v4_cksum) {
		/*
		 * While unlikely, it's possible to write code that might end up
		 * calling mac_sw_cksum() twice on the same mblk (performing
		 * both LSO and checksum emulation in a single mblk chain loop
		 * -- the LSO emulation inserts a new chain into the existing
		 * chain and then the loop iterates back over the new segments
		 * and emulates the checksum a second time).  Normally this
		 * wouldn't be a problem, because the HCK_*_OK flags are
		 * supposed to indicate that we don't need to perform the
		 * work. But HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the
		 * same value; so we cannot use these flags to determine if the
		 * IP header checksum has already been calculated or not. For
		 * this reason, we zero out the checksum first. In the
		 * future, we should fix the HCK_* flags.
		 */
		ipha_t *ipha = (ipha_t *)(mp->b_rptr + meoi.meoi_l2hlen);
		ipha->ipha_hdr_checksum = 0;
		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
		flags &= ~HCK_IPV4_HDRCKSUM;
		flags |= HCK_IPV4_HDRCKSUM_OK;
	}

	/*
	 * The SCTP is different from all the other protocols in that it uses
	 * CRC32 for its checksum, rather than ones' complement.
	 */
	if (do_ulp_cksum && meoi.meoi_l4proto == IPPROTO_SCTP) {
		if (ulp_prefer_partial) {
			err = "SCTP does not support partial checksum";
			goto bail;
		}

		const uint_t ulp_off = meoi.meoi_l2hlen + meoi.meoi_l3hlen;
		sctp_hdr_t *sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_off);

		/* Zero the field first so it is excluded from the CRC. */
		sctph->sh_chksum = 0;
		sctph->sh_chksum = sctp_cksum(mp, ulp_off);

		flags &= ~HCK_FULLCKSUM;
		flags |= HCK_FULLCKSUM_OK;
		goto success;
	}

	/* Calculate full ULP checksum, if requested */
	if (do_ulp_cksum && !ulp_prefer_partial) {
		/*
		 * Calculate address and length portions of pseudo-header csum
		 */
		uint32_t cksum = 0;
		if (meoi.meoi_l3proto == ETHERTYPE_IP) {
			const ipha_t *ipha =
			    (const ipha_t *)(mp->b_rptr + meoi.meoi_l2hlen);
			const uint16_t *ipp =
			    (const uint16_t *)(&ipha->ipha_src);

			/* src (2 words) and dst (2 words) addresses */
			cksum += ipp[0] + ipp[1] + ipp[2] + ipp[3];

			/*
			 * While it is tempting to calculate the payload length
			 * solely from `meoi`, like as done below for IPv6,
			 * doing so is a trap.  Packets shorter than 60 bytes
			 * will get padded out to that length in order to meet
			 * the minimums for Ethernet.  Instead, we pull the
			 * length from the IP header.
			 */
			const uint16_t payload_len =
			    ntohs(ipha->ipha_length) - meoi.meoi_l3hlen;
			cksum += htons(payload_len);
		} else if (meoi.meoi_l3proto == ETHERTYPE_IPV6) {
			const ip6_t *ip6h =
			    (const ip6_t *)(mp->b_rptr + meoi.meoi_l2hlen);
			const uint16_t *ipp =
			    (const uint16_t *)(&ip6h->ip6_src);

			/* src (8 words) and dst (8 words) addresses */
			cksum += ipp[0] + ipp[1] + ipp[2] + ipp[3] +
			    ipp[4] + ipp[5] + ipp[6] + ipp[7];
			cksum += ipp[8] + ipp[9] + ipp[10] + ipp[11] +
			    ipp[12] + ipp[13] + ipp[14] + ipp[15];

			const uint16_t payload_len = meoi.meoi_len -
			    ((uint16_t)meoi.meoi_l2hlen + meoi.meoi_l3hlen);
			cksum += htons(payload_len);
		} else {
			/*
			 * Since we already checked for recognized L3 protocols
			 * earlier, this should not be reachable.
			 */
			panic("L3 protocol unexpectedly changed");
		}

		/* protocol portion of pseudo-header */
		uint_t cksum_off;
		switch (meoi.meoi_l4proto) {
		case IPPROTO_TCP:
			cksum += IP_TCP_CSUM_COMP;
			cksum_off = TCP_CHECKSUM_OFFSET;
			break;
		case IPPROTO_UDP:
			cksum += IP_UDP_CSUM_COMP;
			cksum_off = UDP_CHECKSUM_OFFSET;
			break;
		case IPPROTO_ICMP:
			/* ICMP cksum does not include pseudo-header contents */
			cksum = 0;
			cksum_off = ICMP_CHECKSUM_OFFSET;
			break;
		case IPPROTO_ICMPV6:
			cksum += IP_ICMPV6_CSUM_COMP;
			cksum_off = ICMPV6_CHECKSUM_OFFSET;
			break;
		default:
			err = "unrecognized L4 protocol";
			goto bail;
		}

		/*
		 * With IP_CSUM() taking into account the pseudo-header
		 * checksum, make sure the ULP checksum field is zeroed before
		 * computing the rest.
		 */
		const uint_t l4_off = meoi.meoi_l3hlen + meoi.meoi_l2hlen;
		uint16_t *up = (uint16_t *)(mp->b_rptr + l4_off + cksum_off);
		*up = 0;
		cksum = IP_CSUM(mp, l4_off, cksum);

		if (meoi.meoi_l4proto == IPPROTO_UDP && cksum == 0) {
			/*
			 * A zero checksum is not allowed on UDPv6, and on UDPv4
			 * implies no checksum.  In either case, invert to a
			 * value of all-1s.
			 */
			*up = 0xffff;
		} else {
			*up = cksum;
		}

		flags &= ~HCK_FULLCKSUM;
		flags |= HCK_FULLCKSUM_OK;
		goto success;
	}

	/* Calculate partial ULP checksum, if requested */
	if (do_ulp_cksum && ulp_prefer_partial) {
		uint32_t start, stuff, end, value;
		mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL);

		ASSERT3S(end, >, start);

		/*
		 * The prior size checks against the header length data ensure
		 * that the mblk contains everything through at least the ULP
		 * header, but if the partial checksum (unexpectedly) requests
		 * its result be stored past that, we cannot continue.
		 */
		if (stuff + sizeof (uint16_t) > MBLKL(mp)) {
			err = "partial csum request is out of bounds";
			goto bail;
		}

		/* start/stuff offsets are relative to the L3 header. */
		uchar_t *ipp = (uchar_t *)(mp->b_rptr + meoi.meoi_l2hlen);
		uint16_t *up = (uint16_t *)(ipp + stuff);

		const uint16_t partial = *up;
		*up = 0;
		const uint16_t cksum =
		    ~IP_CSUM_PARTIAL(mp, start + meoi.meoi_l2hlen, partial);
		*up = cksum != 0 ? cksum : ~cksum;

		flags &= ~HCK_PARTIALCKSUM;
		flags |= HCK_FULLCKSUM_OK;
	}

success:
	/*
	 * With the checksum(s) calculated, store the updated flags to reflect
	 * the current status, and zero out any of the partial-checksum fields
	 * which would be irrelevant now.
	 */
	mac_hcksum_set(mp, 0, 0, 0, 0, flags);

	/* Don't forget to reattach the header. */
	if (skipped_hdr != NULL) {
		ASSERT3P(skipped_hdr->b_cont, ==, mp);

		/*
		 * Duplicate the HCKSUM data into the header mblk.
		 *
		 * This mimics mac_add_vlan_tag() which ensures that both the
		 * first mblk _and_ the first data bearing mblk possess the
		 * HCKSUM information. Consumers like IP will end up discarding
		 * the ether_header mblk, so for now, it is important that the
		 * data be available in both places.
		 */
		mac_hcksum_clone(mp, skipped_hdr);
		mp = skipped_hdr;
	}
	return (mp);

bail:
	/* Reattach any detached Ethernet header so we drop the whole msg. */
	if (skipped_hdr != NULL) {
		ASSERT3P(skipped_hdr->b_cont, ==, mp);
		mp = skipped_hdr;
	}

	mac_drop_pkt(mp, err);
	return (NULL);
}
479 
480 /*
481  * Build a single data segment from an LSO packet. The mblk chain
482  * returned, seg_head, represents the data segment and is always
483  * exactly seg_len bytes long. The lso_mp and offset input/output
484  * parameters track our position in the LSO packet. This function
485  * exists solely as a helper to mac_sw_lso().
486  *
487  * Case A
488  *
489  *     The current lso_mp is larger than the requested seg_len. The
490  *     beginning of seg_head may start at the beginning of lso_mp or
491  *     offset into it. In either case, a single mblk is returned, and
492  *     *offset is updated to reflect our new position in the current
493  *     lso_mp.
494  *
495  *          +----------------------------+
496  *          |  in *lso_mp / out *lso_mp  |
497  *          +----------------------------+
498  *          ^                        ^
499  *          |                        |
500  *          |                        |
501  *          |                        |
502  *          +------------------------+
503  *          |        seg_head        |
504  *          +------------------------+
505  *          ^                        ^
506  *          |                        |
507  *   in *offset = 0        out *offset = seg_len
508  *
509  *          |------   seg_len    ----|
510  *
511  *
512  *       +------------------------------+
513  *       |   in *lso_mp / out *lso_mp   |
514  *       +------------------------------+
515  *          ^                        ^
516  *          |                        |
517  *          |                        |
518  *          |                        |
519  *          +------------------------+
520  *          |        seg_head        |
521  *          +------------------------+
522  *          ^                        ^
523  *          |                        |
524  *   in *offset = N        out *offset = N + seg_len
525  *
526  *          |------   seg_len    ----|
527  *
528  *
529  *
530  * Case B
531  *
532  *    The requested seg_len consumes exactly the rest of the lso_mp.
533  *    I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr.
534  *    The seg_head may start at the beginning of the lso_mp or at some
535  *    offset into it. In either case we return a single mblk, reset
536  *    *offset to zero, and walk to the next lso_mp.
537  *
538  *          +------------------------+           +------------------------+
539  *          |       in *lso_mp       |---------->|      out *lso_mp       |
540  *          +------------------------+           +------------------------+
541  *          ^                        ^           ^
542  *          |                        |           |
543  *          |                        |    out *offset = 0
544  *          |                        |
545  *          +------------------------+
546  *          |        seg_head        |
547  *          +------------------------+
548  *          ^
549  *          |
550  *   in *offset = 0
551  *
552  *          |------   seg_len    ----|
553  *
554  *
555  *
556  *      +----------------------------+           +------------------------+
557  *      |         in *lso_mp         |---------->|      out *lso_mp       |
558  *      +----------------------------+           +------------------------+
559  *          ^                        ^           ^
560  *          |                        |           |
561  *          |                        |    out *offset = 0
562  *          |                        |
563  *          +------------------------+
564  *          |        seg_head        |
565  *          +------------------------+
566  *          ^
567  *          |
568  *   in *offset = N
569  *
570  *          |------   seg_len    ----|
571  *
572  *
573  * Case C
574  *
575  *    The requested seg_len is greater than the current lso_mp. In
576  *    this case we must consume LSO mblks until we have enough data to
577  *    satisfy either case (A) or (B) above. We will return multiple
578  *    mblks linked via b_cont, offset will be set based on the cases
579  *    above, and lso_mp will walk forward at least one mblk, but maybe
580  *    more.
581  *
 582  *    N.B. This diagram is not exhaustive. The seg_head may start on
583  *    the beginning of an lso_mp. The seg_tail may end exactly on the
584  *    boundary of an lso_mp. And there may be two (in this case the
585  *    middle block wouldn't exist), three, or more mblks in the
586  *    seg_head chain. This is meant as one example of what might
587  *    happen. The main thing to remember is that the seg_tail mblk
588  *    must be one of case (A) or (B) above.
589  *
590  *  +------------------+    +----------------+    +------------------+
591  *  |    in *lso_mp    |--->|    *lso_mp     |--->|   out *lso_mp    |
592  *  +------------------+    +----------------+    +------------------+
593  *        ^            ^    ^                ^    ^            ^
594  *        |            |    |                |    |            |
595  *        |            |    |                |    |            |
596  *        |            |    |                |    |            |
597  *        |            |    |                |    |            |
598  *        +------------+    +----------------+    +------------+
599  *        |  seg_head  |--->|                |--->|  seg_tail  |
600  *        +------------+    +----------------+    +------------+
601  *        ^                                                    ^
602  *        |                                                    |
603  *  in *offset = N                          out *offset = MBLKL(seg_tail)
604  *
605  *        |-------------------   seg_len    -------------------|
606  *
607  */
static mblk_t *
build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len)
{
	mblk_t *seg_head, *seg_tail, *seg_mp;

	ASSERT3P(*lso_mp, !=, NULL);
	/* The current offset must fall strictly inside the current lso_mp. */
	ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr);

	/*
	 * dupb() shares the underlying data block, so the segment
	 * references the LSO packet's data rather than copying it.
	 */
	seg_mp = dupb(*lso_mp);
	if (seg_mp == NULL)
		return (NULL);

	seg_head = seg_mp;
	seg_tail = seg_mp;

	/* Continue where we left off from in the lso_mp. */
	seg_mp->b_rptr += *offset;

last_mblk:
	/* Case (A): this mblk satisfies seg_len with data left over. */
	if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) {
		*offset += seg_len;
		seg_mp->b_wptr = seg_mp->b_rptr + seg_len;
		return (seg_head);
	}

	/* Case (B): seg_len consumes exactly the rest of this mblk. */
	if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) {
		*offset = 0;
		*lso_mp = (*lso_mp)->b_cont;
		return (seg_head);
	}

	/* Case (C): seg_len spans beyond this mblk. */
	ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr);

	/*
	 * The current LSO mblk doesn't have enough data to satisfy
	 * seg_len -- continue peeling off LSO mblks to build the new
	 * segment message. If allocation fails we free the previously
	 * allocated segment mblks and return NULL.
	 */
	while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) {
		ASSERT3U(MBLKL(seg_mp), <=, seg_len);
		seg_len -= MBLKL(seg_mp);
		*offset = 0;
		*lso_mp = (*lso_mp)->b_cont;
		seg_mp = dupb(*lso_mp);

		if (seg_mp == NULL) {
			freemsgchain(seg_head);
			return (NULL);
		}

		seg_tail->b_cont = seg_mp;
		seg_tail = seg_mp;
	}

	/*
	 * We've walked enough LSO mblks that we can now satisfy the
	 * remaining seg_len. At this point we need to jump back to
	 * determine if we have arrived at case (A) or (B).
	 */

	/* Just to be paranoid that we didn't underflow. */
	ASSERT3U(seg_len, <, IP_MAXPACKET);
	ASSERT3U(seg_len, >, 0);
	goto last_mblk;
}
677 
678 /*
679  * Perform software segmentation of a single LSO message. Take an LSO
680  * message as input and return head/tail pointers as output. This
681  * function should not be invoked directly but instead through
682  * mac_hw_emul().
683  *
684  * The resulting chain is comprised of multiple (nsegs) MSS sized
685  * segments. Each segment will consist of two or more mblks joined by
686  * b_cont: a header and one or more data mblks. The header mblk is
687  * allocated anew for each message. The first segment's header is used
688  * as a template for the rest with adjustments made for things such as
689  * ID, sequence, length, TCP flags, etc. The data mblks reference into
690  * the existing LSO mblk (passed in as omp) by way of dupb(). Their
691  * b_rptr/b_wptr values are adjusted to reference only the fraction of
692  * the LSO message they are responsible for. At the successful
693  * completion of this function the original mblk (omp) is freed,
694  * leaving the newely created segment chain as the only remaining
695  * reference to the data.
696  */
697 static void
mac_sw_lso(mblk_t * omp,mac_emul_t emul,mblk_t ** head,mblk_t ** tail,uint_t * count)698 mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail,
699     uint_t *count)
700 {
701 	uint32_t ocsum_flags, ocsum_start, ocsum_stuff;
702 	uint32_t mss;
703 	uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen;
704 	uint32_t odatalen, oleft;
705 	uint_t nsegs, seg;
706 	int len;
707 
708 	const void *oiph;
709 	const tcph_t *otcph;
710 	ipha_t *niph;
711 	tcph_t *ntcph;
712 	uint16_t ip_id;
713 	uint32_t tcp_seq, tcp_sum, otcp_sum;
714 
715 	boolean_t is_v6 = B_FALSE;
716 	ip6_t *niph6;
717 
718 	uint32_t offset = 0;
719 	mblk_t *odatamp;
720 	mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp;
721 	mblk_t *tmptail;
722 
723 	mac_ether_offload_info_t meoi = { 0 };
724 
725 	ASSERT3P(head, !=, NULL);
726 	ASSERT3P(tail, !=, NULL);
727 	ASSERT3P(count, !=, NULL);
728 	ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0);
729 
730 	/* Assume we are dealing with a single LSO message. */
731 	ASSERT3P(omp->b_next, ==, NULL);
732 
733 	mac_ether_offload_info(omp, &meoi);
734 	opktlen = meoi.meoi_len;
735 	oehlen = meoi.meoi_l2hlen;
736 	oiphlen = meoi.meoi_l3hlen;
737 	otcphlen = meoi.meoi_l4hlen;
738 	ohdrslen = oehlen + oiphlen + otcphlen;
739 
740 	/* Performing LSO requires that we successfully read fully up to L4 */
741 	if ((MEOI_L4INFO_SET & meoi.meoi_flags) == 0) {
742 		mac_drop_pkt(omp, "unable to fully parse packet to L4");
743 		goto fail;
744 	}
745 
746 	if (meoi.meoi_l3proto != ETHERTYPE_IP &&
747 	    meoi.meoi_l3proto != ETHERTYPE_IPV6) {
748 		mac_drop_pkt(omp, "LSO'd packet has non-IP L3 header: %x",
749 		    meoi.meoi_l3proto);
750 		goto fail;
751 	}
752 
753 	if (meoi.meoi_l4proto != IPPROTO_TCP) {
754 		mac_drop_pkt(omp, "LSO unsupported protocol: %x",
755 		    meoi.meoi_l4proto);
756 		goto fail;
757 	}
758 
759 	is_v6 = meoi.meoi_l3proto == ETHERTYPE_IPV6;
760 
761 	mss = DB_LSOMSS(omp);
762 	if (mss == 0) {
763 		mac_drop_pkt(omp, "packet misconfigured for LSO (MSS == 0)");
764 		goto fail;
765 	}
766 	ASSERT3U(opktlen, <=, IP_MAXPACKET + oehlen);
767 
768 	/*
769 	 * Ensure the headers are contiguous. The IP header is used only for the
770 	 * benefit of DTrace SDTs, whereas the TCP header is actively read.
771 	 * This small pullup should only practically happen when
772 	 * mac_add_vlan_tag is in play, which prepends a new mblk in front
773 	 * containing the amended Ethernet header.
774 	 */
775 	if (MBLKL(omp) < ohdrslen) {
776 		mblk_t *tmp = msgpullup(omp, ohdrslen);
777 
778 		if (tmp == NULL) {
779 			mac_drop_pkt(omp, "failed to pull up");
780 			goto fail;
781 		}
782 
783 		mac_hcksum_clone(omp, tmp);
784 		freemsg(omp);
785 		omp = tmp;
786 	}
787 
788 	oiph = (void *)(omp->b_rptr + oehlen);
789 	otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen);
790 
791 	if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) {
792 		mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set");
793 		goto fail;
794 	}
795 
796 	len = MBLKL(omp);
797 
798 	/*
799 	 * Either we have data in the first mblk or it's just the
800 	 * header. In either case, we need to set rptr to the start of
801 	 * the TCP data.
802 	 */
803 	if (len > ohdrslen) {
804 		odatamp = omp;
805 		offset = ohdrslen;
806 	} else {
807 		ASSERT3U(len, ==, ohdrslen);
808 		odatamp = omp->b_cont;
809 		offset = 0;
810 	}
811 
812 	/* Make sure we still have enough data. */
813 	odatalen = opktlen - ohdrslen;
814 	ASSERT3U(msgsize(odatamp), >=, odatalen);
815 
816 	/*
817 	 * If a MAC negotiated LSO then it must negotiate both
818 	 * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or
819 	 * HCKSUM_INET_PARTIAL; because both the IP and TCP headers
820 	 * change during LSO segmentation (only the 3 fields of the
821 	 * pseudo header checksum don't change: src, dst, proto). Thus
822 	 * we would expect these flags (HCK_IPV4_HDRCKSUM |
823 	 * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this
824 	 * function to emulate those checksums in software. However,
825 	 * that assumes a world where we only expose LSO if the
826 	 * underlying hardware exposes LSO. Moving forward the plan is
827 	 * to assume LSO in the upper layers and have MAC perform
828 	 * software LSO when the underlying provider doesn't support
829 	 * it. In such a world, if the provider doesn't support LSO
830 	 * but does support hardware checksum offload, then we could
831 	 * simply perform the segmentation and allow the hardware to
832 	 * calculate the checksums. To the hardware it's just another
833 	 * chain of non-LSO packets.
834 	 */
835 	ASSERT3S(DB_TYPE(omp), ==, M_DATA);
836 	ocsum_flags = DB_CKSUMFLAGS(omp);
837 	ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0);
838 
839 	/*
840 	 * If hardware only provides partial checksum then software
841 	 * must supply the pseudo-header checksum. In the case of LSO
842 	 * we leave the TCP length at zero to be filled in by
843 	 * hardware. This function must handle two scenarios.
844 	 *
845 	 * 1. Being called by a MAC client on the Rx path to segment
846 	 *    an LSO packet and calculate the checksum.
847 	 *
848 	 * 2. Being called by a MAC provider to segment an LSO packet.
849 	 *    In this case the LSO segmentation is performed in
850 	 *    software (by this routine) but the MAC provider should
851 	 *    still calculate the TCP/IP checksums in hardware.
852 	 *
853 	 *  To elaborate on the second case: we cannot have the
854 	 *  scenario where IP sends LSO packets but the underlying HW
855 	 *  doesn't support checksum offload -- because in that case
856 	 *  TCP/IP would calculate the checksum in software (for the
857 	 *  LSO packet) but then MAC would segment the packet and have
858 	 *  to redo all the checksum work. So IP should never do LSO
859 	 *  if HW doesn't support both IP and TCP checksum.
860 	 */
861 	if (ocsum_flags & HCK_PARTIALCKSUM) {
862 		ocsum_start = (uint32_t)DB_CKSUMSTART(omp);
863 		ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp);
864 	}
865 
866 	/*
867 	 * Subtract one to account for the case where the data length
 868  * is evenly divisible by the MSS. Add one to account for the
869 	 * fact that the division will always result in one less
870 	 * segment than needed.
871 	 */
872 	nsegs = ((odatalen - 1) / mss) + 1;
873 	if (nsegs < 2) {
874 		mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs);
875 		goto fail;
876 	}
877 
878 	DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph,
879 	    __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss,
880 	    uint_t, nsegs);
881 
882 	seg_chain = NULL;
883 	tmptail = seg_chain;
884 	oleft = odatalen;
885 
886 	for (uint_t i = 0; i < nsegs; i++) {
887 		boolean_t last_seg = ((i + 1) == nsegs);
888 		uint32_t seg_len;
889 
890 		/*
891 		 * If we fail to allocate, then drop the partially
892 		 * allocated chain as well as the LSO packet. Let the
893 		 * sender deal with the fallout.
894 		 */
895 		if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) {
896 			freemsgchain(seg_chain);
897 			mac_drop_pkt(omp, "failed to alloc segment header");
898 			goto fail;
899 		}
900 		ASSERT3P(nhdrmp->b_cont, ==, NULL);
901 
902 		/* Copy over the header stack. */
903 		bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen);
904 		nhdrmp->b_wptr += ohdrslen;
905 
906 		if (seg_chain == NULL) {
907 			seg_chain = nhdrmp;
908 		} else {
909 			ASSERT3P(tmptail, !=, NULL);
910 			tmptail->b_next = nhdrmp;
911 		}
912 
913 		tmptail = nhdrmp;
914 
915 		/*
916 		 * Calculate this segment's length. It's either the MSS
917 		 * or whatever remains for the last segment.
918 		 */
919 		seg_len = last_seg ? oleft : mss;
920 		ASSERT3U(seg_len, <=, mss);
921 		ndatamp = build_data_seg(&odatamp, &offset, seg_len);
922 
923 		if (ndatamp == NULL) {
924 			freemsgchain(seg_chain);
925 			mac_drop_pkt(omp, "LSO failed to segment data");
926 			goto fail;
927 		}
928 
929 		/* Attach data mblk to header mblk. */
930 		nhdrmp->b_cont = ndatamp;
931 		DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO;
932 		ASSERT3U(seg_len, <=, oleft);
933 		oleft -= seg_len;
934 
935 		/* Setup partial checksum offsets. */
936 		if (ocsum_flags & HCK_PARTIALCKSUM) {
937 			DB_CKSUMSTART(nhdrmp) = ocsum_start;
938 			DB_CKSUMEND(nhdrmp) = oiphlen + otcphlen + seg_len;
939 			DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
940 		}
941 	}
942 
943 	/* We should have consumed entire LSO msg. */
944 	ASSERT3S(oleft, ==, 0);
945 	ASSERT3P(odatamp, ==, NULL);
946 
947 	/*
948 	 * All seg data mblks are referenced by the header mblks, null
949 	 * out this pointer to catch any bad derefs.
950 	 */
951 	ndatamp = NULL;
952 
953 	/*
954 	 * Set headers and checksum for first segment.
955 	 */
956 	nhdrmp = seg_chain;
957 	ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);
958 
959 	if (is_v6) {
960 		niph6 = (ip6_t *)(nhdrmp->b_rptr + oehlen);
961 		niph6->ip6_plen = htons(
962 		    (oiphlen - IPV6_HDR_LEN) + otcphlen + mss);
963 	} else {
964 		niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
965 		niph->ipha_length = htons(oiphlen + otcphlen + mss);
966 		/*
967 		 * If the v4 checksum was filled, we won't have a v4 offload
968 		 * flag. We can't write zero checksums without inserting said
969 		 * flag, but our output frames won't necessarily be rechecked by
970 		 * the caller! As a compromise, we need to force emulation to
971 		 * uphold the same contracts the packet already agreed to.
972 		 */
973 		if (niph->ipha_hdr_checksum != 0) {
974 			emul |= MAC_IPCKSUM_EMUL;
975 			ocsum_flags |= HCK_IPV4_HDRCKSUM;
976 		}
977 		niph->ipha_hdr_checksum = 0;
978 		ip_id = ntohs(niph->ipha_ident);
979 	}
980 
981 	ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
982 	tcp_seq = BE32_TO_U32(ntcph->th_seq);
983 	tcp_seq += mss;
984 
985 	/*
986 	 * The first segment shouldn't:
987 	 *
988 	 *	o indicate end of data transmission (FIN),
989 	 *	o indicate immediate handling of the data (PUSH).
990 	 */
991 	ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
992 	DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
993 
994 	/*
995 	 * If the underlying HW provides partial checksum, then make
996 	 * sure to correct the pseudo header checksum before calling
997 	 * mac_sw_cksum(). The native TCP stack doesn't include the
998 	 * length field in the pseudo header when LSO is in play -- so
999 	 * we need to calculate it here.
1000 	 */
1001 	if (ocsum_flags & HCK_PARTIALCKSUM) {
1002 		tcp_sum = BE16_TO_U16(ntcph->th_sum);
1003 		otcp_sum = tcp_sum;
1004 		tcp_sum += mss + otcphlen;
1005 		tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1006 		U16_TO_BE16(tcp_sum, ntcph->th_sum);
1007 	}
1008 
1009 	if ((ocsum_flags & HCK_TX_FLAGS) && (emul & MAC_HWCKSUM_EMULS)) {
1010 		next_nhdrmp = nhdrmp->b_next;
1011 		nhdrmp->b_next = NULL;
1012 		nhdrmp = mac_sw_cksum(nhdrmp, emul);
1013 		/*
1014 		 * The mblk could be replaced (via pull-up) or freed (due to
1015 		 * failure) during mac_sw_cksum(), so we must take care with the
1016 		 * result here.
1017 		 */
1018 		if (nhdrmp != NULL) {
1019 			nhdrmp->b_next = next_nhdrmp;
1020 			next_nhdrmp = NULL;
1021 			seg_chain = nhdrmp;
1022 		} else {
1023 			freemsgchain(next_nhdrmp);
1024 			/*
1025 			 * nhdrmp referenced the head of seg_chain when it was
1026 			 * freed, so further clean-up there is unnecessary
1027 			 */
1028 			seg_chain = NULL;
1029 			mac_drop_pkt(omp, "LSO cksum emulation failed");
1030 			goto fail;
1031 		}
1032 	}
1033 
1034 	ASSERT3P(nhdrmp, !=, NULL);
1035 
1036 	seg = 1;
1037 	DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1038 	    (is_v6 ? (void *)niph6 : (void *)niph),
1039 	    __dtrace_tcp_tcph_t *, ntcph, uint_t, mss, int_t, seg);
1040 	seg++;
1041 
1042 	/* There better be at least 2 segs. */
1043 	ASSERT3P(nhdrmp->b_next, !=, NULL);
1044 	prev_nhdrmp = nhdrmp;
1045 	nhdrmp = nhdrmp->b_next;
1046 
1047 	/*
1048 	 * Now adjust the headers of the middle segments. For each
1049 	 * header we need to adjust the following.
1050 	 *
1051 	 *	o IP ID
1052 	 *	o IP length
1053 	 *	o TCP sequence
1054 	 *	o TCP flags
1055 	 *	o cksum flags
1056 	 *	o cksum values (if MAC_HWCKSUM_EMUL is set)
1057 	 */
1058 	for (; seg < nsegs; seg++) {
1059 		/*
1060 		 * We use seg_chain as a reference to the first seg
1061 		 * header mblk -- this first header is a template for
1062 		 * the rest of the segments. This copy will include
1063 		 * the now updated checksum values from the first
1064 		 * header. We must reset these checksum values to
1065 		 * their original to make sure we produce the correct
1066 		 * value.
1067 		 */
1068 		ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss);
1069 		if (is_v6) {
1070 			niph6 = (ip6_t *)(nhdrmp->b_rptr + oehlen);
1071 			niph6->ip6_plen = htons(
1072 			    (oiphlen - IPV6_HDR_LEN) + otcphlen + mss);
1073 		} else {
1074 			niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1075 			niph->ipha_ident = htons(++ip_id);
1076 			niph->ipha_length = htons(oiphlen + otcphlen + mss);
1077 			niph->ipha_hdr_checksum = 0;
1078 		}
1079 		ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1080 		U32_TO_BE32(tcp_seq, ntcph->th_seq);
1081 		tcp_seq += mss;
1082 		/*
1083 		 * Just like the first segment, the middle segments
1084 		 * shouldn't have these flags set.
1085 		 */
1086 		ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
1087 		DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1088 
1089 		/*
1090 		 * First and middle segs have same
1091 		 * pseudo-header checksum.
1092 		 */
1093 		if (ocsum_flags & HCK_PARTIALCKSUM)
1094 			U16_TO_BE16(tcp_sum, ntcph->th_sum);
1095 
1096 		if ((ocsum_flags & HCK_TX_FLAGS) &&
1097 		    (emul & MAC_HWCKSUM_EMULS)) {
1098 			next_nhdrmp = nhdrmp->b_next;
1099 			nhdrmp->b_next = NULL;
1100 			nhdrmp = mac_sw_cksum(nhdrmp, emul);
1101 			/*
1102 			 * Like above, handle cases where mac_sw_cksum() does a
1103 			 * pull-up or drop of the mblk.
1104 			 */
1105 			if (nhdrmp != NULL) {
1106 				nhdrmp->b_next = next_nhdrmp;
1107 				next_nhdrmp = NULL;
1108 				prev_nhdrmp->b_next = nhdrmp;
1109 			} else {
1110 				freemsgchain(next_nhdrmp);
1111 				/*
1112 				 * Critical to de-link the now-freed nhdrmp
1113 				 * before freeing the rest of the preceding
1114 				 * chain.
1115 				 */
1116 				prev_nhdrmp->b_next = NULL;
1117 				freemsgchain(seg_chain);
1118 				seg_chain = NULL;
1119 				mac_drop_pkt(omp, "LSO cksum emulation failed");
1120 				goto fail;
1121 			}
1122 		}
1123 
1124 		DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1125 		    (is_v6 ? (void *)niph6 : (void *)niph),
1126 		    __dtrace_tcp_tcph_t *, ntcph, uint_t, mss, uint_t, seg);
1127 
1128 		ASSERT3P(nhdrmp->b_next, !=, NULL);
1129 		prev_nhdrmp = nhdrmp;
1130 		nhdrmp = nhdrmp->b_next;
1131 	}
1132 
1133 	/* Make sure we are on the last segment. */
1134 	ASSERT3U(seg, ==, nsegs);
1135 	ASSERT3P(nhdrmp->b_next, ==, NULL);
1136 
1137 	/*
1138 	 * Now we set the last segment header. The difference being
1139 	 * that FIN/PSH/RST flags are allowed.
1140 	 */
1141 	len = msgsize(nhdrmp->b_cont);
1142 	ASSERT3S(len, >, 0);
1143 	if (is_v6) {
1144 		niph6 = (ip6_t *)(nhdrmp->b_rptr + oehlen);
1145 		niph6->ip6_plen = htons(
1146 		    (oiphlen - IPV6_HDR_LEN) + otcphlen + len);
1147 	} else {
1148 		niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1149 		niph->ipha_ident = htons(++ip_id);
1150 		niph->ipha_length = htons(oiphlen + otcphlen + len);
1151 		niph->ipha_hdr_checksum = 0;
1152 	}
1153 	ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1154 	U32_TO_BE32(tcp_seq, ntcph->th_seq);
1155 
1156 	DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1157 	if (ocsum_flags & HCK_PARTIALCKSUM) {
1158 		tcp_sum = otcp_sum;
1159 		tcp_sum += len + otcphlen;
1160 		tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1161 		U16_TO_BE16(tcp_sum, ntcph->th_sum);
1162 	}
1163 
1164 	if ((ocsum_flags & HCK_TX_FLAGS) && (emul & MAC_HWCKSUM_EMULS)) {
1165 		/* This should be the last mblk. */
1166 		ASSERT3P(nhdrmp->b_next, ==, NULL);
1167 		nhdrmp = mac_sw_cksum(nhdrmp, emul);
1168 		/*
1169 		 * If the final mblk happens to be dropped as part of
1170 		 * mac_sw_cksum(), that is unfortunate, but it need not be a
1171 		 * show-stopper at this point.  We can just pretend that final
1172 		 * packet was dropped in transit.
1173 		 */
1174 		prev_nhdrmp->b_next = nhdrmp;
1175 	}
1176 
1177 	DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1178 	    (is_v6 ? (void *)niph6 : (void *)niph),
1179 	    __dtrace_tcp_tcph_t *, ntcph, uint_t, len, uint_t, seg);
1180 
1181 	/*
1182 	 * Free the reference to the original LSO message as it is
	 * being replaced by seg_chain.
1184 	 */
1185 	freemsg(omp);
1186 	*head = seg_chain;
1187 	*tail = nhdrmp;
1188 	*count = nsegs;
1189 	return;
1190 
1191 fail:
1192 	*head = NULL;
1193 	*tail = NULL;
1194 	*count = 0;
1195 }
1196 
1197 #define	HCK_NEEDED	(HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM)
1198 
1199 /*
1200  * Emulate various hardware offload features in software. Take a chain
1201  * of packets as input and emulate the hardware features specified in
1202  * 'emul'. The resulting chain's head pointer replaces the 'mp_chain'
1203  * pointer given as input, and its tail pointer is written to
1204  * '*otail'. The number of packets in the new chain is written to
1205  * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus
1206  * may be NULL. The 'mp_chain' argument may point to a NULL chain; in
1207  * which case 'mp_chain' will simply stay a NULL chain.
1208  *
1209  * While unlikely, it is technically possible that this function could
1210  * receive a non-NULL chain as input and return a NULL chain as output
1211  * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be
1212  * zero). This could happen if all the packets in the chain are
1213  * dropped or if we fail to allocate new mblks. In this case, there is
1214  * nothing for the caller to free. In any event, the caller shouldn't
1215  * assume that '*mp_chain' is non-NULL on return.
1216  *
1217  * This function was written with three main use cases in mind.
1218  *
1219  * 1. To emulate hardware offloads when traveling mac-loopback (two
1220  *    clients on the same mac). This is wired up in mac_tx_send().
1221  *
1222  * 2. To provide hardware offloads to the client when the underlying
1223  *    provider cannot. This is currently wired up in mac_tx() but we
1224  *    still only negotiate offloads when the underlying provider
1225  *    supports them.
1226  *
1227  * 3. To emulate real hardware in simnet.
1228  */
void
mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul)
{
	mblk_t *head = NULL, *tail = NULL;	/* resulting chain */
	uint_t count = 0;			/* packets in resulting chain */

	/* Only the emulation bits we know how to handle may be set. */
	ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0);
	ASSERT3P(mp_chain, !=, NULL);

	for (mblk_t *mp = *mp_chain; mp != NULL; ) {
		mblk_t *tmp, *next, *tmphead, *tmptail;
		struct ether_header *ehp;
		uint32_t flags;
		uint_t len = MBLKL(mp), l2len;

		/* Perform LSO/cksum one message at a time. */
		next = mp->b_next;
		mp->b_next = NULL;

		/*
		 * For our sanity the first mblk should contain at
		 * least the full L2 header.
		 */
		if (len < sizeof (struct ether_header)) {
			mac_drop_pkt(mp, "packet too short (A): %u", len);
			mp = next;
			continue;
		}

		ehp = (struct ether_header *)mp->b_rptr;
		if (ntohs(ehp->ether_type) == VLAN_TPID)
			l2len = sizeof (struct ether_vlan_header);
		else
			l2len = sizeof (struct ether_header);

		/*
		 * If the first mblk is solely the L2 header, then
		 * there better be more data.
		 */
		if (len < l2len || (len == l2len && mp->b_cont == NULL)) {
			mac_drop_pkt(mp, "packet too short (C): %u", len);
			mp = next;
			continue;
		}

		DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul);

		/*
		 * We use DB_CKSUMFLAGS (instead of mac_hcksum_get())
		 * because we don't want to mask-out the LSO flag.
		 */
		flags = DB_CKSUMFLAGS(mp);

		if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) {
			uint_t tmpcount = 0;

			/*
			 * LSO fix-up handles checksum emulation
			 * inline (if requested). It also frees mp.
			 */
			mac_sw_lso(mp, emul, &tmphead, &tmptail,
			    &tmpcount);
			if (tmphead == NULL) {
				/* mac_sw_lso() freed the mp. */
				mp = next;
				continue;
			}
			/* One LSO packet may expand into many segments. */
			count += tmpcount;
		} else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) {
			tmp = mac_sw_cksum(mp, emul);
			if (tmp == NULL) {
				/* mac_sw_cksum() freed the mp. */
				mp = next;
				continue;
			}
			tmphead = tmp;
			tmptail = tmp;
			count++;
		} else {
			/* There is nothing to emulate. */
			tmp = mp;
			tmphead = tmp;
			tmptail = tmp;
			count++;
		}

		/*
		 * The tmp mblk chain is either the start of the new
		 * chain or added to the tail of the new chain.
		 */
		if (head == NULL) {
			head = tmphead;
			tail = tmptail;
		} else {
			/* Attach the new mblk to the end of the new chain. */
			tail->b_next = tmphead;
			tail = tmptail;
		}

		mp = next;
	}

	/* Hand the (possibly empty) rebuilt chain back to the caller. */
	*mp_chain = head;

	if (otail != NULL)
		*otail = tail;

	if (ocount != NULL)
		*ocount = count;
}
1339 
1340 /*
1341  * Add VLAN tag to the specified mblk.
1342  */
1343 mblk_t *
mac_add_vlan_tag(mblk_t * mp,uint_t pri,uint16_t vid)1344 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
1345 {
1346 	mblk_t *hmp;
1347 	struct ether_vlan_header *evhp;
1348 	struct ether_header *ehp;
1349 
1350 	ASSERT(pri != 0 || vid != 0);
1351 
1352 	/*
1353 	 * Allocate an mblk for the new tagged ethernet header,
1354 	 * and copy the MAC addresses and ethertype from the
1355 	 * original header.
1356 	 */
1357 
1358 	hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
1359 	if (hmp == NULL) {
1360 		freemsg(mp);
1361 		return (NULL);
1362 	}
1363 
1364 	evhp = (struct ether_vlan_header *)hmp->b_rptr;
1365 	ehp = (struct ether_header *)mp->b_rptr;
1366 
1367 	bcopy(ehp, evhp, (ETHERADDRL * 2));
1368 	evhp->ether_type = ehp->ether_type;
1369 	evhp->ether_tpid = htons(ETHERTYPE_VLAN);
1370 
1371 	hmp->b_wptr += sizeof (struct ether_vlan_header);
1372 	mp->b_rptr += sizeof (struct ether_header);
1373 
1374 	/*
1375 	 * Free the original message if it's now empty. Link the
1376 	 * rest of messages to the header message.
1377 	 */
1378 	mac_hcksum_clone(mp, hmp);
1379 	if (MBLKL(mp) == 0) {
1380 		hmp->b_cont = mp->b_cont;
1381 		freeb(mp);
1382 	} else {
1383 		hmp->b_cont = mp;
1384 	}
1385 	ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
1386 
1387 	/*
1388 	 * Initialize the new TCI (Tag Control Information).
1389 	 */
1390 	evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
1391 
1392 	return (hmp);
1393 }
1394 
1395 /*
1396  * Adds a VLAN tag with the specified VID and priority to each mblk of
1397  * the specified chain.
1398  */
1399 mblk_t *
mac_add_vlan_tag_chain(mblk_t * mp_chain,uint_t pri,uint16_t vid)1400 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
1401 {
1402 	mblk_t *next_mp, **prev, *mp;
1403 
1404 	mp = mp_chain;
1405 	prev = &mp_chain;
1406 
1407 	while (mp != NULL) {
1408 		next_mp = mp->b_next;
1409 		mp->b_next = NULL;
1410 		if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
1411 			freemsgchain(next_mp);
1412 			break;
1413 		}
1414 		*prev = mp;
1415 		prev = &mp->b_next;
1416 		mp = mp->b_next = next_mp;
1417 	}
1418 
1419 	return (mp_chain);
1420 }
1421 
1422 /*
1423  * Strip VLAN tag
1424  */
1425 mblk_t *
mac_strip_vlan_tag(mblk_t * mp)1426 mac_strip_vlan_tag(mblk_t *mp)
1427 {
1428 	mblk_t *newmp;
1429 	struct ether_vlan_header *evhp;
1430 
1431 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1432 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
1433 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1434 
1435 		if (DB_REF(mp) > 1) {
1436 			newmp = copymsg(mp);
1437 			if (newmp == NULL)
1438 				return (NULL);
1439 			freemsg(mp);
1440 			mp = newmp;
1441 		}
1442 
1443 		evhp = (struct ether_vlan_header *)mp->b_rptr;
1444 
1445 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1446 		mp->b_rptr += VLAN_TAGSZ;
1447 	}
1448 	return (mp);
1449 }
1450 
1451 /*
1452  * Strip VLAN tag from each mblk of the chain.
1453  */
1454 mblk_t *
mac_strip_vlan_tag_chain(mblk_t * mp_chain)1455 mac_strip_vlan_tag_chain(mblk_t *mp_chain)
1456 {
1457 	mblk_t *mp, *next_mp, **prev;
1458 
1459 	mp = mp_chain;
1460 	prev = &mp_chain;
1461 
1462 	while (mp != NULL) {
1463 		next_mp = mp->b_next;
1464 		mp->b_next = NULL;
1465 		if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
1466 			freemsgchain(next_mp);
1467 			break;
1468 		}
1469 		*prev = mp;
1470 		prev = &mp->b_next;
1471 		mp = mp->b_next = next_mp;
1472 	}
1473 
1474 	return (mp_chain);
1475 }
1476 
1477 /*
1478  * Default callback function. Used when the datapath is not yet initialized.
1479  */
1480 /* ARGSUSED */
void
mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain,
    boolean_t loopback)
{
	/* No consumer is attached yet; simply discard the chain. */
	freemsgchain(mp_chain);
}
1487 
1488 /*
1489  * Determines the IPv6 header length accounting for all the optional IPv6
1490  * headers (hop-by-hop, destination, routing and fragment). The header length
1491  * and next header value (a transport header) is captured.
1492  *
1493  * Returns B_FALSE if all the IP headers are not in the same mblk otherwise
1494  * returns B_TRUE.
1495  */
boolean_t
mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
    uint8_t *next_hdr, ip6_frag_t **fragp)
{
	uint16_t length;	/* running total of IP header bytes */
	uint_t	ehdrlen;	/* size of the current extension header */
	uint8_t *whereptr;	/* cursor into the header chain */
	uint8_t *nexthdrp;	/* last-seen next-header field */
	ip6_dest_t *desthdr;
	ip6_rthdr_t *rthdr;
	ip6_frag_t *fraghdr;

	/* The fixed IPv6 header must lie entirely before endptr. */
	if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
		return (B_FALSE);
	ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
	length = IPV6_HDR_LEN;
	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */

	if (fragp != NULL)
		*fragp = NULL;

	nexthdrp = &ip6h->ip6_nxt;
	while (whereptr < endptr) {
		/* Is there enough left for len + nexthdr? */
		if (whereptr + MIN_EHDR_LEN > endptr)
			break;

		switch (*nexthdrp) {
		case IPPROTO_HOPOPTS:
		case IPPROTO_DSTOPTS:
			/* Assumes the headers are identical for hbh and dst */
			desthdr = (ip6_dest_t *)whereptr;
			/* ip6d_len counts 8-byte units beyond the first. */
			ehdrlen = 8 * (desthdr->ip6d_len + 1);
			if ((uchar_t *)desthdr +  ehdrlen > endptr)
				return (B_FALSE);
			nexthdrp = &desthdr->ip6d_nxt;
			break;
		case IPPROTO_ROUTING:
			rthdr = (ip6_rthdr_t *)whereptr;
			ehdrlen =  8 * (rthdr->ip6r_len + 1);
			if ((uchar_t *)rthdr +  ehdrlen > endptr)
				return (B_FALSE);
			nexthdrp = &rthdr->ip6r_nxt;
			break;
		case IPPROTO_FRAGMENT:
			/* Fragment headers have a fixed size. */
			fraghdr = (ip6_frag_t *)whereptr;
			ehdrlen = sizeof (ip6_frag_t);
			if ((uchar_t *)&fraghdr[1] > endptr)
				return (B_FALSE);
			nexthdrp = &fraghdr->ip6f_nxt;
			if (fragp != NULL)
				*fragp = fraghdr;
			break;
		case IPPROTO_NONE:
			/* No next header means we're finished */
		default:
			/* A transport (or unknown) header: we're done. */
			*hdr_length = length;
			*next_hdr = *nexthdrp;
			return (B_TRUE);
		}
		length += ehdrlen;
		whereptr += ehdrlen;
		*hdr_length = length;
		*next_hdr = *nexthdrp;
	}
	switch (*nexthdrp) {
	case IPPROTO_HOPOPTS:
	case IPPROTO_DSTOPTS:
	case IPPROTO_ROUTING:
	case IPPROTO_FRAGMENT:
		/*
		 * If any known extension headers are still to be processed,
		 * the packet's malformed (or at least all the IP header(s) are
		 * not in the same mblk - and that should never happen.
		 */
		return (B_FALSE);

	default:
		/*
		 * If we get here, we know that all of the IP headers were in
		 * the same mblk, even if the ULP header is in the next mblk.
		 */
		*hdr_length = length;
		*next_hdr = *nexthdrp;
		return (B_TRUE);
	}
}
1583 
1584 /*
1585  * The following set of routines are there to take care of interrupt
1586  * re-targeting for legacy (fixed) interrupts. Some older versions
1587  * of the popular NICs like e1000g do not support MSI-X interrupts
1588  * and they reserve fixed interrupts for RX/TX rings. To re-target
1589  * these interrupts, PCITOOL ioctls need to be used.
1590  */
1591 typedef struct mac_dladm_intr {
1592 	int	ino;
1593 	int	cpu_id;
1594 	char	driver_path[MAXPATHLEN];
1595 	char	nexus_path[MAXPATHLEN];
1596 } mac_dladm_intr_t;
1597 
1598 /* Bind the interrupt to cpu_num */
1599 static int
mac_set_intr(ldi_handle_t lh,processorid_t cpu_num,int oldcpuid,int ino)1600 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
1601 {
1602 	pcitool_intr_set_t	iset;
1603 	int			err;
1604 
1605 	iset.old_cpu = oldcpuid;
1606 	iset.ino = ino;
1607 	iset.cpu_id = cpu_num;
1608 	iset.user_version = PCITOOL_VERSION;
1609 	err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
1610 	    kcred, NULL);
1611 
1612 	return (err);
1613 }
1614 
1615 /*
1616  * Search interrupt information. iget is filled in with the info to search
1617  */
1618 static boolean_t
mac_search_intrinfo(pcitool_intr_get_t * iget_p,mac_dladm_intr_t * dln)1619 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
1620 {
1621 	int	i;
1622 	char	driver_path[2 * MAXPATHLEN];
1623 
1624 	for (i = 0; i < iget_p->num_devs; i++) {
1625 		(void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
1626 		(void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
1627 		    ":%s%d", iget_p->dev[i].driver_name,
1628 		    iget_p->dev[i].dev_inst);
1629 		/* Match the device path for the device path */
1630 		if (strcmp(driver_path, dln->driver_path) == 0) {
1631 			dln->ino = iget_p->ino;
1632 			dln->cpu_id = iget_p->cpu_id;
1633 			return (B_TRUE);
1634 		}
1635 	}
1636 	return (B_FALSE);
1637 }
1638 
1639 /*
1640  * Get information about ino, i.e. if this is the interrupt for our
1641  * device and where it is bound etc.
1642  */
1643 static boolean_t
mac_get_single_intr(ldi_handle_t lh,int oldcpuid,int ino,mac_dladm_intr_t * dln)1644 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
1645     mac_dladm_intr_t *dln)
1646 {
1647 	pcitool_intr_get_t	*iget_p;
1648 	int			ipsz;
1649 	int			nipsz;
1650 	int			err;
1651 	uint8_t			inum;
1652 
1653 	/*
1654 	 * Check if SLEEP is OK, i.e if could come here in response to
1655 	 * changing the fanout due to some callback from the driver, say
1656 	 * link speed changes.
1657 	 */
1658 	ipsz = PCITOOL_IGET_SIZE(0);
1659 	iget_p = kmem_zalloc(ipsz, KM_SLEEP);
1660 
1661 	iget_p->num_devs_ret = 0;
1662 	iget_p->user_version = PCITOOL_VERSION;
1663 	iget_p->cpu_id = oldcpuid;
1664 	iget_p->ino = ino;
1665 
1666 	err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
1667 	    FKIOCTL, kcred, NULL);
1668 	if (err != 0) {
1669 		kmem_free(iget_p, ipsz);
1670 		return (B_FALSE);
1671 	}
1672 	if (iget_p->num_devs == 0) {
1673 		kmem_free(iget_p, ipsz);
1674 		return (B_FALSE);
1675 	}
1676 	inum = iget_p->num_devs;
1677 	if (iget_p->num_devs_ret < iget_p->num_devs) {
1678 		/* Reallocate */
1679 		nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
1680 
1681 		kmem_free(iget_p, ipsz);
1682 		ipsz = nipsz;
1683 		iget_p = kmem_zalloc(ipsz, KM_SLEEP);
1684 
1685 		iget_p->num_devs_ret = inum;
1686 		iget_p->cpu_id = oldcpuid;
1687 		iget_p->ino = ino;
1688 		iget_p->user_version = PCITOOL_VERSION;
1689 		err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
1690 		    FKIOCTL, kcred, NULL);
1691 		if (err != 0) {
1692 			kmem_free(iget_p, ipsz);
1693 			return (B_FALSE);
1694 		}
1695 		/* defensive */
1696 		if (iget_p->num_devs != iget_p->num_devs_ret) {
1697 			kmem_free(iget_p, ipsz);
1698 			return (B_FALSE);
1699 		}
1700 	}
1701 
1702 	if (mac_search_intrinfo(iget_p, dln)) {
1703 		kmem_free(iget_p, ipsz);
1704 		return (B_TRUE);
1705 	}
1706 	kmem_free(iget_p, ipsz);
1707 	return (B_FALSE);
1708 }
1709 
1710 /*
1711  * Get the interrupts and check each one to see if it is for our device.
1712  */
1713 static int
mac_validate_intr(ldi_handle_t lh,mac_dladm_intr_t * dln,processorid_t cpuid)1714 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
1715 {
1716 	pcitool_intr_info_t	intr_info;
1717 	int			err;
1718 	int			ino;
1719 	int			oldcpuid;
1720 
1721 	err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
1722 	    FKIOCTL, kcred, NULL);
1723 	if (err != 0)
1724 		return (-1);
1725 
1726 	for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
1727 		for (ino = 0; ino < intr_info.num_intr; ino++) {
1728 			if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
1729 				if (dln->cpu_id == cpuid)
1730 					return (0);
1731 				return (1);
1732 			}
1733 		}
1734 	}
1735 	return (-1);
1736 }
1737 
1738 /*
1739  * Obtain the nexus parent node info. for mdip.
1740  */
static dev_info_t *
mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
{
	struct dev_info		*tdip = (struct dev_info *)mdip;
	struct ddi_minor_data	*minordata;
	dev_info_t		*pdip;
	char			pathname[MAXPATHLEN];

	/*
	 * Walk up from mdip toward the root looking for an ancestor
	 * that exposes an interrupt-control (DDI_NT_INTRCTL) minor
	 * node; record its "/devices<path>:intr" path in dln.
	 */
	while (tdip != NULL) {
		/*
		 * The netboot code could call this function while walking the
		 * device tree so we need to use ndi_devi_tryenter() here to
		 * avoid deadlock.
		 */
		if (ndi_devi_tryenter((dev_info_t *)tdip) == 0)
			break;

		for (minordata = tdip->devi_minor; minordata != NULL;
		    minordata = minordata->next) {
			if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
			    strlen(DDI_NT_INTRCTL)) == 0) {
				pdip = minordata->dip;
				(void) ddi_pathname(pdip, pathname);
				(void) snprintf(dln->nexus_path, MAXPATHLEN,
				    "/devices%s:intr", pathname);
				(void) ddi_pathname_minor(minordata, pathname);
				/* Drop the hold before returning the nexus. */
				ndi_devi_exit((dev_info_t *)tdip);
				return (pdip);
			}
		}
		ndi_devi_exit((dev_info_t *)tdip);
		tdip = tdip->devi_parent;
	}
	/* No INTRCTL-capable nexus found above mdip. */
	return (NULL);
}
1776 
1777 /*
1778  * For a primary MAC client, if the user has set a list or CPUs or
1779  * we have obtained it implicitly, we try to retarget the interrupt
1780  * for that device on one of the CPUs in the list.
1781  * We assign the interrupt to the same CPU as the poll thread.
1782  */
1783 static boolean_t
mac_check_interrupt_binding(dev_info_t * mdip,int32_t cpuid)1784 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
1785 {
1786 	ldi_handle_t		lh = NULL;
1787 	ldi_ident_t		li = NULL;
1788 	int			err;
1789 	int			ret;
1790 	mac_dladm_intr_t	dln;
1791 	dev_info_t		*dip;
1792 	struct ddi_minor_data	*minordata;
1793 
1794 	dln.nexus_path[0] = '\0';
1795 	dln.driver_path[0] = '\0';
1796 
1797 	minordata = ((struct dev_info *)mdip)->devi_minor;
1798 	while (minordata != NULL) {
1799 		if (minordata->type == DDM_MINOR)
1800 			break;
1801 		minordata = minordata->next;
1802 	}
1803 	if (minordata == NULL)
1804 		return (B_FALSE);
1805 
1806 	(void) ddi_pathname_minor(minordata, dln.driver_path);
1807 
1808 	dip = mac_get_nexus_node(mdip, &dln);
1809 	/* defensive */
1810 	if (dip == NULL)
1811 		return (B_FALSE);
1812 
1813 	err = ldi_ident_from_major(ddi_driver_major(dip), &li);
1814 	if (err != 0)
1815 		return (B_FALSE);
1816 
1817 	err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
1818 	if (err != 0)
1819 		return (B_FALSE);
1820 
1821 	ret = mac_validate_intr(lh, &dln, cpuid);
1822 	if (ret < 0) {
1823 		(void) ldi_close(lh, FREAD|FWRITE, kcred);
1824 		return (B_FALSE);
1825 	}
1826 	/* cmn_note? */
1827 	if (ret != 0)
1828 		if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
1829 		    != 0) {
1830 			(void) ldi_close(lh, FREAD|FWRITE, kcred);
1831 			return (B_FALSE);
1832 		}
1833 	(void) ldi_close(lh, FREAD|FWRITE, kcred);
1834 	return (B_TRUE);
1835 }
1836 
void
mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
{
	dev_info_t		*mdip = (dev_info_t *)arg;
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_resource_props_t	*mrp;
	mac_perim_handle_t	mph;
	flow_entry_t		*flent = mcip->mci_flent;
	mac_soft_ring_set_t	*rx_srs;
	mac_cpus_t		*srs_cpu;

	/*
	 * Try to retarget the device interrupt onto cpuid; if the
	 * PCITOOL-based rebind fails, record -1 (no bound CPU).
	 */
	if (!mac_check_interrupt_binding(mdip, cpuid))
		cpuid = -1;
	/* All updates below happen under the mac perimeter. */
	mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
	mrp = MCIP_RESOURCE_PROPS(mcip);
	mrp->mrp_rx_intr_cpu = cpuid;
	/*
	 * Mirror the binding into the second RX SRS's CPU state
	 * (presumably the hardware-ring SRS -- see
	 * mac_client_intr_cpu(), which uses the same index).
	 */
	if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
		rx_srs = flent->fe_rx_srs[1];
		srs_cpu = &rx_srs->srs_cpu;
		srs_cpu->mc_rx_intr_cpu = cpuid;
	}
	mac_perim_exit(mph);
}
1860 
int32_t
mac_client_intr_cpu(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_cpus_t		*srs_cpu;
	mac_soft_ring_set_t	*rx_srs;
	flow_entry_t		*flent = mcip->mci_flent;
	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
	mac_ring_t		*ring;
	mac_intr_t		*mintr;

	/*
	 * Check if we need to retarget the interrupt. We do this only
	 * for the primary MAC client. We do this if we have the only
	 * exclusive ring in the group.
	 */
	if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
		rx_srs = flent->fe_rx_srs[1];
		srs_cpu = &rx_srs->srs_cpu;
		ring = rx_srs->srs_ring;
		mintr = &ring->mr_info.mri_intr;
		/*
		 * If ddi_handle is present or the poll CPU is
		 * already bound to the interrupt CPU, return -1.
		 */
		if (mintr->mi_ddi_handle != NULL ||
		    ((mrp->mrp_ncpus != 0) &&
		    (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
			return (-1);
		}
		/* Retarget onto the CPU running the SRS poll thread. */
		return (srs_cpu->mc_rx_pollid);
	}
	/* -1 means no retargeting is needed/possible. */
	return (-1);
}
1895 
1896 void *
mac_get_devinfo(mac_handle_t mh)1897 mac_get_devinfo(mac_handle_t mh)
1898 {
1899 	mac_impl_t	*mip = (mac_impl_t *)mh;
1900 
1901 	return ((void *)mip->mi_dip);
1902 }
1903 
1904 #define	PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
1905 #define	PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
1906 #define	PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
1907 
1908 uint64_t
mac_pkt_hash(uint_t media,mblk_t * mp,uint8_t policy,boolean_t is_outbound)1909 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
1910 {
1911 	struct ether_header *ehp;
1912 	uint64_t hash = 0;
1913 	uint16_t sap;
1914 	uint_t skip_len;
1915 	uint8_t proto;
1916 	boolean_t ip_fragmented;
1917 
1918 	/*
1919 	 * We may want to have one of these per MAC type plugin in the
1920 	 * future. For now supports only ethernet.
1921 	 */
1922 	if (media != DL_ETHER)
1923 		return (0L);
1924 
1925 	/* for now we support only outbound packets */
1926 	ASSERT(is_outbound);
1927 	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
1928 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
1929 
1930 	/* compute L2 hash */
1931 
1932 	ehp = (struct ether_header *)mp->b_rptr;
1933 
1934 	if ((policy & MAC_PKT_HASH_L2) != 0) {
1935 		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
1936 		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
1937 		hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
1938 		policy &= ~MAC_PKT_HASH_L2;
1939 	}
1940 
1941 	if (policy == 0)
1942 		goto done;
1943 
1944 	/* skip ethernet header */
1945 
1946 	sap = ntohs(ehp->ether_type);
1947 	if (sap == ETHERTYPE_VLAN) {
1948 		struct ether_vlan_header *evhp;
1949 		mblk_t *newmp = NULL;
1950 
1951 		skip_len = sizeof (struct ether_vlan_header);
1952 		if (MBLKL(mp) < skip_len) {
1953 			/* the vlan tag is the payload, pull up first */
1954 			newmp = msgpullup(mp, -1);
1955 			if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
1956 				goto done;
1957 			}
1958 			evhp = (struct ether_vlan_header *)newmp->b_rptr;
1959 		} else {
1960 			evhp = (struct ether_vlan_header *)mp->b_rptr;
1961 		}
1962 
1963 		sap = ntohs(evhp->ether_type);
1964 		freemsg(newmp);
1965 	} else {
1966 		skip_len = sizeof (struct ether_header);
1967 	}
1968 
1969 	/* if ethernet header is in its own mblk, skip it */
1970 	if (MBLKL(mp) <= skip_len) {
1971 		skip_len -= MBLKL(mp);
1972 		mp = mp->b_cont;
1973 		if (mp == NULL)
1974 			goto done;
1975 	}
1976 
1977 	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
1978 
1979 	/* compute IP src/dst addresses hash and skip IPv{4,6} header */
1980 
1981 	switch (sap) {
1982 	case ETHERTYPE_IP: {
1983 		ipha_t *iphp;
1984 
1985 		/*
1986 		 * If the header is not aligned or the header doesn't fit
1987 		 * in the mblk, bail now. Note that this may cause packets
1988 		 * reordering.
1989 		 */
1990 		iphp = (ipha_t *)(mp->b_rptr + skip_len);
1991 		if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
1992 		    !OK_32PTR((char *)iphp))
1993 			goto done;
1994 
1995 		proto = iphp->ipha_protocol;
1996 		skip_len += IPH_HDR_LENGTH(iphp);
1997 
1998 		/* Check if the packet is fragmented. */
1999 		ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
2000 		    IPH_OFFSET;
2001 
2002 		/*
2003 		 * For fragmented packets, use addresses in addition to
2004 		 * the frag_id to generate the hash inorder to get
2005 		 * better distribution.
2006 		 */
2007 		if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
2008 			uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
2009 			uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
2010 
2011 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
2012 			    PKT_HASH_4BYTES(ip_dst));
2013 			policy &= ~MAC_PKT_HASH_L3;
2014 		}
2015 
2016 		if (ip_fragmented) {
2017 			uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
2018 			hash ^= PKT_HASH_2BYTES(identp);
2019 			goto done;
2020 		}
2021 		break;
2022 	}
2023 	case ETHERTYPE_IPV6: {
2024 		ip6_t *ip6hp;
2025 		ip6_frag_t *frag = NULL;
2026 		uint16_t hdr_length;
2027 
2028 		/*
2029 		 * If the header is not aligned or the header doesn't fit
2030 		 * in the mblk, bail now. Note that this may cause packets
2031 		 * reordering.
2032 		 */
2033 
2034 		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
2035 		if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
2036 		    !OK_32PTR((char *)ip6hp))
2037 			goto done;
2038 
2039 		if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
2040 		    &proto, &frag))
2041 			goto done;
2042 		skip_len += hdr_length;
2043 
2044 		/*
2045 		 * For fragmented packets, use addresses in addition to
2046 		 * the frag_id to generate the hash inorder to get
2047 		 * better distribution.
2048 		 */
2049 		if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
2050 			uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
2051 			uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
2052 
2053 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
2054 			    PKT_HASH_4BYTES(ip_dst));
2055 			policy &= ~MAC_PKT_HASH_L3;
2056 		}
2057 
2058 		if (frag != NULL) {
2059 			uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
2060 			hash ^= PKT_HASH_4BYTES(identp);
2061 			goto done;
2062 		}
2063 		break;
2064 	}
2065 	default:
2066 		goto done;
2067 	}
2068 
2069 	if (policy == 0)
2070 		goto done;
2071 
2072 	/* if ip header is in its own mblk, skip it */
2073 	if (MBLKL(mp) <= skip_len) {
2074 		skip_len -= MBLKL(mp);
2075 		mp = mp->b_cont;
2076 		if (mp == NULL)
2077 			goto done;
2078 	}
2079 
2080 	/* parse ULP header */
2081 again:
2082 	switch (proto) {
2083 	case IPPROTO_TCP:
2084 	case IPPROTO_UDP:
2085 	case IPPROTO_ESP:
2086 	case IPPROTO_SCTP:
2087 		/*
2088 		 * These Internet Protocols are intentionally designed
2089 		 * for hashing from the git-go.  Port numbers are in the first
2090 		 * word for transports, SPI is first for ESP.
2091 		 */
2092 		if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
2093 			goto done;
2094 		hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
2095 		break;
2096 
2097 	case IPPROTO_AH: {
2098 		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
2099 		uint_t ah_length = AH_TOTAL_LEN(ah);
2100 
2101 		if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
2102 			goto done;
2103 
2104 		proto = ah->ah_nexthdr;
2105 		skip_len += ah_length;
2106 
2107 		/* if AH header is in its own mblk, skip it */
2108 		if (MBLKL(mp) <= skip_len) {
2109 			skip_len -= MBLKL(mp);
2110 			mp = mp->b_cont;
2111 			if (mp == NULL)
2112 				goto done;
2113 		}
2114 
2115 		goto again;
2116 	}
2117 	}
2118 
2119 done:
2120 	return (hash);
2121 }
2122