xref: /illumos-gate/usr/src/uts/common/io/mac/mac_util.c (revision 8119dad84d6416f13557b0ba8e2aaf9064cbcfd3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2019 Joyent, Inc.
24  * Copyright 2023 Oxide Computer Company
25  */
26 
27 /*
28  * MAC Services Module - misc utilities
29  */
30 
31 #include <sys/types.h>
32 #include <sys/mac.h>
33 #include <sys/mac_impl.h>
34 #include <sys/mac_client_priv.h>
35 #include <sys/mac_client_impl.h>
36 #include <sys/mac_soft_ring.h>
37 #include <sys/strsubr.h>
38 #include <sys/strsun.h>
39 #include <sys/vlan.h>
40 #include <sys/pattr.h>
41 #include <sys/pci_tools.h>
42 #include <inet/ip.h>
43 #include <inet/ip_impl.h>
44 #include <inet/ip6.h>
45 #include <sys/vtrace.h>
46 #include <sys/dlpi.h>
47 #include <sys/sunndi.h>
48 #include <inet/ipsec_impl.h>
49 #include <inet/sadb.h>
50 #include <inet/ipsecesp.h>
51 #include <inet/ipsecah.h>
52 #include <inet/tcp.h>
53 #include <inet/udp_impl.h>
54 #include <inet/sctp_ip.h>
55 
56 /*
57  * The next two functions are used for dropping packets or chains of
58  * packets, respectively. We could use one function for both but
59  * separating the use cases allows us to specify intent and prevent
60  * dropping more data than intended.
61  *
62  * The purpose of these functions is to aid the debugging effort,
63  * especially in production. Rather than use freemsg()/freemsgchain(),
64  * it's preferable to use these functions when dropping a packet in
65  * the MAC layer. These functions should only be used during
66  * unexpected conditions. That is, any time a packet is dropped
67  * outside of the regular, successful datapath. Consolidating all
68  * drops on these functions allows the user to trace one location and
69  * determine why the packet was dropped based on the msg. It also
70  * allows the user to inspect the packet before it is freed. Finally,
71  * it allows the user to avoid tracing freemsg()/freemsgchain() thus
72  * keeping the hot path running as efficiently as possible.
73  *
74  * NOTE: At this time not all MAC drops are aggregated on these
75  * functions; but that is the plan. This comment should be erased once
76  * completed.
77  */
78 
79 /*PRINTFLIKE2*/
80 void
81 mac_drop_pkt(mblk_t *mp, const char *fmt, ...)
82 {
83 	va_list adx;
84 	char msg[128];
85 	char *msgp = msg;
86 
87 	ASSERT3P(mp->b_next, ==, NULL);
88 
89 	va_start(adx, fmt);
90 	(void) vsnprintf(msgp, sizeof (msg), fmt, adx);
91 	va_end(adx);
92 
93 	DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
94 	freemsg(mp);
95 }
96 
97 /*PRINTFLIKE2*/
98 void
99 mac_drop_chain(mblk_t *chain, const char *fmt, ...)
100 {
101 	va_list adx;
102 	char msg[128];
103 	char *msgp = msg;
104 
105 	va_start(adx, fmt);
106 	(void) vsnprintf(msgp, sizeof (msg), fmt, adx);
107 	va_end(adx);
108 
109 	/*
110 	 * We could use freemsgchain() for the actual freeing but
111 	 * since we are already walking the chain to fire the dtrace
112 	 * probe we might as well free the msg here too.
113 	 */
114 	for (mblk_t *mp = chain, *next; mp != NULL; ) {
115 		next = mp->b_next;
116 		DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
117 		mp->b_next = NULL;
118 		freemsg(mp);
119 		mp = next;
120 	}
121 }
122 
123 /*
124  * Copy an mblk, preserving its hardware checksum flags.
125  */
126 static mblk_t *
127 mac_copymsg_cksum(mblk_t *mp)
128 {
129 	mblk_t *mp1;
130 
131 	mp1 = copymsg(mp);
132 	if (mp1 == NULL)
133 		return (NULL);
134 
135 	mac_hcksum_clone(mp, mp1);
136 
137 	return (mp1);
138 }
139 
140 /*
141  * Copy an mblk chain, presenting the hardware checksum flags of the
142  * individual mblks.
143  */
144 mblk_t *
145 mac_copymsgchain_cksum(mblk_t *mp)
146 {
147 	mblk_t *nmp = NULL;
148 	mblk_t **nmpp = &nmp;
149 
150 	for (; mp != NULL; mp = mp->b_next) {
151 		if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
152 			freemsgchain(nmp);
153 			return (NULL);
154 		}
155 
156 		nmpp = &((*nmpp)->b_next);
157 	}
158 
159 	return (nmp);
160 }
161 
162 /*
163  * Calculate the ULP checksum for IPv4. Return true if the calculation
164  * was successful, or false if an error occurred. If the later, place
165  * an error message into '*err'.
166  */
167 static boolean_t
168 mac_sw_cksum_ipv4(mblk_t *mp, uint32_t ip_hdr_offset, ipha_t *ipha,
169     const char **err)
170 {
171 	const uint8_t proto = ipha->ipha_protocol;
172 	size_t len;
173 	const uint32_t ip_hdr_sz = IPH_HDR_LENGTH(ipha);
174 	/* ULP offset from start of L2. */
175 	const uint32_t ulp_offset = ip_hdr_offset + ip_hdr_sz;
176 	ipaddr_t src, dst;
177 	uint32_t cksum;
178 	uint16_t *up;
179 
180 	/*
181 	 * We need a pointer to the ULP checksum. We're assuming the
182 	 * ULP checksum pointer resides in the first mblk. Our native
183 	 * TCP stack should always put the headers in the first mblk,
184 	 * but currently we have no way to guarantee that other
185 	 * clients don't spread headers (or even header fields) across
186 	 * mblks.
187 	 */
188 	switch (proto) {
189 	case IPPROTO_TCP:
190 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t)));
191 		if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) {
192 			*err = "mblk doesn't contain TCP header";
193 			goto bail;
194 		}
195 
196 		up = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_sz);
197 		cksum = IP_TCP_CSUM_COMP;
198 		break;
199 
200 	case IPPROTO_UDP:
201 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t)));
202 		if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) {
203 			*err = "mblk doesn't contain UDP header";
204 			goto bail;
205 		}
206 
207 		up = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_sz);
208 		cksum = IP_UDP_CSUM_COMP;
209 		break;
210 
211 	case IPPROTO_SCTP: {
212 		sctp_hdr_t *sctph;
213 
214 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t)));
215 		if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) {
216 			*err = "mblk doesn't contain SCTP header";
217 			goto bail;
218 		}
219 
220 		sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset);
221 		sctph->sh_chksum = 0;
222 		sctph->sh_chksum = sctp_cksum(mp, ulp_offset);
223 		return (B_TRUE);
224 	}
225 
226 	default:
227 		*err = "unexpected protocol";
228 		goto bail;
229 
230 	}
231 
232 	/* Pseudo-header checksum. */
233 	src = ipha->ipha_src;
234 	dst = ipha->ipha_dst;
235 	len = ntohs(ipha->ipha_length) - ip_hdr_sz;
236 
237 	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
238 	cksum += htons(len);
239 
240 	/*
241 	 * We have already accounted for the pseudo checksum above.
242 	 * Make sure the ULP checksum field is zero before computing
243 	 * the rest.
244 	 */
245 	*up = 0;
246 	cksum = IP_CSUM(mp, ulp_offset, cksum);
247 	*up = (uint16_t)(cksum ? cksum : ~cksum);
248 
249 	return (B_TRUE);
250 
251 bail:
252 	return (B_FALSE);
253 }
254 
255 /*
256  * Calculate the ULP checksum for IPv6. Return true if the calculation
257  * was successful, or false if an error occurred. If the later, place
258  * an error message into '*err'.
259  */
260 static boolean_t
261 mac_sw_cksum_ipv6(mblk_t *mp, uint32_t ip_hdr_offset, const char **err)
262 {
263 	ip6_t *ip6h = (ip6_t *)(mp->b_rptr + ip_hdr_offset);
264 	const uint8_t proto = ip6h->ip6_nxt;
265 	const uint16_t *iphs = (uint16_t *)ip6h;
266 	/* ULP offset from start of L2. */
267 	uint32_t ulp_offset;
268 	size_t len;
269 	uint32_t cksum;
270 	uint16_t *up;
271 	uint16_t ip_hdr_sz;
272 
273 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_sz, NULL)) {
274 		*err = "malformed IPv6 header";
275 		goto bail;
276 	}
277 
278 	ulp_offset = ip_hdr_offset + ip_hdr_sz;
279 
280 	/*
281 	 * We need a pointer to the ULP checksum. We're assuming the
282 	 * ULP checksum pointer resides in the first mblk. Our native
283 	 * TCP stack should always put the headers in the first mblk,
284 	 * but currently we have no way to guarantee that other
285 	 * clients don't spread headers (or even header fields) across
286 	 * mblks.
287 	 */
288 	switch (proto) {
289 	case IPPROTO_TCP:
290 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t)));
291 		if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) {
292 			*err = "mblk doesn't contain TCP header";
293 			goto bail;
294 		}
295 
296 		up = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_sz);
297 		cksum = IP_TCP_CSUM_COMP;
298 		break;
299 
300 	case IPPROTO_UDP:
301 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t)));
302 		if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) {
303 			*err = "mblk doesn't contain UDP header";
304 			goto bail;
305 		}
306 
307 		up = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_sz);
308 		cksum = IP_UDP_CSUM_COMP;
309 		break;
310 
311 	case IPPROTO_SCTP: {
312 		sctp_hdr_t *sctph;
313 
314 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t)));
315 		if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) {
316 			*err = "mblk doesn't contain SCTP header";
317 			goto bail;
318 		}
319 
320 		sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset);
321 		/*
322 		 * Zero out the checksum field to ensure proper
323 		 * checksum calculation.
324 		 */
325 		sctph->sh_chksum = 0;
326 		sctph->sh_chksum = sctp_cksum(mp, ulp_offset);
327 		return (B_TRUE);
328 	}
329 
330 	default:
331 		*err = "unexpected protocol";
332 		goto bail;
333 	}
334 
335 	/*
336 	 * The payload length includes the payload and the IPv6
337 	 * extension headers; the idea is to subtract the extension
338 	 * header length to get the real payload length.
339 	 */
340 	len = ntohs(ip6h->ip6_plen) - (ip_hdr_sz - IPV6_HDR_LEN);
341 	cksum += len;
342 
343 	/*
344 	 * We accumulate the pseudo header checksum in cksum; then we
345 	 * call IP_CSUM to compute the checksum over the payload.
346 	 */
347 	cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + iphs[8] + iphs[9] +
348 	    iphs[10] + iphs[11] + iphs[12] + iphs[13] + iphs[14] + iphs[15] +
349 	    iphs[16] + iphs[17] + iphs[18] + iphs[19];
350 	cksum = IP_CSUM(mp, ulp_offset, cksum);
351 
352 	/* For UDP/IPv6 a zero UDP checksum is not allowed. Change to 0xffff */
353 	if (proto == IPPROTO_UDP && cksum == 0)
354 		cksum = ~cksum;
355 
356 	*up = (uint16_t)cksum;
357 
358 	return (B_TRUE);
359 
360 bail:
361 	return (B_FALSE);
362 }
363 
364 /*
365  * Perform software checksum on a single message, if needed. The
366  * emulation performed is determined by an intersection of the mblk's
367  * flags and the emul flags requested. The emul flags are documented
368  * in mac.h.
369  */
370 static mblk_t *
371 mac_sw_cksum(mblk_t *mp, mac_emul_t emul)
372 {
373 	mblk_t *skipped_hdr = NULL;
374 	uint32_t flags, start, stuff, end, value;
375 	uint32_t ip_hdr_offset;
376 	uint16_t etype;
377 	size_t ip_hdr_sz;
378 	struct ether_header *ehp;
379 	const char *err = "";
380 
381 	/*
382 	 * This function should only be called from mac_hw_emul()
383 	 * which handles mblk chains and the shared ref case.
384 	 */
385 	ASSERT3P(mp->b_next, ==, NULL);
386 
387 	mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL);
388 
389 	flags = DB_CKSUMFLAGS(mp);
390 
391 	/* Why call this if checksum emulation isn't needed? */
392 	ASSERT3U(flags & (HCK_FLAGS), !=, 0);
393 
394 	/*
395 	 * Ethernet, and optionally VLAN header. mac_hw_emul() has
396 	 * already verified we have enough data to read the L2 header.
397 	 */
398 	ehp = (struct ether_header *)mp->b_rptr;
399 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
400 		struct ether_vlan_header *evhp;
401 
402 		evhp = (struct ether_vlan_header *)mp->b_rptr;
403 		etype = ntohs(evhp->ether_type);
404 		ip_hdr_offset = sizeof (struct ether_vlan_header);
405 	} else {
406 		etype = ntohs(ehp->ether_type);
407 		ip_hdr_offset = sizeof (struct ether_header);
408 	}
409 
410 	/*
411 	 * If this packet isn't IP, then leave it alone. We don't want
412 	 * to affect non-IP traffic like ARP. Assume the IP header
413 	 * doesn't include any options, for now. We will use the
414 	 * correct size later after we know there are enough bytes to
415 	 * at least fill out the basic header.
416 	 */
417 	switch (etype) {
418 	case ETHERTYPE_IP:
419 		ip_hdr_sz = sizeof (ipha_t);
420 		break;
421 	case ETHERTYPE_IPV6:
422 		ip_hdr_sz = sizeof (ip6_t);
423 		break;
424 	default:
425 		return (mp);
426 	}
427 
428 	ASSERT3U(MBLKL(mp), >=, ip_hdr_offset);
429 
430 	/*
431 	 * If the first mblk of this packet contains only the ethernet
432 	 * header, skip past it for now. Packets with their data
433 	 * contained in only a single mblk can then use the fastpaths
434 	 * tuned to that possibility.
435 	 */
436 	if (MBLKL(mp) == ip_hdr_offset) {
437 		ip_hdr_offset -= MBLKL(mp);
438 		/* This is guaranteed by mac_hw_emul(). */
439 		ASSERT3P(mp->b_cont, !=, NULL);
440 		skipped_hdr = mp;
441 		mp = mp->b_cont;
442 	}
443 
444 	/*
445 	 * Both full and partial checksum rely on finding the IP
446 	 * header in the current mblk. Our native TCP stack honors
447 	 * this assumption but it's prudent to guard our future
448 	 * clients that might not honor this contract.
449 	 */
450 	ASSERT3U(MBLKL(mp), >=, ip_hdr_offset + ip_hdr_sz);
451 	if (MBLKL(mp) < (ip_hdr_offset + ip_hdr_sz)) {
452 		err = "mblk doesn't contain IP header";
453 		goto bail;
454 	}
455 
456 	/*
457 	 * We are about to modify the header mblk; make sure we are
458 	 * modifying our own copy. The code that follows assumes that
459 	 * the IP/ULP headers exist in this mblk (and drops the
460 	 * message if they don't).
461 	 */
462 	if (DB_REF(mp) > 1) {
463 		mblk_t *tmp = copyb(mp);
464 
465 		if (tmp == NULL) {
466 			err = "copyb failed";
467 			goto bail;
468 		}
469 
470 		if (skipped_hdr != NULL) {
471 			ASSERT3P(skipped_hdr->b_cont, ==, mp);
472 			skipped_hdr->b_cont = tmp;
473 		}
474 
475 		tmp->b_cont = mp->b_cont;
476 		freeb(mp);
477 		mp = tmp;
478 	}
479 
480 	if (etype == ETHERTYPE_IP) {
481 		ipha_t *ipha = (ipha_t *)(mp->b_rptr + ip_hdr_offset);
482 
483 		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
484 			if (!mac_sw_cksum_ipv4(mp, ip_hdr_offset, ipha, &err))
485 				goto bail;
486 		}
487 
488 		/* We always update the ULP checksum flags. */
489 		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
490 			flags &= ~HCK_FULLCKSUM;
491 			flags |= HCK_FULLCKSUM_OK;
492 			value = 0;
493 		}
494 
495 		/*
496 		 * While unlikely, it's possible to write code that
497 		 * might end up calling mac_sw_cksum() twice on the
498 		 * same mblk (performing both LSO and checksum
499 		 * emualtion in a single mblk chain loop -- the LSO
500 		 * emulation inserts a new chain into the existing
501 		 * chain and then the loop iterates back over the new
502 		 * segments and emulates the checksum a second time).
503 		 * Normally this wouldn't be a problem, because the
504 		 * HCK_*_OK flags are supposed to indicate that we
505 		 * don't need to do peform the work. But
506 		 * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the
507 		 * same value; so we cannot use these flags to
508 		 * determine if the IP header checksum has already
509 		 * been calculated or not. For this reason, we zero
510 		 * out the the checksum first. In the future, we
511 		 * should fix the HCK_* flags.
512 		 */
513 		if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
514 			ipha->ipha_hdr_checksum = 0;
515 			ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
516 			flags &= ~HCK_IPV4_HDRCKSUM;
517 			flags |= HCK_IPV4_HDRCKSUM_OK;
518 		}
519 	} else if (etype == ETHERTYPE_IPV6) {
520 		/* There is no IP header checksum for IPv6. */
521 		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
522 			if (!mac_sw_cksum_ipv6(mp, ip_hdr_offset, &err))
523 				goto bail;
524 			flags &= ~HCK_FULLCKSUM;
525 			flags |= HCK_FULLCKSUM_OK;
526 			value = 0;
527 		}
528 	}
529 
530 	/*
531 	 * Partial checksum is the same for both IPv4 and IPv6.
532 	 */
533 	if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
534 		uint16_t *up, partial, cksum;
535 		uchar_t *ipp; /* ptr to beginning of IP header */
536 
537 		ipp = mp->b_rptr + ip_hdr_offset;
538 		up = (uint16_t *)((uchar_t *)ipp + stuff);
539 		partial = *up;
540 		*up = 0;
541 
542 		ASSERT3S(end, >, start);
543 		cksum = ~IP_CSUM_PARTIAL(mp, ip_hdr_offset + start, partial);
544 		*up = cksum != 0 ? cksum : ~cksum;
545 	}
546 
547 	/* We always update the ULP checksum flags. */
548 	if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
549 		flags &= ~HCK_PARTIALCKSUM;
550 		flags |= HCK_FULLCKSUM_OK;
551 		value = 0;
552 	}
553 
554 	mac_hcksum_set(mp, start, stuff, end, value, flags);
555 
556 	/* Don't forget to reattach the header. */
557 	if (skipped_hdr != NULL) {
558 		ASSERT3P(skipped_hdr->b_cont, ==, mp);
559 
560 		/*
561 		 * Duplicate the HCKSUM data into the header mblk.
562 		 * This mimics mac_add_vlan_tag which ensures that
563 		 * both the first mblk _and_ the first data bearing
564 		 * mblk possess the HCKSUM information. Consumers like
565 		 * IP will end up discarding the ether_header mblk, so
566 		 * for now, it is important that the data be available
567 		 * in both places.
568 		 */
569 		mac_hcksum_clone(mp, skipped_hdr);
570 		mp = skipped_hdr;
571 	}
572 
573 	return (mp);
574 
575 bail:
576 	if (skipped_hdr != NULL) {
577 		ASSERT3P(skipped_hdr->b_cont, ==, mp);
578 		mp = skipped_hdr;
579 	}
580 
581 	mac_drop_pkt(mp, err);
582 	return (NULL);
583 }
584 
585 /*
586  * Build a single data segment from an LSO packet. The mblk chain
587  * returned, seg_head, represents the data segment and is always
588  * exactly seg_len bytes long. The lso_mp and offset input/output
589  * parameters track our position in the LSO packet. This function
590  * exists solely as a helper to mac_sw_lso().
591  *
592  * Case A
593  *
594  *     The current lso_mp is larger than the requested seg_len. The
595  *     beginning of seg_head may start at the beginning of lso_mp or
596  *     offset into it. In either case, a single mblk is returned, and
597  *     *offset is updated to reflect our new position in the current
598  *     lso_mp.
599  *
600  *          +----------------------------+
601  *          |  in *lso_mp / out *lso_mp  |
602  *          +----------------------------+
603  *          ^                        ^
604  *          |                        |
605  *          |                        |
606  *          |                        |
607  *          +------------------------+
608  *          |        seg_head        |
609  *          +------------------------+
610  *          ^                        ^
611  *          |                        |
612  *   in *offset = 0        out *offset = seg_len
613  *
614  *          |------   seg_len    ----|
615  *
616  *
617  *       +------------------------------+
618  *       |   in *lso_mp / out *lso_mp   |
619  *       +------------------------------+
620  *          ^                        ^
621  *          |                        |
622  *          |                        |
623  *          |                        |
624  *          +------------------------+
625  *          |        seg_head        |
626  *          +------------------------+
627  *          ^                        ^
628  *          |                        |
629  *   in *offset = N        out *offset = N + seg_len
630  *
631  *          |------   seg_len    ----|
632  *
633  *
634  *
635  * Case B
636  *
637  *    The requested seg_len consumes exactly the rest of the lso_mp.
638  *    I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr.
639  *    The seg_head may start at the beginning of the lso_mp or at some
640  *    offset into it. In either case we return a single mblk, reset
641  *    *offset to zero, and walk to the next lso_mp.
642  *
643  *          +------------------------+           +------------------------+
644  *          |       in *lso_mp       |---------->|      out *lso_mp       |
645  *          +------------------------+           +------------------------+
646  *          ^                        ^           ^
647  *          |                        |           |
648  *          |                        |    out *offset = 0
649  *          |                        |
650  *          +------------------------+
651  *          |        seg_head        |
652  *          +------------------------+
653  *          ^
654  *          |
655  *   in *offset = 0
656  *
657  *          |------   seg_len    ----|
658  *
659  *
660  *
661  *      +----------------------------+           +------------------------+
662  *      |         in *lso_mp         |---------->|      out *lso_mp       |
663  *      +----------------------------+           +------------------------+
664  *          ^                        ^           ^
665  *          |                        |           |
666  *          |                        |    out *offset = 0
667  *          |                        |
668  *          +------------------------+
669  *          |        seg_head        |
670  *          +------------------------+
671  *          ^
672  *          |
673  *   in *offset = N
674  *
675  *          |------   seg_len    ----|
676  *
677  *
678  * Case C
679  *
680  *    The requested seg_len is greater than the current lso_mp. In
681  *    this case we must consume LSO mblks until we have enough data to
682  *    satisfy either case (A) or (B) above. We will return multiple
683  *    mblks linked via b_cont, offset will be set based on the cases
684  *    above, and lso_mp will walk forward at least one mblk, but maybe
685  *    more.
686  *
687  *    N.B. This diagram is not exhaustive. The seg_head may start on
688  *    the beginning of an lso_mp. The seg_tail may end exactly on the
689  *    boundary of an lso_mp. And there may be two (in this case the
690  *    middle block wouldn't exist), three, or more mblks in the
691  *    seg_head chain. This is meant as one example of what might
692  *    happen. The main thing to remember is that the seg_tail mblk
693  *    must be one of case (A) or (B) above.
694  *
695  *  +------------------+    +----------------+    +------------------+
696  *  |    in *lso_mp    |--->|    *lso_mp     |--->|   out *lso_mp    |
697  *  +------------------+    +----------------+    +------------------+
698  *        ^            ^    ^                ^    ^            ^
699  *        |            |    |                |    |            |
700  *        |            |    |                |    |            |
701  *        |            |    |                |    |            |
702  *        |            |    |                |    |            |
703  *        +------------+    +----------------+    +------------+
704  *        |  seg_head  |--->|                |--->|  seg_tail  |
705  *        +------------+    +----------------+    +------------+
706  *        ^                                                    ^
707  *        |                                                    |
708  *  in *offset = N                          out *offset = MBLKL(seg_tail)
709  *
710  *        |-------------------   seg_len    -------------------|
711  *
712  */
713 static mblk_t *
714 build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len)
715 {
716 	mblk_t *seg_head, *seg_tail, *seg_mp;
717 
718 	ASSERT3P(*lso_mp, !=, NULL);
719 	ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr);
720 
721 	seg_mp = dupb(*lso_mp);
722 	if (seg_mp == NULL)
723 		return (NULL);
724 
725 	seg_head = seg_mp;
726 	seg_tail = seg_mp;
727 
728 	/* Continue where we left off from in the lso_mp. */
729 	seg_mp->b_rptr += *offset;
730 
731 last_mblk:
732 	/* Case (A) */
733 	if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) {
734 		*offset += seg_len;
735 		seg_mp->b_wptr = seg_mp->b_rptr + seg_len;
736 		return (seg_head);
737 	}
738 
739 	/* Case (B) */
740 	if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) {
741 		*offset = 0;
742 		*lso_mp = (*lso_mp)->b_cont;
743 		return (seg_head);
744 	}
745 
746 	/* Case (C) */
747 	ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr);
748 
749 	/*
750 	 * The current LSO mblk doesn't have enough data to satisfy
751 	 * seg_len -- continue peeling off LSO mblks to build the new
752 	 * segment message. If allocation fails we free the previously
753 	 * allocated segment mblks and return NULL.
754 	 */
755 	while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) {
756 		ASSERT3U(MBLKL(seg_mp), <=, seg_len);
757 		seg_len -= MBLKL(seg_mp);
758 		*offset = 0;
759 		*lso_mp = (*lso_mp)->b_cont;
760 		seg_mp = dupb(*lso_mp);
761 
762 		if (seg_mp == NULL) {
763 			freemsgchain(seg_head);
764 			return (NULL);
765 		}
766 
767 		seg_tail->b_cont = seg_mp;
768 		seg_tail = seg_mp;
769 	}
770 
771 	/*
772 	 * We've walked enough LSO mblks that we can now satisfy the
773 	 * remaining seg_len. At this point we need to jump back to
774 	 * determine if we have arrived at case (A) or (B).
775 	 */
776 
777 	/* Just to be paranoid that we didn't underflow. */
778 	ASSERT3U(seg_len, <, IP_MAXPACKET);
779 	ASSERT3U(seg_len, >, 0);
780 	goto last_mblk;
781 }
782 
783 /*
784  * Perform software segmentation of a single LSO message. Take an LSO
785  * message as input and return head/tail pointers as output. This
786  * function should not be invoked directly but instead through
787  * mac_hw_emul().
788  *
789  * The resulting chain is comprised of multiple (nsegs) MSS sized
790  * segments. Each segment will consist of two or more mblks joined by
791  * b_cont: a header and one or more data mblks. The header mblk is
792  * allocated anew for each message. The first segment's header is used
793  * as a template for the rest with adjustments made for things such as
794  * ID, sequence, length, TCP flags, etc. The data mblks reference into
795  * the existing LSO mblk (passed in as omp) by way of dupb(). Their
796  * b_rptr/b_wptr values are adjusted to reference only the fraction of
797  * the LSO message they are responsible for. At the successful
798  * completion of this function the original mblk (omp) is freed,
799  * leaving the newly created segment chain as the only remaining
800  * reference to the data.
801  */
802 static void
803 mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail,
804     uint_t *count)
805 {
806 	uint32_t ocsum_flags, ocsum_start, ocsum_stuff;
807 	uint32_t mss;
808 	uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen;
809 	uint32_t oleft;
810 	uint_t nsegs, seg;
811 	int len;
812 
813 	struct ether_vlan_header *oevh;
814 	const ipha_t *oiph;
815 	const tcph_t *otcph;
816 	ipha_t *niph;
817 	tcph_t *ntcph;
818 	uint16_t ip_id;
819 	uint32_t tcp_seq, tcp_sum, otcp_sum;
820 
821 	uint32_t offset;
822 	mblk_t *odatamp;
823 	mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp;
824 	mblk_t *tmptail;
825 
826 	ASSERT3P(head, !=, NULL);
827 	ASSERT3P(tail, !=, NULL);
828 	ASSERT3P(count, !=, NULL);
829 	ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0);
830 
831 	/* Assume we are dealing with a single LSO message. */
832 	ASSERT3P(omp->b_next, ==, NULL);
833 
834 	/*
835 	 * XXX: This is a hack to deal with mac_add_vlan_tag().
836 	 *
837 	 * When VLANs are in play, mac_add_vlan_tag() creates a new
838 	 * mblk with just the ether_vlan_header and tacks it onto the
839 	 * front of 'omp'. This breaks the assumptions made below;
840 	 * namely that the TCP/IP headers are in the first mblk. In
841 	 * this case, since we already have to pay the cost of LSO
842 	 * emulation, we simply pull up everything. While this might
843 	 * seem irksome, keep in mind this will only apply in a couple
844 	 * of scenarios: a) an LSO-capable VLAN client sending to a
845 	 * non-LSO-capable client over the "MAC/bridge loopback"
846 	 * datapath or b) an LSO-capable VLAN client is sending to a
847 	 * client that, for whatever reason, doesn't have DLS-bypass
848 	 * enabled. Finally, we have to check for both a tagged and
849 	 * untagged sized mblk depending on if the mblk came via
850 	 * mac_promisc_dispatch() or mac_rx_deliver().
851 	 *
852 	 * In the future, two things should be done:
853 	 *
854 	 * 1. This function should make use of some yet to be
855 	 *    implemented "mblk helpers". These helper functions would
856 	 *    perform all the b_cont walking for us and guarantee safe
857 	 *    access to the mblk data.
858 	 *
859 	 * 2. We should add some slop to the mblks so that
860 	 *    mac_add_vlan_tag() can just edit the first mblk instead
861 	 *    of allocating on the hot path.
862 	 */
863 	if (MBLKL(omp) == sizeof (struct ether_vlan_header) ||
864 	    MBLKL(omp) == sizeof (struct ether_header)) {
865 		mblk_t *tmp = msgpullup(omp, -1);
866 
867 		if (tmp == NULL) {
868 			mac_drop_pkt(omp, "failed to pull up");
869 			goto fail;
870 		}
871 
872 		mac_hcksum_clone(omp, tmp);
873 		freemsg(omp);
874 		omp = tmp;
875 	}
876 
877 	mss = DB_LSOMSS(omp);
878 	ASSERT3U(msgsize(omp), <=, IP_MAXPACKET +
879 	    sizeof (struct ether_vlan_header));
880 	opktlen = msgsize(omp);
881 
882 	/*
883 	 * First, get references to the IP and TCP headers and
884 	 * determine the total TCP length (header + data).
885 	 *
886 	 * Thanks to mac_hw_emul() we know that the first mblk must
887 	 * contain (at minimum) the full L2 header. However, this
888 	 * function assumes more than that. It assumes the L2/L3/L4
889 	 * headers are all contained in the first mblk of a message
890 	 * (i.e., no b_cont walking for headers). While this is a
891 	 * current reality (our native TCP stack and viona both
892 	 * enforce this) things may become more nuanced in the future
893 	 * (e.g. when introducing encap support or adding new
894 	 * clients). For now we guard against this case by dropping
895 	 * the packet.
896 	 */
897 	oevh = (struct ether_vlan_header *)omp->b_rptr;
898 	if (oevh->ether_tpid == htons(ETHERTYPE_VLAN))
899 		oehlen = sizeof (struct ether_vlan_header);
900 	else
901 		oehlen = sizeof (struct ether_header);
902 
903 	ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t)));
904 	if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) {
905 		mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers");
906 		goto fail;
907 	}
908 
909 	oiph = (ipha_t *)(omp->b_rptr + oehlen);
910 	oiphlen = IPH_HDR_LENGTH(oiph);
911 	otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen);
912 	otcphlen = TCP_HDR_LENGTH(otcph);
913 
914 	/*
915 	 * Currently we only support LSO for TCP/IPv4.
916 	 */
917 	if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) {
918 		mac_drop_pkt(omp, "LSO unsupported IP version: %uhh",
919 		    IPH_HDR_VERSION(oiph));
920 		goto fail;
921 	}
922 
923 	if (oiph->ipha_protocol != IPPROTO_TCP) {
924 		mac_drop_pkt(omp, "LSO unsupported protocol: %uhh",
925 		    oiph->ipha_protocol);
926 		goto fail;
927 	}
928 
929 	if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) {
930 		mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set");
931 		goto fail;
932 	}
933 
934 	ohdrslen = oehlen + oiphlen + otcphlen;
935 	if ((len = MBLKL(omp)) < ohdrslen) {
936 		mac_drop_pkt(omp, "LSO packet too short: %d < %u", len,
937 		    ohdrslen);
938 		goto fail;
939 	}
940 
941 	/*
942 	 * Either we have data in the first mblk or it's just the
943 	 * header. In either case, we need to set rptr to the start of
944 	 * the TCP data.
945 	 */
946 	if (len > ohdrslen) {
947 		odatamp = omp;
948 		offset = ohdrslen;
949 	} else {
950 		ASSERT3U(len, ==, ohdrslen);
951 		odatamp = omp->b_cont;
952 		offset = 0;
953 	}
954 
955 	/* Make sure we still have enough data. */
956 	ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen);
957 
958 	/*
	 * If a MAC negotiated LSO then it must negotiate both
960 	 * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or
961 	 * HCKSUM_INET_PARTIAL; because both the IP and TCP headers
962 	 * change during LSO segmentation (only the 3 fields of the
963 	 * pseudo header checksum don't change: src, dst, proto). Thus
964 	 * we would expect these flags (HCK_IPV4_HDRCKSUM |
965 	 * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this
966 	 * function to emulate those checksums in software. However,
967 	 * that assumes a world where we only expose LSO if the
968 	 * underlying hardware exposes LSO. Moving forward the plan is
969 	 * to assume LSO in the upper layers and have MAC perform
970 	 * software LSO when the underlying provider doesn't support
971 	 * it. In such a world, if the provider doesn't support LSO
972 	 * but does support hardware checksum offload, then we could
973 	 * simply perform the segmentation and allow the hardware to
974 	 * calculate the checksums. To the hardware it's just another
975 	 * chain of non-LSO packets.
976 	 */
977 	ASSERT3S(DB_TYPE(omp), ==, M_DATA);
978 	ocsum_flags = DB_CKSUMFLAGS(omp);
979 	ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0);
980 	ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0);
981 
982 	/*
983 	 * If hardware only provides partial checksum then software
984 	 * must supply the pseudo-header checksum. In the case of LSO
985 	 * we leave the TCP length at zero to be filled in by
986 	 * hardware. This function must handle two scenarios.
987 	 *
988 	 * 1. Being called by a MAC client on the Rx path to segment
989 	 *    an LSO packet and calculate the checksum.
990 	 *
991 	 * 2. Being called by a MAC provider to segment an LSO packet.
992 	 *    In this case the LSO segmentation is performed in
993 	 *    software (by this routine) but the MAC provider should
994 	 *    still calculate the TCP/IP checksums in hardware.
995 	 *
996 	 *  To elaborate on the second case: we cannot have the
997 	 *  scenario where IP sends LSO packets but the underlying HW
998 	 *  doesn't support checksum offload -- because in that case
999 	 *  TCP/IP would calculate the checksum in software (for the
1000 	 *  LSO packet) but then MAC would segment the packet and have
1001 	 *  to redo all the checksum work. So IP should never do LSO
1002 	 *  if HW doesn't support both IP and TCP checksum.
1003 	 */
1004 	if (ocsum_flags & HCK_PARTIALCKSUM) {
1005 		ocsum_start = (uint32_t)DB_CKSUMSTART(omp);
1006 		ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp);
1007 	}
1008 
1009 	odatalen = opktlen - ohdrslen;
1010 
1011 	/*
1012 	 * Subtract one to account for the case where the data length
	 * is evenly divisible by the MSS. Add one to account for the
1014 	 * fact that the division will always result in one less
1015 	 * segment than needed.
1016 	 */
1017 	nsegs = ((odatalen - 1) / mss) + 1;
1018 	if (nsegs < 2) {
1019 		mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs);
1020 		goto fail;
1021 	}
1022 
1023 	DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph,
1024 	    __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t,
1025 	    nsegs);
1026 
1027 	seg_chain = NULL;
1028 	tmptail = seg_chain;
1029 	oleft = odatalen;
1030 
1031 	for (uint_t i = 0; i < nsegs; i++) {
1032 		boolean_t last_seg = ((i + 1) == nsegs);
1033 		uint32_t seg_len;
1034 
1035 		/*
1036 		 * If we fail to allocate, then drop the partially
1037 		 * allocated chain as well as the LSO packet. Let the
1038 		 * sender deal with the fallout.
1039 		 */
1040 		if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) {
1041 			freemsgchain(seg_chain);
1042 			mac_drop_pkt(omp, "failed to alloc segment header");
1043 			goto fail;
1044 		}
1045 		ASSERT3P(nhdrmp->b_cont, ==, NULL);
1046 
1047 		if (seg_chain == NULL) {
1048 			seg_chain = nhdrmp;
1049 		} else {
1050 			ASSERT3P(tmptail, !=, NULL);
1051 			tmptail->b_next = nhdrmp;
1052 		}
1053 
1054 		tmptail = nhdrmp;
1055 
1056 		/*
		 * Calculate this segment's length. It's either the MSS
1058 		 * or whatever remains for the last segment.
1059 		 */
1060 		seg_len = last_seg ? oleft : mss;
1061 		ASSERT3U(seg_len, <=, mss);
1062 		ndatamp = build_data_seg(&odatamp, &offset, seg_len);
1063 
1064 		if (ndatamp == NULL) {
1065 			freemsgchain(seg_chain);
1066 			mac_drop_pkt(omp, "LSO failed to segment data");
1067 			goto fail;
1068 		}
1069 
1070 		/* Attach data mblk to header mblk. */
1071 		nhdrmp->b_cont = ndatamp;
1072 		DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO;
1073 		ASSERT3U(seg_len, <=, oleft);
1074 		oleft -= seg_len;
1075 	}
1076 
1077 	/* We should have consumed entire LSO msg. */
1078 	ASSERT3S(oleft, ==, 0);
1079 	ASSERT3P(odatamp, ==, NULL);
1080 
1081 	/*
1082 	 * All seg data mblks are referenced by the header mblks, null
1083 	 * out this pointer to catch any bad derefs.
1084 	 */
1085 	ndatamp = NULL;
1086 
1087 	/*
1088 	 * Set headers and checksum for first segment.
1089 	 */
1090 	nhdrmp = seg_chain;
1091 	bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen);
1092 	nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1093 	niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1094 	ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);
1095 	niph->ipha_length = htons(oiphlen + otcphlen + mss);
1096 	niph->ipha_hdr_checksum = 0;
1097 	ip_id = ntohs(niph->ipha_ident);
1098 	ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1099 	tcp_seq = BE32_TO_U32(ntcph->th_seq);
1100 	tcp_seq += mss;
1101 
1102 	/*
1103 	 * The first segment shouldn't:
1104 	 *
1105 	 *	o indicate end of data transmission (FIN),
1106 	 *	o indicate immediate handling of the data (PUSH).
1107 	 */
1108 	ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
1109 	DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1110 
1111 	/*
1112 	 * If the underlying HW provides partial checksum, then make
1113 	 * sure to correct the pseudo header checksum before calling
1114 	 * mac_sw_cksum(). The native TCP stack doesn't include the
1115 	 * length field in the pseudo header when LSO is in play -- so
1116 	 * we need to calculate it here.
1117 	 */
1118 	if (ocsum_flags & HCK_PARTIALCKSUM) {
1119 		DB_CKSUMSTART(nhdrmp) = ocsum_start;
1120 		DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1121 		DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1122 		tcp_sum = BE16_TO_U16(ntcph->th_sum);
1123 		otcp_sum = tcp_sum;
1124 		tcp_sum += mss + otcphlen;
1125 		tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1126 		U16_TO_BE16(tcp_sum, ntcph->th_sum);
1127 	}
1128 
1129 	if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1130 	    (emul & MAC_HWCKSUM_EMULS)) {
1131 		next_nhdrmp = nhdrmp->b_next;
1132 		nhdrmp->b_next = NULL;
1133 		nhdrmp = mac_sw_cksum(nhdrmp, emul);
1134 		nhdrmp->b_next = next_nhdrmp;
1135 		next_nhdrmp = NULL;
1136 
1137 		/*
1138 		 * We may have freed the nhdrmp argument during
1139 		 * checksum emulation, make sure that seg_chain
1140 		 * references a valid mblk.
1141 		 */
1142 		seg_chain = nhdrmp;
1143 	}
1144 
1145 	ASSERT3P(nhdrmp, !=, NULL);
1146 
1147 	seg = 1;
1148 	DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1149 	    (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1150 	    (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss,
1151 	    uint_t, seg);
1152 	seg++;
1153 
1154 	/* There better be at least 2 segs. */
1155 	ASSERT3P(nhdrmp->b_next, !=, NULL);
1156 	prev_nhdrmp = nhdrmp;
1157 	nhdrmp = nhdrmp->b_next;
1158 
1159 	/*
1160 	 * Now adjust the headers of the middle segments. For each
1161 	 * header we need to adjust the following.
1162 	 *
1163 	 *	o IP ID
1164 	 *	o IP length
1165 	 *	o TCP sequence
1166 	 *	o TCP flags
1167 	 *	o cksum flags
1168 	 *	o cksum values (if MAC_HWCKSUM_EMUL is set)
1169 	 */
1170 	for (; seg < nsegs; seg++) {
1171 		/*
1172 		 * We use seg_chain as a reference to the first seg
1173 		 * header mblk -- this first header is a template for
1174 		 * the rest of the segments. This copy will include
1175 		 * the now updated checksum values from the first
1176 		 * header. We must reset these checksum values to
1177 		 * their original to make sure we produce the correct
1178 		 * value.
1179 		 */
1180 		bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
1181 		nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1182 		niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1183 		niph->ipha_ident = htons(++ip_id);
1184 		ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss);
1185 		niph->ipha_length = htons(oiphlen + otcphlen + mss);
1186 		niph->ipha_hdr_checksum = 0;
1187 		ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1188 		U32_TO_BE32(tcp_seq, ntcph->th_seq);
1189 		tcp_seq += mss;
1190 		/*
1191 		 * Just like the first segment, the middle segments
1192 		 * shouldn't have these flags set.
1193 		 */
1194 		ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
1195 		DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1196 
1197 		if (ocsum_flags & HCK_PARTIALCKSUM) {
1198 			/*
1199 			 * First and middle segs have same
1200 			 * pseudo-header checksum.
1201 			 */
1202 			U16_TO_BE16(tcp_sum, ntcph->th_sum);
1203 			DB_CKSUMSTART(nhdrmp) = ocsum_start;
1204 			DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1205 			DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1206 		}
1207 
1208 		if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1209 		    (emul & MAC_HWCKSUM_EMULS)) {
1210 			next_nhdrmp = nhdrmp->b_next;
1211 			nhdrmp->b_next = NULL;
1212 			nhdrmp = mac_sw_cksum(nhdrmp, emul);
1213 			nhdrmp->b_next = next_nhdrmp;
1214 			next_nhdrmp = NULL;
1215 			/* We may have freed the original nhdrmp. */
1216 			prev_nhdrmp->b_next = nhdrmp;
1217 		}
1218 
1219 		DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1220 		    (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1221 		    (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen),
1222 		    uint_t, mss, uint_t, seg);
1223 
1224 		ASSERT3P(nhdrmp->b_next, !=, NULL);
1225 		prev_nhdrmp = nhdrmp;
1226 		nhdrmp = nhdrmp->b_next;
1227 	}
1228 
1229 	/* Make sure we are on the last segment. */
1230 	ASSERT3U(seg, ==, nsegs);
1231 	ASSERT3P(nhdrmp->b_next, ==, NULL);
1232 
1233 	/*
1234 	 * Now we set the last segment header. The difference being
1235 	 * that FIN/PSH/RST flags are allowed.
1236 	 */
1237 	bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
1238 	nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1239 	niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1240 	niph->ipha_ident = htons(++ip_id);
1241 	len = msgsize(nhdrmp->b_cont);
1242 	ASSERT3S(len, >, 0);
1243 	niph->ipha_length = htons(oiphlen + otcphlen + len);
1244 	niph->ipha_hdr_checksum = 0;
1245 	ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1246 	U32_TO_BE32(tcp_seq, ntcph->th_seq);
1247 
1248 	DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1249 	if (ocsum_flags & HCK_PARTIALCKSUM) {
1250 		DB_CKSUMSTART(nhdrmp) = ocsum_start;
1251 		DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1252 		DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1253 		tcp_sum = otcp_sum;
1254 		tcp_sum += len + otcphlen;
1255 		tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1256 		U16_TO_BE16(tcp_sum, ntcph->th_sum);
1257 	}
1258 
1259 	if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1260 	    (emul & MAC_HWCKSUM_EMULS)) {
1261 		/* This should be the last mblk. */
1262 		ASSERT3P(nhdrmp->b_next, ==, NULL);
1263 		nhdrmp = mac_sw_cksum(nhdrmp, emul);
1264 		prev_nhdrmp->b_next = nhdrmp;
1265 	}
1266 
1267 	DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1268 	    (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1269 	    (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len,
1270 	    uint_t, seg);
1271 
1272 	/*
1273 	 * Free the reference to the original LSO message as it is
	 * being replaced by seg_chain.
1275 	 */
1276 	freemsg(omp);
1277 	*head = seg_chain;
1278 	*tail = nhdrmp;
1279 	*count = nsegs;
1280 	return;
1281 
1282 fail:
1283 	*head = NULL;
1284 	*tail = NULL;
1285 	*count = 0;
1286 }
1287 
/*
 * Mask of the checksum-offload request flags. mac_hw_emul() tests a
 * packet's DB_CKSUMFLAGS against this mask to decide whether software
 * checksum emulation applies to it.
 */
#define	HCK_NEEDED	(HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM)
1289 
1290 /*
1291  * Emulate various hardware offload features in software. Take a chain
1292  * of packets as input and emulate the hardware features specified in
1293  * 'emul'. The resulting chain's head pointer replaces the 'mp_chain'
1294  * pointer given as input, and its tail pointer is written to
1295  * '*otail'. The number of packets in the new chain is written to
1296  * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus
1297  * may be NULL. The 'mp_chain' argument may point to a NULL chain; in
1298  * which case 'mp_chain' will simply stay a NULL chain.
1299  *
1300  * While unlikely, it is technically possible that this function could
1301  * receive a non-NULL chain as input and return a NULL chain as output
1302  * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be
1303  * zero). This could happen if all the packets in the chain are
1304  * dropped or if we fail to allocate new mblks. In this case, there is
1305  * nothing for the caller to free. In any event, the caller shouldn't
1306  * assume that '*mp_chain' is non-NULL on return.
1307  *
1308  * This function was written with three main use cases in mind.
1309  *
1310  * 1. To emulate hardware offloads when traveling mac-loopback (two
1311  *    clients on the same mac). This is wired up in mac_tx_send().
1312  *
1313  * 2. To provide hardware offloads to the client when the underlying
1314  *    provider cannot. This is currently wired up in mac_tx() but we
1315  *    still only negotiate offloads when the underlying provider
1316  *    supports them.
1317  *
1318  * 3. To emulate real hardware in simnet.
1319  */
1320 void
1321 mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul)
1322 {
1323 	mblk_t *head = NULL, *tail = NULL;
1324 	uint_t count = 0;
1325 
1326 	ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0);
1327 	ASSERT3P(mp_chain, !=, NULL);
1328 
1329 	for (mblk_t *mp = *mp_chain; mp != NULL; ) {
1330 		mblk_t *tmp, *next, *tmphead, *tmptail;
1331 		struct ether_header *ehp;
1332 		uint32_t flags;
1333 		uint_t len = MBLKL(mp), l2len;
1334 
1335 		/* Perform LSO/cksum one message at a time. */
1336 		next = mp->b_next;
1337 		mp->b_next = NULL;
1338 
1339 		/*
1340 		 * For our sanity the first mblk should contain at
1341 		 * least the full L2 header.
1342 		 */
1343 		if (len < sizeof (struct ether_header)) {
1344 			mac_drop_pkt(mp, "packet too short (A): %u", len);
1345 			mp = next;
1346 			continue;
1347 		}
1348 
1349 		ehp = (struct ether_header *)mp->b_rptr;
1350 		if (ntohs(ehp->ether_type) == VLAN_TPID)
1351 			l2len = sizeof (struct ether_vlan_header);
1352 		else
1353 			l2len = sizeof (struct ether_header);
1354 
1355 		/*
1356 		 * If the first mblk is solely the L2 header, then
1357 		 * there better be more data.
1358 		 */
1359 		if (len < l2len || (len == l2len && mp->b_cont == NULL)) {
1360 			mac_drop_pkt(mp, "packet too short (C): %u", len);
1361 			mp = next;
1362 			continue;
1363 		}
1364 
1365 		DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul);
1366 
1367 		/*
1368 		 * We use DB_CKSUMFLAGS (instead of mac_hcksum_get())
1369 		 * because we don't want to mask-out the LSO flag.
1370 		 */
1371 		flags = DB_CKSUMFLAGS(mp);
1372 
1373 		if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) {
1374 			uint_t tmpcount = 0;
1375 
1376 			/*
1377 			 * LSO fix-up handles checksum emulation
1378 			 * inline (if requested). It also frees mp.
1379 			 */
1380 			mac_sw_lso(mp, emul, &tmphead, &tmptail,
1381 			    &tmpcount);
1382 			if (tmphead == NULL) {
1383 				/* mac_sw_lso() freed the mp. */
1384 				mp = next;
1385 				continue;
1386 			}
1387 			count += tmpcount;
1388 		} else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) {
1389 			tmp = mac_sw_cksum(mp, emul);
1390 			if (tmp == NULL) {
1391 				/* mac_sw_cksum() freed the mp. */
1392 				mp = next;
1393 				continue;
1394 			}
1395 			tmphead = tmp;
1396 			tmptail = tmp;
1397 			count++;
1398 		} else {
1399 			/* There is nothing to emulate. */
1400 			tmp = mp;
1401 			tmphead = tmp;
1402 			tmptail = tmp;
1403 			count++;
1404 		}
1405 
1406 		/*
1407 		 * The tmp mblk chain is either the start of the new
1408 		 * chain or added to the tail of the new chain.
1409 		 */
1410 		if (head == NULL) {
1411 			head = tmphead;
1412 			tail = tmptail;
1413 		} else {
1414 			/* Attach the new mblk to the end of the new chain. */
1415 			tail->b_next = tmphead;
1416 			tail = tmptail;
1417 		}
1418 
1419 		mp = next;
1420 	}
1421 
1422 	*mp_chain = head;
1423 
1424 	if (otail != NULL)
1425 		*otail = tail;
1426 
1427 	if (ocount != NULL)
1428 		*ocount = count;
1429 }
1430 
1431 /*
1432  * Add VLAN tag to the specified mblk.
1433  */
1434 mblk_t *
1435 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
1436 {
1437 	mblk_t *hmp;
1438 	struct ether_vlan_header *evhp;
1439 	struct ether_header *ehp;
1440 
1441 	ASSERT(pri != 0 || vid != 0);
1442 
1443 	/*
1444 	 * Allocate an mblk for the new tagged ethernet header,
1445 	 * and copy the MAC addresses and ethertype from the
1446 	 * original header.
1447 	 */
1448 
1449 	hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
1450 	if (hmp == NULL) {
1451 		freemsg(mp);
1452 		return (NULL);
1453 	}
1454 
1455 	evhp = (struct ether_vlan_header *)hmp->b_rptr;
1456 	ehp = (struct ether_header *)mp->b_rptr;
1457 
1458 	bcopy(ehp, evhp, (ETHERADDRL * 2));
1459 	evhp->ether_type = ehp->ether_type;
1460 	evhp->ether_tpid = htons(ETHERTYPE_VLAN);
1461 
1462 	hmp->b_wptr += sizeof (struct ether_vlan_header);
1463 	mp->b_rptr += sizeof (struct ether_header);
1464 
1465 	/*
1466 	 * Free the original message if it's now empty. Link the
1467 	 * rest of messages to the header message.
1468 	 */
1469 	mac_hcksum_clone(mp, hmp);
1470 	if (MBLKL(mp) == 0) {
1471 		hmp->b_cont = mp->b_cont;
1472 		freeb(mp);
1473 	} else {
1474 		hmp->b_cont = mp;
1475 	}
1476 	ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
1477 
1478 	/*
1479 	 * Initialize the new TCI (Tag Control Information).
1480 	 */
1481 	evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
1482 
1483 	return (hmp);
1484 }
1485 
1486 /*
1487  * Adds a VLAN tag with the specified VID and priority to each mblk of
1488  * the specified chain.
1489  */
1490 mblk_t *
1491 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
1492 {
1493 	mblk_t *next_mp, **prev, *mp;
1494 
1495 	mp = mp_chain;
1496 	prev = &mp_chain;
1497 
1498 	while (mp != NULL) {
1499 		next_mp = mp->b_next;
1500 		mp->b_next = NULL;
1501 		if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
1502 			freemsgchain(next_mp);
1503 			break;
1504 		}
1505 		*prev = mp;
1506 		prev = &mp->b_next;
1507 		mp = mp->b_next = next_mp;
1508 	}
1509 
1510 	return (mp_chain);
1511 }
1512 
1513 /*
1514  * Strip VLAN tag
1515  */
1516 mblk_t *
1517 mac_strip_vlan_tag(mblk_t *mp)
1518 {
1519 	mblk_t *newmp;
1520 	struct ether_vlan_header *evhp;
1521 
1522 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1523 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
1524 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1525 
1526 		if (DB_REF(mp) > 1) {
1527 			newmp = copymsg(mp);
1528 			if (newmp == NULL)
1529 				return (NULL);
1530 			freemsg(mp);
1531 			mp = newmp;
1532 		}
1533 
1534 		evhp = (struct ether_vlan_header *)mp->b_rptr;
1535 
1536 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1537 		mp->b_rptr += VLAN_TAGSZ;
1538 	}
1539 	return (mp);
1540 }
1541 
1542 /*
1543  * Strip VLAN tag from each mblk of the chain.
1544  */
1545 mblk_t *
1546 mac_strip_vlan_tag_chain(mblk_t *mp_chain)
1547 {
1548 	mblk_t *mp, *next_mp, **prev;
1549 
1550 	mp = mp_chain;
1551 	prev = &mp_chain;
1552 
1553 	while (mp != NULL) {
1554 		next_mp = mp->b_next;
1555 		mp->b_next = NULL;
1556 		if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
1557 			freemsgchain(next_mp);
1558 			break;
1559 		}
1560 		*prev = mp;
1561 		prev = &mp->b_next;
1562 		mp = mp->b_next = next_mp;
1563 	}
1564 
1565 	return (mp_chain);
1566 }
1567 
/*
 * Default callback function. Used when the datapath is not yet initialized.
 *
 * The entire packet chain is freed; 'arg', 'resource' and 'loopback' are
 * not used.
 */
/* ARGSUSED */
void
mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain,
    boolean_t loopback)
{
	freemsgchain(mp_chain);
}
1578 
/*
 * Determines the IPv6 header length accounting for all the optional IPv6
 * headers (hop-by-hop, destination, routing and fragment). The header length
 * and next header value (a transport header) is captured.
 *
 * Arguments:
 *	ip6h		start of the fixed IPv6 header
 *	endptr		first byte past the data that may be examined
 *	hdr_length	out: length of the fixed header plus the extension
 *			headers that were walked
 *	next_hdr	out: protocol value following those headers
 *	fragp		out, optional: set to the fragment header if one is
 *			encountered, otherwise NULL. It is only written once
 *			the fixed header is known to fit before endptr.
 *
 * Returns B_FALSE if all the IP headers are not in the same mblk otherwise
 * returns B_TRUE.
 */
boolean_t
mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
    uint8_t *next_hdr, ip6_frag_t **fragp)
{
	uint16_t length;
	uint_t	ehdrlen;
	uint8_t *whereptr;
	uint8_t *nexthdrp;
	ip6_dest_t *desthdr;
	ip6_rthdr_t *rthdr;
	ip6_frag_t *fraghdr;

	/* The fixed header itself must fit before endptr. */
	if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
		return (B_FALSE);
	ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
	length = IPV6_HDR_LEN;
	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */

	if (fragp != NULL)
		*fragp = NULL;

	/*
	 * nexthdrp always points at the "next header" field of the header
	 * most recently walked; whereptr points at the header it describes.
	 */
	nexthdrp = &ip6h->ip6_nxt;
	while (whereptr < endptr) {
		/* Is there enough left for len + nexthdr? */
		if (whereptr + MIN_EHDR_LEN > endptr)
			break;

		switch (*nexthdrp) {
		case IPPROTO_HOPOPTS:
		case IPPROTO_DSTOPTS:
			/* Assumes the headers are identical for hbh and dst */
			desthdr = (ip6_dest_t *)whereptr;
			/* Length field counts 8-byte units beyond the first. */
			ehdrlen = 8 * (desthdr->ip6d_len + 1);
			if ((uchar_t *)desthdr +  ehdrlen > endptr)
				return (B_FALSE);
			nexthdrp = &desthdr->ip6d_nxt;
			break;
		case IPPROTO_ROUTING:
			rthdr = (ip6_rthdr_t *)whereptr;
			ehdrlen =  8 * (rthdr->ip6r_len + 1);
			if ((uchar_t *)rthdr +  ehdrlen > endptr)
				return (B_FALSE);
			nexthdrp = &rthdr->ip6r_nxt;
			break;
		case IPPROTO_FRAGMENT:
			/* The fragment header has a fixed size. */
			fraghdr = (ip6_frag_t *)whereptr;
			ehdrlen = sizeof (ip6_frag_t);
			if ((uchar_t *)&fraghdr[1] > endptr)
				return (B_FALSE);
			nexthdrp = &fraghdr->ip6f_nxt;
			if (fragp != NULL)
				*fragp = fraghdr;
			break;
		case IPPROTO_NONE:
			/* No next header means we're finished */
		default:
			/* Anything else is not an extension header we walk. */
			*hdr_length = length;
			*next_hdr = *nexthdrp;
			return (B_TRUE);
		}
		/* Step over the extension header and record progress. */
		length += ehdrlen;
		whereptr += ehdrlen;
		*hdr_length = length;
		*next_hdr = *nexthdrp;
	}
	/*
	 * We ran out of examinable data. Whether that is acceptable depends
	 * on what the last header walked says comes next.
	 */
	switch (*nexthdrp) {
	case IPPROTO_HOPOPTS:
	case IPPROTO_DSTOPTS:
	case IPPROTO_ROUTING:
	case IPPROTO_FRAGMENT:
		/*
		 * If any known extension headers are still to be processed,
		 * the packet's malformed (or at least all the IP header(s)
		 * are not in the same mblk - and that should never happen).
		 */
		return (B_FALSE);

	default:
		/*
		 * If we get here, we know that all of the IP headers were in
		 * the same mblk, even if the ULP header is in the next mblk.
		 */
		*hdr_length = length;
		*next_hdr = *nexthdrp;
		return (B_TRUE);
	}
}
1674 
/*
 * The following set of routines are there to take care of interrupt
 * re-targeting for legacy (fixed) interrupts. Some older versions
 * of the popular NICs like e1000g do not support MSI-X interrupts
 * and they reserve fixed interrupts for RX/TX rings. To re-target
 * these interrupts, PCITOOL ioctls need to be used.
 *
 * mac_dladm_intr_t carries the state shared by those routines while a
 * device's interrupt is being looked up.
 */
typedef struct mac_dladm_intr {
	int	ino;		/* set by mac_search_intrinfo() on a match */
	int	cpu_id;		/* set by mac_search_intrinfo() on a match */
	char	driver_path[MAXPATHLEN];	/* minor path of the device */
	char	nexus_path[MAXPATHLEN];	/* "/devices<nexus>:intr" path */
} mac_dladm_intr_t;
1688 
1689 /* Bind the interrupt to cpu_num */
1690 static int
1691 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
1692 {
1693 	pcitool_intr_set_t	iset;
1694 	int			err;
1695 
1696 	iset.old_cpu = oldcpuid;
1697 	iset.ino = ino;
1698 	iset.cpu_id = cpu_num;
1699 	iset.user_version = PCITOOL_VERSION;
1700 	err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
1701 	    kcred, NULL);
1702 
1703 	return (err);
1704 }
1705 
1706 /*
1707  * Search interrupt information. iget is filled in with the info to search
1708  */
1709 static boolean_t
1710 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
1711 {
1712 	int	i;
1713 	char	driver_path[2 * MAXPATHLEN];
1714 
1715 	for (i = 0; i < iget_p->num_devs; i++) {
1716 		(void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
1717 		(void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
1718 		    ":%s%d", iget_p->dev[i].driver_name,
1719 		    iget_p->dev[i].dev_inst);
1720 		/* Match the device path for the device path */
1721 		if (strcmp(driver_path, dln->driver_path) == 0) {
1722 			dln->ino = iget_p->ino;
1723 			dln->cpu_id = iget_p->cpu_id;
1724 			return (B_TRUE);
1725 		}
1726 	}
1727 	return (B_FALSE);
1728 }
1729 
1730 /*
1731  * Get information about ino, i.e. if this is the interrupt for our
1732  * device and where it is bound etc.
1733  */
1734 static boolean_t
1735 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
1736     mac_dladm_intr_t *dln)
1737 {
1738 	pcitool_intr_get_t	*iget_p;
1739 	int			ipsz;
1740 	int			nipsz;
1741 	int			err;
1742 	uint8_t			inum;
1743 
1744 	/*
1745 	 * Check if SLEEP is OK, i.e if could come here in response to
1746 	 * changing the fanout due to some callback from the driver, say
1747 	 * link speed changes.
1748 	 */
1749 	ipsz = PCITOOL_IGET_SIZE(0);
1750 	iget_p = kmem_zalloc(ipsz, KM_SLEEP);
1751 
1752 	iget_p->num_devs_ret = 0;
1753 	iget_p->user_version = PCITOOL_VERSION;
1754 	iget_p->cpu_id = oldcpuid;
1755 	iget_p->ino = ino;
1756 
1757 	err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
1758 	    FKIOCTL, kcred, NULL);
1759 	if (err != 0) {
1760 		kmem_free(iget_p, ipsz);
1761 		return (B_FALSE);
1762 	}
1763 	if (iget_p->num_devs == 0) {
1764 		kmem_free(iget_p, ipsz);
1765 		return (B_FALSE);
1766 	}
1767 	inum = iget_p->num_devs;
1768 	if (iget_p->num_devs_ret < iget_p->num_devs) {
1769 		/* Reallocate */
1770 		nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
1771 
1772 		kmem_free(iget_p, ipsz);
1773 		ipsz = nipsz;
1774 		iget_p = kmem_zalloc(ipsz, KM_SLEEP);
1775 
1776 		iget_p->num_devs_ret = inum;
1777 		iget_p->cpu_id = oldcpuid;
1778 		iget_p->ino = ino;
1779 		iget_p->user_version = PCITOOL_VERSION;
1780 		err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
1781 		    FKIOCTL, kcred, NULL);
1782 		if (err != 0) {
1783 			kmem_free(iget_p, ipsz);
1784 			return (B_FALSE);
1785 		}
1786 		/* defensive */
1787 		if (iget_p->num_devs != iget_p->num_devs_ret) {
1788 			kmem_free(iget_p, ipsz);
1789 			return (B_FALSE);
1790 		}
1791 	}
1792 
1793 	if (mac_search_intrinfo(iget_p, dln)) {
1794 		kmem_free(iget_p, ipsz);
1795 		return (B_TRUE);
1796 	}
1797 	kmem_free(iget_p, ipsz);
1798 	return (B_FALSE);
1799 }
1800 
1801 /*
1802  * Get the interrupts and check each one to see if it is for our device.
1803  */
1804 static int
1805 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
1806 {
1807 	pcitool_intr_info_t	intr_info;
1808 	int			err;
1809 	int			ino;
1810 	int			oldcpuid;
1811 
1812 	err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
1813 	    FKIOCTL, kcred, NULL);
1814 	if (err != 0)
1815 		return (-1);
1816 
1817 	for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
1818 		for (ino = 0; ino < intr_info.num_intr; ino++) {
1819 			if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
1820 				if (dln->cpu_id == cpuid)
1821 					return (0);
1822 				return (1);
1823 			}
1824 		}
1825 	}
1826 	return (-1);
1827 }
1828 
1829 /*
1830  * Obtain the nexus parent node info. for mdip.
1831  */
1832 static dev_info_t *
1833 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
1834 {
1835 	struct dev_info		*tdip = (struct dev_info *)mdip;
1836 	struct ddi_minor_data	*minordata;
1837 	dev_info_t		*pdip;
1838 	char			pathname[MAXPATHLEN];
1839 
1840 	while (tdip != NULL) {
1841 		/*
1842 		 * The netboot code could call this function while walking the
1843 		 * device tree so we need to use ndi_devi_tryenter() here to
1844 		 * avoid deadlock.
1845 		 */
1846 		if (ndi_devi_tryenter((dev_info_t *)tdip) == 0)
1847 			break;
1848 
1849 		for (minordata = tdip->devi_minor; minordata != NULL;
1850 		    minordata = minordata->next) {
1851 			if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
1852 			    strlen(DDI_NT_INTRCTL)) == 0) {
1853 				pdip = minordata->dip;
1854 				(void) ddi_pathname(pdip, pathname);
1855 				(void) snprintf(dln->nexus_path, MAXPATHLEN,
1856 				    "/devices%s:intr", pathname);
1857 				(void) ddi_pathname_minor(minordata, pathname);
1858 				ndi_devi_exit((dev_info_t *)tdip);
1859 				return (pdip);
1860 			}
1861 		}
1862 		ndi_devi_exit((dev_info_t *)tdip);
1863 		tdip = tdip->devi_parent;
1864 	}
1865 	return (NULL);
1866 }
1867 
1868 /*
1869  * For a primary MAC client, if the user has set a list or CPUs or
1870  * we have obtained it implicitly, we try to retarget the interrupt
1871  * for that device on one of the CPUs in the list.
1872  * We assign the interrupt to the same CPU as the poll thread.
1873  */
1874 static boolean_t
1875 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
1876 {
1877 	ldi_handle_t		lh = NULL;
1878 	ldi_ident_t		li = NULL;
1879 	int			err;
1880 	int			ret;
1881 	mac_dladm_intr_t	dln;
1882 	dev_info_t		*dip;
1883 	struct ddi_minor_data	*minordata;
1884 
1885 	dln.nexus_path[0] = '\0';
1886 	dln.driver_path[0] = '\0';
1887 
1888 	minordata = ((struct dev_info *)mdip)->devi_minor;
1889 	while (minordata != NULL) {
1890 		if (minordata->type == DDM_MINOR)
1891 			break;
1892 		minordata = minordata->next;
1893 	}
1894 	if (minordata == NULL)
1895 		return (B_FALSE);
1896 
1897 	(void) ddi_pathname_minor(minordata, dln.driver_path);
1898 
1899 	dip = mac_get_nexus_node(mdip, &dln);
1900 	/* defensive */
1901 	if (dip == NULL)
1902 		return (B_FALSE);
1903 
1904 	err = ldi_ident_from_major(ddi_driver_major(dip), &li);
1905 	if (err != 0)
1906 		return (B_FALSE);
1907 
1908 	err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
1909 	if (err != 0)
1910 		return (B_FALSE);
1911 
1912 	ret = mac_validate_intr(lh, &dln, cpuid);
1913 	if (ret < 0) {
1914 		(void) ldi_close(lh, FREAD|FWRITE, kcred);
1915 		return (B_FALSE);
1916 	}
1917 	/* cmn_note? */
1918 	if (ret != 0)
1919 		if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
1920 		    != 0) {
1921 			(void) ldi_close(lh, FREAD|FWRITE, kcred);
1922 			return (B_FALSE);
1923 		}
1924 	(void) ldi_close(lh, FREAD|FWRITE, kcred);
1925 	return (B_TRUE);
1926 }
1927 
1928 void
1929 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
1930 {
1931 	dev_info_t		*mdip = (dev_info_t *)arg;
1932 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1933 	mac_resource_props_t	*mrp;
1934 	mac_perim_handle_t	mph;
1935 	flow_entry_t		*flent = mcip->mci_flent;
1936 	mac_soft_ring_set_t	*rx_srs;
1937 	mac_cpus_t		*srs_cpu;
1938 
1939 	if (!mac_check_interrupt_binding(mdip, cpuid))
1940 		cpuid = -1;
1941 	mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
1942 	mrp = MCIP_RESOURCE_PROPS(mcip);
1943 	mrp->mrp_rx_intr_cpu = cpuid;
1944 	if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
1945 		rx_srs = flent->fe_rx_srs[1];
1946 		srs_cpu = &rx_srs->srs_cpu;
1947 		srs_cpu->mc_rx_intr_cpu = cpuid;
1948 	}
1949 	mac_perim_exit(mph);
1950 }
1951 
1952 int32_t
1953 mac_client_intr_cpu(mac_client_handle_t mch)
1954 {
1955 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1956 	mac_cpus_t		*srs_cpu;
1957 	mac_soft_ring_set_t	*rx_srs;
1958 	flow_entry_t		*flent = mcip->mci_flent;
1959 	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
1960 	mac_ring_t		*ring;
1961 	mac_intr_t		*mintr;
1962 
1963 	/*
1964 	 * Check if we need to retarget the interrupt. We do this only
1965 	 * for the primary MAC client. We do this if we have the only
1966 	 * exclusive ring in the group.
1967 	 */
1968 	if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
1969 		rx_srs = flent->fe_rx_srs[1];
1970 		srs_cpu = &rx_srs->srs_cpu;
1971 		ring = rx_srs->srs_ring;
1972 		mintr = &ring->mr_info.mri_intr;
1973 		/*
1974 		 * If ddi_handle is present or the poll CPU is
1975 		 * already bound to the interrupt CPU, return -1.
1976 		 */
1977 		if (mintr->mi_ddi_handle != NULL ||
1978 		    ((mrp->mrp_ncpus != 0) &&
1979 		    (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
1980 			return (-1);
1981 		}
1982 		return (srs_cpu->mc_rx_pollid);
1983 	}
1984 	return (-1);
1985 }
1986 
1987 void *
1988 mac_get_devinfo(mac_handle_t mh)
1989 {
1990 	mac_impl_t	*mip = (mac_impl_t *)mh;
1991 
1992 	return ((void *)mip->mi_dip);
1993 }
1994 
/*
 * XOR-fold the first 2, 4 or 6 bytes addressed by `x' into a single value.
 * `x' is evaluated several times and must have no side effects.
 */
#define	PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
#define	PKT_HASH_4BYTES(x) (PKT_HASH_2BYTES(x) ^ (x)[2] ^ (x)[3])
#define	PKT_HASH_MAC(x) (PKT_HASH_4BYTES(x) ^ (x)[4] ^ (x)[5])
1998 
1999 uint64_t
2000 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
2001 {
2002 	struct ether_header *ehp;
2003 	uint64_t hash = 0;
2004 	uint16_t sap;
2005 	uint_t skip_len;
2006 	uint8_t proto;
2007 	boolean_t ip_fragmented;
2008 
2009 	/*
2010 	 * We may want to have one of these per MAC type plugin in the
2011 	 * future. For now supports only ethernet.
2012 	 */
2013 	if (media != DL_ETHER)
2014 		return (0L);
2015 
2016 	/* for now we support only outbound packets */
2017 	ASSERT(is_outbound);
2018 	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
2019 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
2020 
2021 	/* compute L2 hash */
2022 
2023 	ehp = (struct ether_header *)mp->b_rptr;
2024 
2025 	if ((policy & MAC_PKT_HASH_L2) != 0) {
2026 		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
2027 		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
2028 		hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
2029 		policy &= ~MAC_PKT_HASH_L2;
2030 	}
2031 
2032 	if (policy == 0)
2033 		goto done;
2034 
2035 	/* skip ethernet header */
2036 
2037 	sap = ntohs(ehp->ether_type);
2038 	if (sap == ETHERTYPE_VLAN) {
2039 		struct ether_vlan_header *evhp;
2040 		mblk_t *newmp = NULL;
2041 
2042 		skip_len = sizeof (struct ether_vlan_header);
2043 		if (MBLKL(mp) < skip_len) {
2044 			/* the vlan tag is the payload, pull up first */
2045 			newmp = msgpullup(mp, -1);
2046 			if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
2047 				goto done;
2048 			}
2049 			evhp = (struct ether_vlan_header *)newmp->b_rptr;
2050 		} else {
2051 			evhp = (struct ether_vlan_header *)mp->b_rptr;
2052 		}
2053 
2054 		sap = ntohs(evhp->ether_type);
2055 		freemsg(newmp);
2056 	} else {
2057 		skip_len = sizeof (struct ether_header);
2058 	}
2059 
2060 	/* if ethernet header is in its own mblk, skip it */
2061 	if (MBLKL(mp) <= skip_len) {
2062 		skip_len -= MBLKL(mp);
2063 		mp = mp->b_cont;
2064 		if (mp == NULL)
2065 			goto done;
2066 	}
2067 
2068 	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
2069 
2070 	/* compute IP src/dst addresses hash and skip IPv{4,6} header */
2071 
2072 	switch (sap) {
2073 	case ETHERTYPE_IP: {
2074 		ipha_t *iphp;
2075 
2076 		/*
2077 		 * If the header is not aligned or the header doesn't fit
2078 		 * in the mblk, bail now. Note that this may cause packets
2079 		 * reordering.
2080 		 */
2081 		iphp = (ipha_t *)(mp->b_rptr + skip_len);
2082 		if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
2083 		    !OK_32PTR((char *)iphp))
2084 			goto done;
2085 
2086 		proto = iphp->ipha_protocol;
2087 		skip_len += IPH_HDR_LENGTH(iphp);
2088 
2089 		/* Check if the packet is fragmented. */
2090 		ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
2091 		    IPH_OFFSET;
2092 
2093 		/*
2094 		 * For fragmented packets, use addresses in addition to
2095 		 * the frag_id to generate the hash inorder to get
2096 		 * better distribution.
2097 		 */
2098 		if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
2099 			uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
2100 			uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
2101 
2102 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
2103 			    PKT_HASH_4BYTES(ip_dst));
2104 			policy &= ~MAC_PKT_HASH_L3;
2105 		}
2106 
2107 		if (ip_fragmented) {
2108 			uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
2109 			hash ^= PKT_HASH_2BYTES(identp);
2110 			goto done;
2111 		}
2112 		break;
2113 	}
2114 	case ETHERTYPE_IPV6: {
2115 		ip6_t *ip6hp;
2116 		ip6_frag_t *frag = NULL;
2117 		uint16_t hdr_length;
2118 
2119 		/*
2120 		 * If the header is not aligned or the header doesn't fit
2121 		 * in the mblk, bail now. Note that this may cause packets
2122 		 * reordering.
2123 		 */
2124 
2125 		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
2126 		if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
2127 		    !OK_32PTR((char *)ip6hp))
2128 			goto done;
2129 
2130 		if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
2131 		    &proto, &frag))
2132 			goto done;
2133 		skip_len += hdr_length;
2134 
2135 		/*
2136 		 * For fragmented packets, use addresses in addition to
2137 		 * the frag_id to generate the hash inorder to get
2138 		 * better distribution.
2139 		 */
2140 		if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
2141 			uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
2142 			uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
2143 
2144 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
2145 			    PKT_HASH_4BYTES(ip_dst));
2146 			policy &= ~MAC_PKT_HASH_L3;
2147 		}
2148 
2149 		if (frag != NULL) {
2150 			uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
2151 			hash ^= PKT_HASH_4BYTES(identp);
2152 			goto done;
2153 		}
2154 		break;
2155 	}
2156 	default:
2157 		goto done;
2158 	}
2159 
2160 	if (policy == 0)
2161 		goto done;
2162 
2163 	/* if ip header is in its own mblk, skip it */
2164 	if (MBLKL(mp) <= skip_len) {
2165 		skip_len -= MBLKL(mp);
2166 		mp = mp->b_cont;
2167 		if (mp == NULL)
2168 			goto done;
2169 	}
2170 
2171 	/* parse ULP header */
2172 again:
2173 	switch (proto) {
2174 	case IPPROTO_TCP:
2175 	case IPPROTO_UDP:
2176 	case IPPROTO_ESP:
2177 	case IPPROTO_SCTP:
2178 		/*
2179 		 * These Internet Protocols are intentionally designed
2180 		 * for hashing from the git-go.  Port numbers are in the first
2181 		 * word for transports, SPI is first for ESP.
2182 		 */
2183 		if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
2184 			goto done;
2185 		hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
2186 		break;
2187 
2188 	case IPPROTO_AH: {
2189 		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
2190 		uint_t ah_length = AH_TOTAL_LEN(ah);
2191 
2192 		if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
2193 			goto done;
2194 
2195 		proto = ah->ah_nexthdr;
2196 		skip_len += ah_length;
2197 
2198 		/* if AH header is in its own mblk, skip it */
2199 		if (MBLKL(mp) <= skip_len) {
2200 			skip_len -= MBLKL(mp);
2201 			mp = mp->b_cont;
2202 			if (mp == NULL)
2203 				goto done;
2204 		}
2205 
2206 		goto again;
2207 	}
2208 	}
2209 
2210 done:
2211 	return (hash);
2212 }
2213