xref: /illumos-gate/usr/src/uts/common/io/mac/mac_util.c (revision ff67a31b6b184e832f89a53763c02c35bd1a7291)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2019 Joyent, Inc.
24  */
25 
26 /*
27  * MAC Services Module - misc utilities
28  */
29 
30 #include <sys/types.h>
31 #include <sys/mac.h>
32 #include <sys/mac_impl.h>
33 #include <sys/mac_client_priv.h>
34 #include <sys/mac_client_impl.h>
35 #include <sys/mac_soft_ring.h>
36 #include <sys/strsubr.h>
37 #include <sys/strsun.h>
38 #include <sys/vlan.h>
39 #include <sys/pattr.h>
40 #include <sys/pci_tools.h>
41 #include <inet/ip.h>
42 #include <inet/ip_impl.h>
43 #include <inet/ip6.h>
44 #include <sys/vtrace.h>
45 #include <sys/dlpi.h>
46 #include <sys/sunndi.h>
47 #include <inet/ipsec_impl.h>
48 #include <inet/sadb.h>
49 #include <inet/ipsecesp.h>
50 #include <inet/ipsecah.h>
51 #include <inet/tcp.h>
52 #include <inet/udp_impl.h>
53 #include <inet/sctp_ip.h>
54 
55 /*
56  * The next two functions are used for dropping packets or chains of
57  * packets, respectively. We could use one function for both but
58  * separating the use cases allows us to specify intent and prevent
59  * dropping more data than intended.
60  *
61  * The purpose of these functions is to aid the debugging effort,
62  * especially in production. Rather than use freemsg()/freemsgchain(),
63  * it's preferable to use these functions when dropping a packet in
64  * the MAC layer. These functions should only be used during
65  * unexpected conditions. That is, any time a packet is dropped
66  * outside of the regular, successful datapath. Consolidating all
67  * drops on these functions allows the user to trace one location and
68  * determine why the packet was dropped based on the msg. It also
69  * allows the user to inspect the packet before it is freed. Finally,
70  * it allows the user to avoid tracing freemsg()/freemsgchain() thus
71  * keeping the hot path running as efficiently as possible.
72  *
73  * NOTE: At this time not all MAC drops are aggregated on these
74  * functions; but that is the plan. This comment should be erased once
75  * completed.
76  */
77 
78 /*PRINTFLIKE2*/
79 void
80 mac_drop_pkt(mblk_t *mp, const char *fmt, ...)
81 {
82 	va_list adx;
83 	char msg[128];
84 	char *msgp = msg;
85 
86 	ASSERT3P(mp->b_next, ==, NULL);
87 
88 	va_start(adx, fmt);
89 	(void) vsnprintf(msgp, sizeof (msg), fmt, adx);
90 	va_end(adx);
91 
92 	DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
93 	freemsg(mp);
94 }
95 
96 /*PRINTFLIKE2*/
97 void
98 mac_drop_chain(mblk_t *chain, const char *fmt, ...)
99 {
100 	va_list adx;
101 	char msg[128];
102 	char *msgp = msg;
103 
104 	va_start(adx, fmt);
105 	(void) vsnprintf(msgp, sizeof (msg), fmt, adx);
106 	va_end(adx);
107 
108 	/*
109 	 * We could use freemsgchain() for the actual freeing but
110 	 * since we are already walking the chain to fire the dtrace
111 	 * probe we might as well free the msg here too.
112 	 */
113 	for (mblk_t *mp = chain, *next; mp != NULL; ) {
114 		next = mp->b_next;
115 		DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
116 		freemsg(mp);
117 		mp = next;
118 	}
119 }
120 
121 /*
122  * Copy an mblk, preserving its hardware checksum flags.
123  */
124 static mblk_t *
125 mac_copymsg_cksum(mblk_t *mp)
126 {
127 	mblk_t *mp1;
128 
129 	mp1 = copymsg(mp);
130 	if (mp1 == NULL)
131 		return (NULL);
132 
133 	mac_hcksum_clone(mp, mp1);
134 
135 	return (mp1);
136 }
137 
138 /*
139  * Copy an mblk chain, presenting the hardware checksum flags of the
140  * individual mblks.
141  */
142 mblk_t *
143 mac_copymsgchain_cksum(mblk_t *mp)
144 {
145 	mblk_t *nmp = NULL;
146 	mblk_t **nmpp = &nmp;
147 
148 	for (; mp != NULL; mp = mp->b_next) {
149 		if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
150 			freemsgchain(nmp);
151 			return (NULL);
152 		}
153 
154 		nmpp = &((*nmpp)->b_next);
155 	}
156 
157 	return (nmp);
158 }
159 
160 /*
161  * Calculate the ULP checksum for IPv4. Return true if the calculation
162  * was successful, or false if an error occurred. If the later, place
163  * an error message into '*err'.
164  */
165 static boolean_t
166 mac_sw_cksum_ipv4(mblk_t *mp, uint32_t ip_hdr_offset, ipha_t *ipha,
167     const char **err)
168 {
169 	const uint8_t proto = ipha->ipha_protocol;
170 	size_t len;
171 	const uint32_t ip_hdr_sz = IPH_HDR_LENGTH(ipha);
172 	/* ULP offset from start of L2. */
173 	const uint32_t ulp_offset = ip_hdr_offset + ip_hdr_sz;
174 	ipaddr_t src, dst;
175 	uint32_t cksum;
176 	uint16_t *up;
177 
178 	/*
179 	 * We need a pointer to the ULP checksum. We're assuming the
180 	 * ULP checksum pointer resides in the first mblk. Our native
181 	 * TCP stack should always put the headers in the first mblk,
182 	 * but currently we have no way to guarantee that other
183 	 * clients don't spread headers (or even header fields) across
184 	 * mblks.
185 	 */
186 	switch (proto) {
187 	case IPPROTO_TCP:
188 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t)));
189 		if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) {
190 			*err = "mblk doesn't contain TCP header";
191 			goto bail;
192 		}
193 
194 		up = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_sz);
195 		cksum = IP_TCP_CSUM_COMP;
196 		break;
197 
198 	case IPPROTO_UDP:
199 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t)));
200 		if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) {
201 			*err = "mblk doesn't contain UDP header";
202 			goto bail;
203 		}
204 
205 		up = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_sz);
206 		cksum = IP_UDP_CSUM_COMP;
207 		break;
208 
209 	case IPPROTO_SCTP: {
210 		sctp_hdr_t *sctph;
211 
212 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t)));
213 		if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) {
214 			*err = "mblk doesn't contain SCTP header";
215 			goto bail;
216 		}
217 
218 		sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset);
219 		sctph->sh_chksum = 0;
220 		sctph->sh_chksum = sctp_cksum(mp, ulp_offset);
221 		return (B_TRUE);
222 	}
223 
224 	default:
225 		*err = "unexpected protocol";
226 		goto bail;
227 
228 	}
229 
230 	/* Pseudo-header checksum. */
231 	src = ipha->ipha_src;
232 	dst = ipha->ipha_dst;
233 	len = ntohs(ipha->ipha_length) - ip_hdr_sz;
234 
235 	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
236 	cksum += htons(len);
237 
238 	/*
239 	 * We have already accounted for the pseudo checksum above.
240 	 * Make sure the ULP checksum field is zero before computing
241 	 * the rest.
242 	 */
243 	*up = 0;
244 	cksum = IP_CSUM(mp, ulp_offset, cksum);
245 	*up = (uint16_t)(cksum ? cksum : ~cksum);
246 
247 	return (B_TRUE);
248 
249 bail:
250 	return (B_FALSE);
251 }
252 
253 /*
254  * Calculate the ULP checksum for IPv6. Return true if the calculation
255  * was successful, or false if an error occurred. If the later, place
256  * an error message into '*err'.
257  */
258 static boolean_t
259 mac_sw_cksum_ipv6(mblk_t *mp, uint32_t ip_hdr_offset, const char **err)
260 {
261 	ip6_t *ip6h = (ip6_t *)(mp->b_rptr + ip_hdr_offset);
262 	const uint8_t proto = ip6h->ip6_nxt;
263 	const uint16_t *iphs = (uint16_t *)ip6h;
264 	/* ULP offset from start of L2. */
265 	uint32_t ulp_offset;
266 	size_t len;
267 	uint32_t cksum;
268 	uint16_t *up;
269 	uint16_t ip_hdr_sz;
270 
271 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_sz, NULL)) {
272 		*err = "malformed IPv6 header";
273 		goto bail;
274 	}
275 
276 	ulp_offset = ip_hdr_offset + ip_hdr_sz;
277 
278 	/*
279 	 * We need a pointer to the ULP checksum. We're assuming the
280 	 * ULP checksum pointer resides in the first mblk. Our native
281 	 * TCP stack should always put the headers in the first mblk,
282 	 * but currently we have no way to guarantee that other
283 	 * clients don't spread headers (or even header fields) across
284 	 * mblks.
285 	 */
286 	switch (proto) {
287 	case IPPROTO_TCP:
288 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t)));
289 		if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) {
290 			*err = "mblk doesn't contain TCP header";
291 			goto bail;
292 		}
293 
294 		up = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_sz);
295 		cksum = IP_TCP_CSUM_COMP;
296 		break;
297 
298 	case IPPROTO_UDP:
299 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t)));
300 		if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) {
301 			*err = "mblk doesn't contain UDP header";
302 			goto bail;
303 		}
304 
305 		up = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_sz);
306 		cksum = IP_UDP_CSUM_COMP;
307 		break;
308 
309 	case IPPROTO_SCTP: {
310 		sctp_hdr_t *sctph;
311 
312 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t)));
313 		if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) {
314 			*err = "mblk doesn't contain SCTP header";
315 			goto bail;
316 		}
317 
318 		sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset);
319 		/*
320 		 * Zero out the checksum field to ensure proper
321 		 * checksum calculation.
322 		 */
323 		sctph->sh_chksum = 0;
324 		sctph->sh_chksum = sctp_cksum(mp, ulp_offset);
325 		return (B_TRUE);
326 	}
327 
328 	default:
329 		*err = "unexpected protocol";
330 		goto bail;
331 	}
332 
333 	/*
334 	 * The payload length includes the payload and the IPv6
335 	 * extension headers; the idea is to subtract the extension
336 	 * header length to get the real payload length.
337 	 */
338 	len = ntohs(ip6h->ip6_plen) - (ip_hdr_sz - IPV6_HDR_LEN);
339 	cksum += len;
340 
341 	/*
342 	 * We accumulate the pseudo header checksum in cksum; then we
343 	 * call IP_CSUM to compute the checksum over the payload.
344 	 */
345 	cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + iphs[8] + iphs[9] +
346 	    iphs[10] + iphs[11] + iphs[12] + iphs[13] + iphs[14] + iphs[15] +
347 	    iphs[16] + iphs[17] + iphs[18] + iphs[19];
348 	cksum = IP_CSUM(mp, ulp_offset, cksum);
349 
350 	/* For UDP/IPv6 a zero UDP checksum is not allowed. Change to 0xffff */
351 	if (proto == IPPROTO_UDP && cksum == 0)
352 		cksum = ~cksum;
353 
354 	*up = (uint16_t)cksum;
355 
356 	return (B_TRUE);
357 
358 bail:
359 	return (B_FALSE);
360 }
361 
362 /*
363  * Perform software checksum on a single message, if needed. The
364  * emulation performed is determined by an intersection of the mblk's
365  * flags and the emul flags requested. The emul flags are documented
366  * in mac.h.
367  */
368 static mblk_t *
369 mac_sw_cksum(mblk_t *mp, mac_emul_t emul)
370 {
371 	mblk_t *skipped_hdr = NULL;
372 	uint32_t flags, start, stuff, end, value;
373 	uint32_t ip_hdr_offset;
374 	uint16_t etype;
375 	size_t ip_hdr_sz;
376 	struct ether_header *ehp;
377 	const char *err = "";
378 
379 	/*
380 	 * This function should only be called from mac_hw_emul()
381 	 * which handles mblk chains and the shared ref case.
382 	 */
383 	ASSERT3P(mp->b_next, ==, NULL);
384 
385 	mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL);
386 
387 	flags = DB_CKSUMFLAGS(mp);
388 
389 	/* Why call this if checksum emulation isn't needed? */
390 	ASSERT3U(flags & (HCK_FLAGS), !=, 0);
391 
392 	/*
393 	 * Ethernet, and optionally VLAN header. mac_hw_emul() has
394 	 * already verified we have enough data to read the L2 header.
395 	 */
396 	ehp = (struct ether_header *)mp->b_rptr;
397 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
398 		struct ether_vlan_header *evhp;
399 
400 		evhp = (struct ether_vlan_header *)mp->b_rptr;
401 		etype = ntohs(evhp->ether_type);
402 		ip_hdr_offset = sizeof (struct ether_vlan_header);
403 	} else {
404 		etype = ntohs(ehp->ether_type);
405 		ip_hdr_offset = sizeof (struct ether_header);
406 	}
407 
408 	/*
409 	 * If this packet isn't IP, then leave it alone. We don't want
410 	 * to affect non-IP traffic like ARP. Assume the IP header
411 	 * doesn't include any options, for now. We will use the
412 	 * correct size later after we know there are enough bytes to
413 	 * at least fill out the basic header.
414 	 */
415 	switch (etype) {
416 	case ETHERTYPE_IP:
417 		ip_hdr_sz = sizeof (ipha_t);
418 		break;
419 	case ETHERTYPE_IPV6:
420 		ip_hdr_sz = sizeof (ip6_t);
421 		break;
422 	default:
423 		return (mp);
424 	}
425 
426 	ASSERT3U(MBLKL(mp), >=, ip_hdr_offset);
427 
428 	/*
429 	 * If the first mblk of this packet contains only the ethernet
430 	 * header, skip past it for now. Packets with their data
431 	 * contained in only a single mblk can then use the fastpaths
432 	 * tuned to that possibility.
433 	 */
434 	if (MBLKL(mp) == ip_hdr_offset) {
435 		ip_hdr_offset -= MBLKL(mp);
436 		/* This is guaranteed by mac_hw_emul(). */
437 		ASSERT3P(mp->b_cont, !=, NULL);
438 		skipped_hdr = mp;
439 		mp = mp->b_cont;
440 	}
441 
442 	/*
443 	 * Both full and partial checksum rely on finding the IP
444 	 * header in the current mblk. Our native TCP stack honors
445 	 * this assumption but it's prudent to guard our future
446 	 * clients that might not honor this contract.
447 	 */
448 	ASSERT3U(MBLKL(mp), >=, ip_hdr_offset + ip_hdr_sz);
449 	if (MBLKL(mp) < (ip_hdr_offset + ip_hdr_sz)) {
450 		err = "mblk doesn't contain IP header";
451 		goto bail;
452 	}
453 
454 	/*
455 	 * We are about to modify the header mblk; make sure we are
456 	 * modifying our own copy. The code that follows assumes that
457 	 * the IP/ULP headers exist in this mblk (and drops the
458 	 * message if they don't).
459 	 */
460 	if (DB_REF(mp) > 1) {
461 		mblk_t *tmp = copyb(mp);
462 
463 		if (tmp == NULL) {
464 			err = "copyb failed";
465 			goto bail;
466 		}
467 
468 		if (skipped_hdr != NULL) {
469 			ASSERT3P(skipped_hdr->b_cont, ==, mp);
470 			skipped_hdr->b_cont = tmp;
471 		}
472 
473 		tmp->b_cont = mp->b_cont;
474 		freeb(mp);
475 		mp = tmp;
476 	}
477 
478 	if (etype == ETHERTYPE_IP) {
479 		ipha_t *ipha = (ipha_t *)(mp->b_rptr + ip_hdr_offset);
480 
481 		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
482 			if (!mac_sw_cksum_ipv4(mp, ip_hdr_offset, ipha, &err))
483 				goto bail;
484 		}
485 
486 		/* We always update the ULP checksum flags. */
487 		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
488 			flags &= ~HCK_FULLCKSUM;
489 			flags |= HCK_FULLCKSUM_OK;
490 			value = 0;
491 		}
492 
493 		/*
494 		 * While unlikely, it's possible to write code that
495 		 * might end up calling mac_sw_cksum() twice on the
496 		 * same mblk (performing both LSO and checksum
497 		 * emualtion in a single mblk chain loop -- the LSO
498 		 * emulation inserts a new chain into the existing
499 		 * chain and then the loop iterates back over the new
500 		 * segments and emulates the checksum a second time).
501 		 * Normally this wouldn't be a problem, because the
502 		 * HCK_*_OK flags are supposed to indicate that we
503 		 * don't need to do peform the work. But
504 		 * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the
505 		 * same value; so we cannot use these flags to
506 		 * determine if the IP header checksum has already
507 		 * been calculated or not. For this reason, we zero
508 		 * out the the checksum first. In the future, we
509 		 * should fix the HCK_* flags.
510 		 */
511 		if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
512 			ipha->ipha_hdr_checksum = 0;
513 			ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
514 			flags &= ~HCK_IPV4_HDRCKSUM;
515 			flags |= HCK_IPV4_HDRCKSUM_OK;
516 		}
517 	} else if (etype == ETHERTYPE_IPV6) {
518 		/* There is no IP header checksum for IPv6. */
519 		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
520 			if (!mac_sw_cksum_ipv6(mp, ip_hdr_offset, &err))
521 				goto bail;
522 			flags &= ~HCK_FULLCKSUM;
523 			flags |= HCK_FULLCKSUM_OK;
524 			value = 0;
525 		}
526 	}
527 
528 	/*
529 	 * Partial checksum is the same for both IPv4 and IPv6.
530 	 */
531 	if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
532 		uint16_t *up, partial, cksum;
533 		uchar_t *ipp; /* ptr to beginning of IP header */
534 
535 		ipp = mp->b_rptr + ip_hdr_offset;
536 		up = (uint16_t *)((uchar_t *)ipp + stuff);
537 		partial = *up;
538 		*up = 0;
539 
540 		ASSERT3S(end, >, start);
541 		cksum = ~IP_CSUM_PARTIAL(mp, ip_hdr_offset + start, partial);
542 		*up = cksum != 0 ? cksum : ~cksum;
543 	}
544 
545 	/* We always update the ULP checksum flags. */
546 	if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
547 		flags &= ~HCK_PARTIALCKSUM;
548 		flags |= HCK_FULLCKSUM_OK;
549 		value = 0;
550 	}
551 
552 	mac_hcksum_set(mp, start, stuff, end, value, flags);
553 
554 	/* Don't forget to reattach the header. */
555 	if (skipped_hdr != NULL) {
556 		ASSERT3P(skipped_hdr->b_cont, ==, mp);
557 
558 		/*
559 		 * Duplicate the HCKSUM data into the header mblk.
560 		 * This mimics mac_add_vlan_tag which ensures that
561 		 * both the first mblk _and_ the first data bearing
562 		 * mblk possess the HCKSUM information. Consumers like
563 		 * IP will end up discarding the ether_header mblk, so
564 		 * for now, it is important that the data be available
565 		 * in both places.
566 		 */
567 		mac_hcksum_clone(mp, skipped_hdr);
568 		mp = skipped_hdr;
569 	}
570 
571 	return (mp);
572 
573 bail:
574 	if (skipped_hdr != NULL) {
575 		ASSERT3P(skipped_hdr->b_cont, ==, mp);
576 		mp = skipped_hdr;
577 	}
578 
579 	mac_drop_pkt(mp, err);
580 	return (NULL);
581 }
582 
583 /*
584  * Build a single data segment from an LSO packet. The mblk chain
585  * returned, seg_head, represents the data segment and is always
586  * exactly seg_len bytes long. The lso_mp and offset input/output
587  * parameters track our position in the LSO packet. This function
588  * exists solely as a helper to mac_sw_lso().
589  *
590  * Case A
591  *
592  *     The current lso_mp is larger than the requested seg_len. The
593  *     beginning of seg_head may start at the beginning of lso_mp or
594  *     offset into it. In either case, a single mblk is returned, and
595  *     *offset is updated to reflect our new position in the current
596  *     lso_mp.
597  *
598  *          +----------------------------+
599  *          |  in *lso_mp / out *lso_mp  |
600  *          +----------------------------+
601  *          ^                        ^
602  *          |                        |
603  *          |                        |
604  *          |                        |
605  *          +------------------------+
606  *          |        seg_head        |
607  *          +------------------------+
608  *          ^                        ^
609  *          |                        |
610  *   in *offset = 0        out *offset = seg_len
611  *
612  *          |------   seg_len    ----|
613  *
614  *
615  *       +------------------------------+
616  *       |   in *lso_mp / out *lso_mp   |
617  *       +------------------------------+
618  *          ^                        ^
619  *          |                        |
620  *          |                        |
621  *          |                        |
622  *          +------------------------+
623  *          |        seg_head        |
624  *          +------------------------+
625  *          ^                        ^
626  *          |                        |
627  *   in *offset = N        out *offset = N + seg_len
628  *
629  *          |------   seg_len    ----|
630  *
631  *
632  *
633  * Case B
634  *
635  *    The requested seg_len consumes exactly the rest of the lso_mp.
636  *    I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr.
637  *    The seg_head may start at the beginning of the lso_mp or at some
638  *    offset into it. In either case we return a single mblk, reset
639  *    *offset to zero, and walk to the next lso_mp.
640  *
641  *          +------------------------+           +------------------------+
642  *          |       in *lso_mp       |---------->|      out *lso_mp       |
643  *          +------------------------+           +------------------------+
644  *          ^                        ^           ^
645  *          |                        |           |
646  *          |                        |    out *offset = 0
647  *          |                        |
648  *          +------------------------+
649  *          |        seg_head        |
650  *          +------------------------+
651  *          ^
652  *          |
653  *   in *offset = 0
654  *
655  *          |------   seg_len    ----|
656  *
657  *
658  *
659  *      +----------------------------+           +------------------------+
660  *      |         in *lso_mp         |---------->|      out *lso_mp       |
661  *      +----------------------------+           +------------------------+
662  *          ^                        ^           ^
663  *          |                        |           |
664  *          |                        |    out *offset = 0
665  *          |                        |
666  *          +------------------------+
667  *          |        seg_head        |
668  *          +------------------------+
669  *          ^
670  *          |
671  *   in *offset = N
672  *
673  *          |------   seg_len    ----|
674  *
675  *
676  * Case C
677  *
678  *    The requested seg_len is greater than the current lso_mp. In
679  *    this case we must consume LSO mblks until we have enough data to
680  *    satisfy either case (A) or (B) above. We will return multiple
681  *    mblks linked via b_cont, offset will be set based on the cases
682  *    above, and lso_mp will walk forward at least one mblk, but maybe
683  *    more.
684  *
685  *    N.B. This diagram is not exhaustive. The seg_head may start on
686  *    the beginning of an lso_mp. The seg_tail may end exactly on the
687  *    boundary of an lso_mp. And there may be two (in this case the
688  *    middle block wouldn't exist), three, or more mblks in the
689  *    seg_head chain. This is meant as one example of what might
690  *    happen. The main thing to remember is that the seg_tail mblk
691  *    must be one of case (A) or (B) above.
692  *
693  *  +------------------+    +----------------+    +------------------+
694  *  |    in *lso_mp    |--->|    *lso_mp     |--->|   out *lso_mp    |
695  *  +------------------+    +----------------+    +------------------+
696  *        ^            ^    ^                ^    ^            ^
697  *        |            |    |                |    |            |
698  *        |            |    |                |    |            |
699  *        |            |    |                |    |            |
700  *        |            |    |                |    |            |
701  *        +------------+    +----------------+    +------------+
702  *        |  seg_head  |--->|                |--->|  seg_tail  |
703  *        +------------+    +----------------+    +------------+
704  *        ^                                                    ^
705  *        |                                                    |
706  *  in *offset = N                          out *offset = MBLKL(seg_tail)
707  *
708  *        |-------------------   seg_len    -------------------|
709  *
710  */
711 static mblk_t *
712 build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len)
713 {
714 	mblk_t *seg_head, *seg_tail, *seg_mp;
715 
716 	ASSERT3P(*lso_mp, !=, NULL);
717 	ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr);
718 
719 	seg_mp = dupb(*lso_mp);
720 	if (seg_mp == NULL)
721 		return (NULL);
722 
723 	seg_head = seg_mp;
724 	seg_tail = seg_mp;
725 
726 	/* Continue where we left off from in the lso_mp. */
727 	seg_mp->b_rptr += *offset;
728 
729 last_mblk:
730 	/* Case (A) */
731 	if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) {
732 		*offset += seg_len;
733 		seg_mp->b_wptr = seg_mp->b_rptr + seg_len;
734 		return (seg_head);
735 	}
736 
737 	/* Case (B) */
738 	if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) {
739 		*offset = 0;
740 		*lso_mp = (*lso_mp)->b_cont;
741 		return (seg_head);
742 	}
743 
744 	/* Case (C) */
745 	ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr);
746 
747 	/*
748 	 * The current LSO mblk doesn't have enough data to satisfy
749 	 * seg_len -- continue peeling off LSO mblks to build the new
750 	 * segment message. If allocation fails we free the previously
751 	 * allocated segment mblks and return NULL.
752 	 */
753 	while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) {
754 		ASSERT3U(MBLKL(seg_mp), <=, seg_len);
755 		seg_len -= MBLKL(seg_mp);
756 		*offset = 0;
757 		*lso_mp = (*lso_mp)->b_cont;
758 		seg_mp = dupb(*lso_mp);
759 
760 		if (seg_mp == NULL) {
761 			freemsgchain(seg_head);
762 			return (NULL);
763 		}
764 
765 		seg_tail->b_cont = seg_mp;
766 		seg_tail = seg_mp;
767 	}
768 
769 	/*
770 	 * We've walked enough LSO mblks that we can now satisfy the
771 	 * remaining seg_len. At this point we need to jump back to
772 	 * determine if we have arrived at case (A) or (B).
773 	 */
774 
775 	/* Just to be paranoid that we didn't underflow. */
776 	ASSERT3U(seg_len, <, IP_MAXPACKET);
777 	ASSERT3U(seg_len, >, 0);
778 	goto last_mblk;
779 }
780 
781 /*
782  * Perform software segmentation of a single LSO message. Take an LSO
783  * message as input and return head/tail pointers as output. This
784  * function should not be invoked directly but instead through
785  * mac_hw_emul().
786  *
787  * The resulting chain is comprised of multiple (nsegs) MSS sized
788  * segments. Each segment will consist of two or more mblks joined by
789  * b_cont: a header and one or more data mblks. The header mblk is
790  * allocated anew for each message. The first segment's header is used
791  * as a template for the rest with adjustments made for things such as
792  * ID, sequence, length, TCP flags, etc. The data mblks reference into
793  * the existing LSO mblk (passed in as omp) by way of dupb(). Their
794  * b_rptr/b_wptr values are adjusted to reference only the fraction of
795  * the LSO message they are responsible for. At the successful
796  * completion of this function the original mblk (omp) is freed,
797  * leaving the newly created segment chain as the only remaining
798  * reference to the data.
799  */
800 static void
801 mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail,
802     uint_t *count)
803 {
804 	uint32_t ocsum_flags, ocsum_start, ocsum_stuff;
805 	uint32_t mss;
806 	uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen;
807 	uint32_t oleft;
808 	uint_t nsegs, seg;
809 	int len;
810 
811 	struct ether_vlan_header *oevh;
812 	const ipha_t *oiph;
813 	const tcph_t *otcph;
814 	ipha_t *niph;
815 	tcph_t *ntcph;
816 	uint16_t ip_id;
817 	uint32_t tcp_seq, tcp_sum, otcp_sum;
818 
819 	uint32_t offset;
820 	mblk_t *odatamp;
821 	mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp;
822 	mblk_t *tmptail;
823 
824 	ASSERT3P(head, !=, NULL);
825 	ASSERT3P(tail, !=, NULL);
826 	ASSERT3P(count, !=, NULL);
827 	ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0);
828 
829 	/* Assume we are dealing with a single LSO message. */
830 	ASSERT3P(omp->b_next, ==, NULL);
831 
832 	/*
833 	 * XXX: This is a hack to deal with mac_add_vlan_tag().
834 	 *
835 	 * When VLANs are in play, mac_add_vlan_tag() creates a new
836 	 * mblk with just the ether_vlan_header and tacks it onto the
837 	 * front of 'omp'. This breaks the assumptions made below;
838 	 * namely that the TCP/IP headers are in the first mblk. In
839 	 * this case, since we already have to pay the cost of LSO
840 	 * emulation, we simply pull up everything. While this might
841 	 * seem irksome, keep in mind this will only apply in a couple
842 	 * of scenarios: a) an LSO-capable VLAN client sending to a
843 	 * non-LSO-capable client over the "MAC/bridge loopback"
844 	 * datapath or b) an LSO-capable VLAN client is sending to a
845 	 * client that, for whatever reason, doesn't have DLS-bypass
846 	 * enabled. Finally, we have to check for both a tagged and
847 	 * untagged sized mblk depending on if the mblk came via
848 	 * mac_promisc_dispatch() or mac_rx_deliver().
849 	 *
850 	 * In the future, two things should be done:
851 	 *
852 	 * 1. This function should make use of some yet to be
853 	 *    implemented "mblk helpers". These helper functions would
854 	 *    perform all the b_cont walking for us and guarantee safe
855 	 *    access to the mblk data.
856 	 *
857 	 * 2. We should add some slop to the mblks so that
858 	 *    mac_add_vlan_tag() can just edit the first mblk instead
859 	 *    of allocating on the hot path.
860 	 */
861 	if (MBLKL(omp) == sizeof (struct ether_vlan_header) ||
862 	    MBLKL(omp) == sizeof (struct ether_header)) {
863 		mblk_t *tmp = msgpullup(omp, -1);
864 
865 		if (tmp == NULL) {
866 			mac_drop_pkt(omp, "failed to pull up");
867 			goto fail;
868 		}
869 
870 		mac_hcksum_clone(omp, tmp);
871 		freemsg(omp);
872 		omp = tmp;
873 	}
874 
875 	mss = DB_LSOMSS(omp);
876 	ASSERT3U(msgsize(omp), <=, IP_MAXPACKET +
877 	    sizeof (struct ether_vlan_header));
878 	opktlen = msgsize(omp);
879 
880 	/*
881 	 * First, get references to the IP and TCP headers and
882 	 * determine the total TCP length (header + data).
883 	 *
884 	 * Thanks to mac_hw_emul() we know that the first mblk must
885 	 * contain (at minimum) the full L2 header. However, this
886 	 * function assumes more than that. It assumes the L2/L3/L4
887 	 * headers are all contained in the first mblk of a message
888 	 * (i.e., no b_cont walking for headers). While this is a
889 	 * current reality (our native TCP stack and viona both
890 	 * enforce this) things may become more nuanced in the future
891 	 * (e.g. when introducing encap support or adding new
892 	 * clients). For now we guard against this case by dropping
893 	 * the packet.
894 	 */
895 	oevh = (struct ether_vlan_header *)omp->b_rptr;
896 	if (oevh->ether_tpid == htons(ETHERTYPE_VLAN))
897 		oehlen = sizeof (struct ether_vlan_header);
898 	else
899 		oehlen = sizeof (struct ether_header);
900 
901 	ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t)));
902 	if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) {
903 		mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers");
904 		goto fail;
905 	}
906 
907 	oiph = (ipha_t *)(omp->b_rptr + oehlen);
908 	oiphlen = IPH_HDR_LENGTH(oiph);
909 	otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen);
910 	otcphlen = TCP_HDR_LENGTH(otcph);
911 
912 	/*
913 	 * Currently we only support LSO for TCP/IPv4.
914 	 */
915 	if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) {
916 		mac_drop_pkt(omp, "LSO unsupported IP version: %uhh",
917 		    IPH_HDR_VERSION(oiph));
918 		goto fail;
919 	}
920 
921 	if (oiph->ipha_protocol != IPPROTO_TCP) {
922 		mac_drop_pkt(omp, "LSO unsupported protocol: %uhh",
923 		    oiph->ipha_protocol);
924 		goto fail;
925 	}
926 
927 	if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) {
928 		mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set");
929 		goto fail;
930 	}
931 
932 	ohdrslen = oehlen + oiphlen + otcphlen;
933 	if ((len = MBLKL(omp)) < ohdrslen) {
934 		mac_drop_pkt(omp, "LSO packet too short: %d < %u", len,
935 		    ohdrslen);
936 		goto fail;
937 	}
938 
939 	/*
940 	 * Either we have data in the first mblk or it's just the
941 	 * header. In either case, we need to set rptr to the start of
942 	 * the TCP data.
943 	 */
944 	if (len > ohdrslen) {
945 		odatamp = omp;
946 		offset = ohdrslen;
947 	} else {
948 		ASSERT3U(len, ==, ohdrslen);
949 		odatamp = omp->b_cont;
950 		offset = 0;
951 	}
952 
953 	/* Make sure we still have enough data. */
954 	ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen);
955 
956 	/*
957 	 * If a MAC negotiated LSO then it must negotiate both
958 	 * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or
959 	 * HCKSUM_INET_PARTIAL; because both the IP and TCP headers
960 	 * change during LSO segmentation (only the 3 fields of the
961 	 * pseudo header checksum don't change: src, dst, proto). Thus
962 	 * we would expect these flags (HCK_IPV4_HDRCKSUM |
963 	 * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this
964 	 * function to emulate those checksums in software. However,
965 	 * that assumes a world where we only expose LSO if the
966 	 * underlying hardware exposes LSO. Moving forward the plan is
967 	 * to assume LSO in the upper layers and have MAC perform
968 	 * software LSO when the underlying provider doesn't support
969 	 * it. In such a world, if the provider doesn't support LSO
970 	 * but does support hardware checksum offload, then we could
971 	 * simply perform the segmentation and allow the hardware to
972 	 * calculate the checksums. To the hardware it's just another
973 	 * chain of non-LSO packets.
974 	 */
975 	ASSERT3S(DB_TYPE(omp), ==, M_DATA);
976 	ocsum_flags = DB_CKSUMFLAGS(omp);
977 	ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0);
978 	ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0);
979 
980 	/*
981 	 * If hardware only provides partial checksum then software
982 	 * must supply the pseudo-header checksum. In the case of LSO
983 	 * we leave the TCP length at zero to be filled in by
984 	 * hardware. This function must handle two scenarios.
985 	 *
986 	 * 1. Being called by a MAC client on the Rx path to segment
987 	 *    an LSO packet and calculate the checksum.
988 	 *
989 	 * 2. Being called by a MAC provider to segment an LSO packet.
990 	 *    In this case the LSO segmentation is performed in
991 	 *    software (by this routine) but the MAC provider should
992 	 *    still calculate the TCP/IP checksums in hardware.
993 	 *
994 	 *  To elaborate on the second case: we cannot have the
995 	 *  scenario where IP sends LSO packets but the underlying HW
996 	 *  doesn't support checksum offload -- because in that case
997 	 *  TCP/IP would calculate the checksum in software (for the
998 	 *  LSO packet) but then MAC would segment the packet and have
999 	 *  to redo all the checksum work. So IP should never do LSO
1000 	 *  if HW doesn't support both IP and TCP checksum.
1001 	 */
1002 	if (ocsum_flags & HCK_PARTIALCKSUM) {
1003 		ocsum_start = (uint32_t)DB_CKSUMSTART(omp);
1004 		ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp);
1005 	}
1006 
1007 	odatalen = opktlen - ohdrslen;
1008 
1009 	/*
1010 	 * Subtract one to account for the case where the data length
1011 	 * is evenly divisible by the MSS. Add one to account for the
1012 	 * fact that the division will always result in one less
1013 	 * segment than needed.
1014 	 */
1015 	nsegs = ((odatalen - 1) / mss) + 1;
1016 	if (nsegs < 2) {
1017 		mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs);
1018 		goto fail;
1019 	}
1020 
1021 	DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph,
1022 	    __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t,
1023 	    nsegs);
1024 
1025 	seg_chain = NULL;
1026 	tmptail = seg_chain;
1027 	oleft = odatalen;
1028 
1029 	for (uint_t i = 0; i < nsegs; i++) {
1030 		boolean_t last_seg = ((i + 1) == nsegs);
1031 		uint32_t seg_len;
1032 
1033 		/*
1034 		 * If we fail to allocate, then drop the partially
1035 		 * allocated chain as well as the LSO packet. Let the
1036 		 * sender deal with the fallout.
1037 		 */
1038 		if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) {
1039 			freemsgchain(seg_chain);
1040 			mac_drop_pkt(omp, "failed to alloc segment header");
1041 			goto fail;
1042 		}
1043 		ASSERT3P(nhdrmp->b_cont, ==, NULL);
1044 
1045 		if (seg_chain == NULL) {
1046 			seg_chain = nhdrmp;
1047 		} else {
1048 			ASSERT3P(tmptail, !=, NULL);
1049 			tmptail->b_next = nhdrmp;
1050 		}
1051 
1052 		tmptail = nhdrmp;
1053 
1054 		/*
1055 		 * Calculate this segment's length. It's either the MSS
1056 		 * or whatever remains for the last segment.
1057 		 */
1058 		seg_len = last_seg ? oleft : mss;
1059 		ASSERT3U(seg_len, <=, mss);
1060 		ndatamp = build_data_seg(&odatamp, &offset, seg_len);
1061 
1062 		if (ndatamp == NULL) {
1063 			freemsgchain(seg_chain);
1064 			mac_drop_pkt(omp, "LSO failed to segment data");
1065 			goto fail;
1066 		}
1067 
1068 		/* Attach data mblk to header mblk. */
1069 		nhdrmp->b_cont = ndatamp;
1070 		DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO;
1071 		ASSERT3U(seg_len, <=, oleft);
1072 		oleft -= seg_len;
1073 	}
1074 
1075 	/* We should have consumed entire LSO msg. */
1076 	ASSERT3S(oleft, ==, 0);
1077 	ASSERT3P(odatamp, ==, NULL);
1078 
1079 	/*
1080 	 * All seg data mblks are referenced by the header mblks, null
1081 	 * out this pointer to catch any bad derefs.
1082 	 */
1083 	ndatamp = NULL;
1084 
1085 	/*
1086 	 * Set headers and checksum for first segment.
1087 	 */
1088 	nhdrmp = seg_chain;
1089 	bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen);
1090 	nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1091 	niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1092 	ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);
1093 	niph->ipha_length = htons(oiphlen + otcphlen + mss);
1094 	niph->ipha_hdr_checksum = 0;
1095 	ip_id = ntohs(niph->ipha_ident);
1096 	ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1097 	tcp_seq = BE32_TO_U32(ntcph->th_seq);
1098 	tcp_seq += mss;
1099 
1100 	/*
1101 	 * The first segment shouldn't:
1102 	 *
1103 	 *	o indicate end of data transmission (FIN),
1104 	 *	o indicate immediate handling of the data (PUSH).
1105 	 */
1106 	ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
1107 	DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1108 
1109 	/*
1110 	 * If the underlying HW provides partial checksum, then make
1111 	 * sure to correct the pseudo header checksum before calling
1112 	 * mac_sw_cksum(). The native TCP stack doesn't include the
1113 	 * length field in the pseudo header when LSO is in play -- so
1114 	 * we need to calculate it here.
1115 	 */
1116 	if (ocsum_flags & HCK_PARTIALCKSUM) {
1117 		DB_CKSUMSTART(nhdrmp) = ocsum_start;
1118 		DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1119 		DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1120 		tcp_sum = BE16_TO_U16(ntcph->th_sum);
1121 		otcp_sum = tcp_sum;
1122 		tcp_sum += mss + otcphlen;
1123 		tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1124 		U16_TO_BE16(tcp_sum, ntcph->th_sum);
1125 	}
1126 
1127 	if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1128 	    (emul & MAC_HWCKSUM_EMULS)) {
1129 		next_nhdrmp = nhdrmp->b_next;
1130 		nhdrmp->b_next = NULL;
1131 		nhdrmp = mac_sw_cksum(nhdrmp, emul);
1132 		nhdrmp->b_next = next_nhdrmp;
1133 		next_nhdrmp = NULL;
1134 
1135 		/*
1136 		 * We may have freed the nhdrmp argument during
1137 		 * checksum emulation, make sure that seg_chain
1138 		 * references a valid mblk.
1139 		 */
1140 		seg_chain = nhdrmp;
1141 	}
1142 
1143 	ASSERT3P(nhdrmp, !=, NULL);
1144 
1145 	seg = 1;
1146 	DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1147 	    (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1148 	    (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss,
1149 	    uint_t, seg);
1150 	seg++;
1151 
1152 	/* There better be at least 2 segs. */
1153 	ASSERT3P(nhdrmp->b_next, !=, NULL);
1154 	prev_nhdrmp = nhdrmp;
1155 	nhdrmp = nhdrmp->b_next;
1156 
1157 	/*
1158 	 * Now adjust the headers of the middle segments. For each
1159 	 * header we need to adjust the following.
1160 	 *
1161 	 *	o IP ID
1162 	 *	o IP length
1163 	 *	o TCP sequence
1164 	 *	o TCP flags
1165 	 *	o cksum flags
1166 	 *	o cksum values (if MAC_HWCKSUM_EMUL is set)
1167 	 */
1168 	for (; seg < nsegs; seg++) {
1169 		/*
1170 		 * We use seg_chain as a reference to the first seg
1171 		 * header mblk -- this first header is a template for
1172 		 * the rest of the segments. This copy will include
1173 		 * the now updated checksum values from the first
1174 		 * header. We must reset these checksum values to
1175 		 * their original to make sure we produce the correct
1176 		 * value.
1177 		 */
1178 		bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
1179 		nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1180 		niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1181 		niph->ipha_ident = htons(++ip_id);
1182 		ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss);
1183 		niph->ipha_length = htons(oiphlen + otcphlen + mss);
1184 		niph->ipha_hdr_checksum = 0;
1185 		ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1186 		U32_TO_BE32(tcp_seq, ntcph->th_seq);
1187 		tcp_seq += mss;
1188 		/*
1189 		 * Just like the first segment, the middle segments
1190 		 * shouldn't have these flags set.
1191 		 */
1192 		ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
1193 		DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1194 
1195 		if (ocsum_flags & HCK_PARTIALCKSUM) {
1196 			/*
1197 			 * First and middle segs have same
1198 			 * pseudo-header checksum.
1199 			 */
1200 			U16_TO_BE16(tcp_sum, ntcph->th_sum);
1201 			DB_CKSUMSTART(nhdrmp) = ocsum_start;
1202 			DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1203 			DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1204 		}
1205 
1206 		if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1207 		    (emul & MAC_HWCKSUM_EMULS)) {
1208 			next_nhdrmp = nhdrmp->b_next;
1209 			nhdrmp->b_next = NULL;
1210 			nhdrmp = mac_sw_cksum(nhdrmp, emul);
1211 			nhdrmp->b_next = next_nhdrmp;
1212 			next_nhdrmp = NULL;
1213 			/* We may have freed the original nhdrmp. */
1214 			prev_nhdrmp->b_next = nhdrmp;
1215 		}
1216 
1217 		DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1218 		    (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1219 		    (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen),
1220 		    uint_t, mss, uint_t, seg);
1221 
1222 		ASSERT3P(nhdrmp->b_next, !=, NULL);
1223 		prev_nhdrmp = nhdrmp;
1224 		nhdrmp = nhdrmp->b_next;
1225 	}
1226 
1227 	/* Make sure we are on the last segment. */
1228 	ASSERT3U(seg, ==, nsegs);
1229 	ASSERT3P(nhdrmp->b_next, ==, NULL);
1230 
1231 	/*
1232 	 * Now we set the last segment header. The difference being
1233 	 * that FIN/PSH/RST flags are allowed.
1234 	 */
1235 	bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
1236 	nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1237 	niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1238 	niph->ipha_ident = htons(++ip_id);
1239 	len = msgsize(nhdrmp->b_cont);
1240 	ASSERT3S(len, >, 0);
1241 	niph->ipha_length = htons(oiphlen + otcphlen + len);
1242 	niph->ipha_hdr_checksum = 0;
1243 	ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1244 	U32_TO_BE32(tcp_seq, ntcph->th_seq);
1245 
1246 	DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1247 	if (ocsum_flags & HCK_PARTIALCKSUM) {
1248 		DB_CKSUMSTART(nhdrmp) = ocsum_start;
1249 		DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1250 		DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1251 		tcp_sum = otcp_sum;
1252 		tcp_sum += len + otcphlen;
1253 		tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1254 		U16_TO_BE16(tcp_sum, ntcph->th_sum);
1255 	}
1256 
1257 	if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1258 	    (emul & MAC_HWCKSUM_EMULS)) {
1259 		/* This should be the last mblk. */
1260 		ASSERT3P(nhdrmp->b_next, ==, NULL);
1261 		nhdrmp = mac_sw_cksum(nhdrmp, emul);
1262 		prev_nhdrmp->b_next = nhdrmp;
1263 	}
1264 
1265 	DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1266 	    (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1267 	    (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len,
1268 	    uint_t, seg);
1269 
1270 	/*
1271 	 * Free the reference to the original LSO message as it is
1272 	 * being replaced by seg_chain.
1273 	 */
1274 	freemsg(omp);
1275 	*head = seg_chain;
1276 	*tail = nhdrmp;
1277 	*count = nsegs;
1278 	return;
1279 
1280 fail:
1281 	*head = NULL;
1282 	*tail = NULL;
1283 	*count = 0;
1284 }
1285 
1286 #define	HCK_NEEDED	(HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM)
1287 
1288 /*
1289  * Emulate various hardware offload features in software. Take a chain
1290  * of packets as input and emulate the hardware features specified in
1291  * 'emul'. The resulting chain's head pointer replaces the 'mp_chain'
1292  * pointer given as input, and its tail pointer is written to
1293  * '*otail'. The number of packets in the new chain is written to
1294  * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus
1295  * may be NULL. The 'mp_chain' argument may point to a NULL chain; in
1296  * which case 'mp_chain' will simply stay a NULL chain.
1297  *
1298  * While unlikely, it is technically possible that this function could
1299  * receive a non-NULL chain as input and return a NULL chain as output
1300  * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be
1301  * zero). This could happen if all the packets in the chain are
1302  * dropped or if we fail to allocate new mblks. In this case, there is
1303  * nothing for the caller to free. In any event, the caller shouldn't
1304  * assume that '*mp_chain' is non-NULL on return.
1305  *
1306  * This function was written with three main use cases in mind.
1307  *
1308  * 1. To emulate hardware offloads when traveling mac-loopback (two
1309  *    clients on the same mac). This is wired up in mac_tx_send().
1310  *
1311  * 2. To provide hardware offloads to the client when the underlying
1312  *    provider cannot. This is currently wired up in mac_tx() but we
1313  *    still only negotiate offloads when the underlying provider
1314  *    supports them.
1315  *
1316  * 3. To emulate real hardware in simnet.
1317  */
1318 void
1319 mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul)
1320 {
1321 	mblk_t *head = NULL, *tail = NULL;
1322 	uint_t count = 0;
1323 
1324 	ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0);
1325 	ASSERT3P(mp_chain, !=, NULL);
1326 
1327 	for (mblk_t *mp = *mp_chain; mp != NULL; ) {
1328 		mblk_t *tmp, *next, *tmphead, *tmptail;
1329 		struct ether_header *ehp;
1330 		uint32_t flags;
1331 		uint_t len = MBLKL(mp), l2len;
1332 
1333 		/* Perform LSO/cksum one message at a time. */
1334 		next = mp->b_next;
1335 		mp->b_next = NULL;
1336 
1337 		/*
1338 		 * For our sanity the first mblk should contain at
1339 		 * least the full L2 header.
1340 		 */
1341 		if (len < sizeof (struct ether_header)) {
1342 			mac_drop_pkt(mp, "packet too short (A): %u", len);
1343 			mp = next;
1344 			continue;
1345 		}
1346 
1347 		ehp = (struct ether_header *)mp->b_rptr;
1348 		if (ntohs(ehp->ether_type) == VLAN_TPID)
1349 			l2len = sizeof (struct ether_vlan_header);
1350 		else
1351 			l2len = sizeof (struct ether_header);
1352 
1353 		/*
1354 		 * If the first mblk is solely the L2 header, then
1355 		 * there better be more data.
1356 		 */
1357 		if (len < l2len || (len == l2len && mp->b_cont == NULL)) {
1358 			mac_drop_pkt(mp, "packet too short (C): %u", len);
1359 			mp = next;
1360 			continue;
1361 		}
1362 
1363 		DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul);
1364 
1365 		/*
1366 		 * We use DB_CKSUMFLAGS (instead of mac_hcksum_get())
1367 		 * because we don't want to mask-out the LSO flag.
1368 		 */
1369 		flags = DB_CKSUMFLAGS(mp);
1370 
1371 		if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) {
1372 			uint_t tmpcount = 0;
1373 
1374 			/*
1375 			 * LSO fix-up handles checksum emulation
1376 			 * inline (if requested). It also frees mp.
1377 			 */
1378 			mac_sw_lso(mp, emul, &tmphead, &tmptail,
1379 			    &tmpcount);
1380 			if (tmphead == NULL) {
1381 				/* mac_sw_lso() freed the mp. */
1382 				mp = next;
1383 				continue;
1384 			}
1385 			count += tmpcount;
1386 		} else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) {
1387 			tmp = mac_sw_cksum(mp, emul);
1388 			if (tmp == NULL) {
1389 				/* mac_sw_cksum() freed the mp. */
1390 				mp = next;
1391 				continue;
1392 			}
1393 			tmphead = tmp;
1394 			tmptail = tmp;
1395 			count++;
1396 		} else {
1397 			/* There is nothing to emulate. */
1398 			tmp = mp;
1399 			tmphead = tmp;
1400 			tmptail = tmp;
1401 			count++;
1402 		}
1403 
1404 		/*
1405 		 * The tmp mblk chain is either the start of the new
1406 		 * chain or added to the tail of the new chain.
1407 		 */
1408 		if (head == NULL) {
1409 			head = tmphead;
1410 			tail = tmptail;
1411 		} else {
1412 			/* Attach the new mblk to the end of the new chain. */
1413 			tail->b_next = tmphead;
1414 			tail = tmptail;
1415 		}
1416 
1417 		mp = next;
1418 	}
1419 
1420 	*mp_chain = head;
1421 
1422 	if (otail != NULL)
1423 		*otail = tail;
1424 
1425 	if (ocount != NULL)
1426 		*ocount = count;
1427 }
1428 
1429 /*
1430  * Add VLAN tag to the specified mblk.
1431  */
1432 mblk_t *
1433 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
1434 {
1435 	mblk_t *hmp;
1436 	struct ether_vlan_header *evhp;
1437 	struct ether_header *ehp;
1438 
1439 	ASSERT(pri != 0 || vid != 0);
1440 
1441 	/*
1442 	 * Allocate an mblk for the new tagged ethernet header,
1443 	 * and copy the MAC addresses and ethertype from the
1444 	 * original header.
1445 	 */
1446 
1447 	hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
1448 	if (hmp == NULL) {
1449 		freemsg(mp);
1450 		return (NULL);
1451 	}
1452 
1453 	evhp = (struct ether_vlan_header *)hmp->b_rptr;
1454 	ehp = (struct ether_header *)mp->b_rptr;
1455 
1456 	bcopy(ehp, evhp, (ETHERADDRL * 2));
1457 	evhp->ether_type = ehp->ether_type;
1458 	evhp->ether_tpid = htons(ETHERTYPE_VLAN);
1459 
1460 	hmp->b_wptr += sizeof (struct ether_vlan_header);
1461 	mp->b_rptr += sizeof (struct ether_header);
1462 
1463 	/*
1464 	 * Free the original message if it's now empty. Link the
1465 	 * rest of messages to the header message.
1466 	 */
1467 	mac_hcksum_clone(mp, hmp);
1468 	if (MBLKL(mp) == 0) {
1469 		hmp->b_cont = mp->b_cont;
1470 		freeb(mp);
1471 	} else {
1472 		hmp->b_cont = mp;
1473 	}
1474 	ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
1475 
1476 	/*
1477 	 * Initialize the new TCI (Tag Control Information).
1478 	 */
1479 	evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
1480 
1481 	return (hmp);
1482 }
1483 
1484 /*
1485  * Adds a VLAN tag with the specified VID and priority to each mblk of
1486  * the specified chain.
1487  */
1488 mblk_t *
1489 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
1490 {
1491 	mblk_t *next_mp, **prev, *mp;
1492 
1493 	mp = mp_chain;
1494 	prev = &mp_chain;
1495 
1496 	while (mp != NULL) {
1497 		next_mp = mp->b_next;
1498 		mp->b_next = NULL;
1499 		if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
1500 			freemsgchain(next_mp);
1501 			break;
1502 		}
1503 		*prev = mp;
1504 		prev = &mp->b_next;
1505 		mp = mp->b_next = next_mp;
1506 	}
1507 
1508 	return (mp_chain);
1509 }
1510 
1511 /*
1512  * Strip VLAN tag
1513  */
1514 mblk_t *
1515 mac_strip_vlan_tag(mblk_t *mp)
1516 {
1517 	mblk_t *newmp;
1518 	struct ether_vlan_header *evhp;
1519 
1520 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1521 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
1522 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1523 
1524 		if (DB_REF(mp) > 1) {
1525 			newmp = copymsg(mp);
1526 			if (newmp == NULL)
1527 				return (NULL);
1528 			freemsg(mp);
1529 			mp = newmp;
1530 		}
1531 
1532 		evhp = (struct ether_vlan_header *)mp->b_rptr;
1533 
1534 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1535 		mp->b_rptr += VLAN_TAGSZ;
1536 	}
1537 	return (mp);
1538 }
1539 
1540 /*
1541  * Strip VLAN tag from each mblk of the chain.
1542  */
1543 mblk_t *
1544 mac_strip_vlan_tag_chain(mblk_t *mp_chain)
1545 {
1546 	mblk_t *mp, *next_mp, **prev;
1547 
1548 	mp = mp_chain;
1549 	prev = &mp_chain;
1550 
1551 	while (mp != NULL) {
1552 		next_mp = mp->b_next;
1553 		mp->b_next = NULL;
1554 		if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
1555 			freemsgchain(next_mp);
1556 			break;
1557 		}
1558 		*prev = mp;
1559 		prev = &mp->b_next;
1560 		mp = mp->b_next = next_mp;
1561 	}
1562 
1563 	return (mp_chain);
1564 }
1565 
1566 /*
1567  * Default callback function. Used when the datapath is not yet initialized.
1568  */
1569 /* ARGSUSED */
1570 void
1571 mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain,
1572     boolean_t loopback)
1573 {
1574 	freemsgchain(mp_chain);
1575 }
1576 
1577 /*
1578  * Determines the IPv6 header length accounting for all the optional IPv6
1579  * headers (hop-by-hop, destination, routing and fragment). The header length
1580  * and next header value (a transport header) is captured.
1581  *
1582  * Returns B_FALSE if all the IP headers are not in the same mblk otherwise
1583  * returns B_TRUE.
1584  */
1585 boolean_t
1586 mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
1587     uint8_t *next_hdr, ip6_frag_t **fragp)
1588 {
1589 	uint16_t length;
1590 	uint_t	ehdrlen;
1591 	uint8_t *whereptr;
1592 	uint8_t *nexthdrp;
1593 	ip6_dest_t *desthdr;
1594 	ip6_rthdr_t *rthdr;
1595 	ip6_frag_t *fraghdr;
1596 
1597 	if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
1598 		return (B_FALSE);
1599 	ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
1600 	length = IPV6_HDR_LEN;
1601 	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
1602 
1603 	if (fragp != NULL)
1604 		*fragp = NULL;
1605 
1606 	nexthdrp = &ip6h->ip6_nxt;
1607 	while (whereptr < endptr) {
1608 		/* Is there enough left for len + nexthdr? */
1609 		if (whereptr + MIN_EHDR_LEN > endptr)
1610 			break;
1611 
1612 		switch (*nexthdrp) {
1613 		case IPPROTO_HOPOPTS:
1614 		case IPPROTO_DSTOPTS:
1615 			/* Assumes the headers are identical for hbh and dst */
1616 			desthdr = (ip6_dest_t *)whereptr;
1617 			ehdrlen = 8 * (desthdr->ip6d_len + 1);
1618 			if ((uchar_t *)desthdr +  ehdrlen > endptr)
1619 				return (B_FALSE);
1620 			nexthdrp = &desthdr->ip6d_nxt;
1621 			break;
1622 		case IPPROTO_ROUTING:
1623 			rthdr = (ip6_rthdr_t *)whereptr;
1624 			ehdrlen =  8 * (rthdr->ip6r_len + 1);
1625 			if ((uchar_t *)rthdr +  ehdrlen > endptr)
1626 				return (B_FALSE);
1627 			nexthdrp = &rthdr->ip6r_nxt;
1628 			break;
1629 		case IPPROTO_FRAGMENT:
1630 			fraghdr = (ip6_frag_t *)whereptr;
1631 			ehdrlen = sizeof (ip6_frag_t);
1632 			if ((uchar_t *)&fraghdr[1] > endptr)
1633 				return (B_FALSE);
1634 			nexthdrp = &fraghdr->ip6f_nxt;
1635 			if (fragp != NULL)
1636 				*fragp = fraghdr;
1637 			break;
1638 		case IPPROTO_NONE:
1639 			/* No next header means we're finished */
1640 		default:
1641 			*hdr_length = length;
1642 			*next_hdr = *nexthdrp;
1643 			return (B_TRUE);
1644 		}
1645 		length += ehdrlen;
1646 		whereptr += ehdrlen;
1647 		*hdr_length = length;
1648 		*next_hdr = *nexthdrp;
1649 	}
1650 	switch (*nexthdrp) {
1651 	case IPPROTO_HOPOPTS:
1652 	case IPPROTO_DSTOPTS:
1653 	case IPPROTO_ROUTING:
1654 	case IPPROTO_FRAGMENT:
1655 		/*
1656 		 * If any know extension headers are still to be processed,
1657 		 * the packet's malformed (or at least all the IP header(s) are
1658 		 * not in the same mblk - and that should never happen.
1659 		 */
1660 		return (B_FALSE);
1661 
1662 	default:
1663 		/*
1664 		 * If we get here, we know that all of the IP headers were in
1665 		 * the same mblk, even if the ULP header is in the next mblk.
1666 		 */
1667 		*hdr_length = length;
1668 		*next_hdr = *nexthdrp;
1669 		return (B_TRUE);
1670 	}
1671 }
1672 
/*
 * The routines below handle interrupt re-targeting for legacy (fixed)
 * interrupts. Some older versions of popular NICs such as e1000g do
 * not support MSI-X and reserve fixed interrupts for their RX/TX
 * rings. Re-targeting those interrupts has to go through the PCITOOL
 * ioctls.
 *
 * mac_dladm_intr_t carries the state shared between these routines.
 */
typedef struct mac_dladm_intr {
	/* interrupt number reported for the matching device */
	int	ino;
	/* CPU reported for that interrupt */
	int	cpu_id;
	/* minor-node path of the driver, compared against PCITOOL devs */
	char	driver_path[MAXPATHLEN];
	/* "/devices<nexus>:intr" node the PCITOOL ioctls are issued on */
	char	nexus_path[MAXPATHLEN];
} mac_dladm_intr_t;
1686 
1687 /* Bind the interrupt to cpu_num */
1688 static int
1689 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
1690 {
1691 	pcitool_intr_set_t	iset;
1692 	int			err;
1693 
1694 	iset.old_cpu = oldcpuid;
1695 	iset.ino = ino;
1696 	iset.cpu_id = cpu_num;
1697 	iset.user_version = PCITOOL_VERSION;
1698 	err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
1699 	    kcred, NULL);
1700 
1701 	return (err);
1702 }
1703 
1704 /*
1705  * Search interrupt information. iget is filled in with the info to search
1706  */
1707 static boolean_t
1708 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
1709 {
1710 	int	i;
1711 	char	driver_path[2 * MAXPATHLEN];
1712 
1713 	for (i = 0; i < iget_p->num_devs; i++) {
1714 		(void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
1715 		(void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
1716 		    ":%s%d", iget_p->dev[i].driver_name,
1717 		    iget_p->dev[i].dev_inst);
1718 		/* Match the device path for the device path */
1719 		if (strcmp(driver_path, dln->driver_path) == 0) {
1720 			dln->ino = iget_p->ino;
1721 			dln->cpu_id = iget_p->cpu_id;
1722 			return (B_TRUE);
1723 		}
1724 	}
1725 	return (B_FALSE);
1726 }
1727 
1728 /*
1729  * Get information about ino, i.e. if this is the interrupt for our
1730  * device and where it is bound etc.
1731  */
1732 static boolean_t
1733 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
1734     mac_dladm_intr_t *dln)
1735 {
1736 	pcitool_intr_get_t	*iget_p;
1737 	int			ipsz;
1738 	int			nipsz;
1739 	int			err;
1740 	uint8_t			inum;
1741 
1742 	/*
1743 	 * Check if SLEEP is OK, i.e if could come here in response to
1744 	 * changing the fanout due to some callback from the driver, say
1745 	 * link speed changes.
1746 	 */
1747 	ipsz = PCITOOL_IGET_SIZE(0);
1748 	iget_p = kmem_zalloc(ipsz, KM_SLEEP);
1749 
1750 	iget_p->num_devs_ret = 0;
1751 	iget_p->user_version = PCITOOL_VERSION;
1752 	iget_p->cpu_id = oldcpuid;
1753 	iget_p->ino = ino;
1754 
1755 	err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
1756 	    FKIOCTL, kcred, NULL);
1757 	if (err != 0) {
1758 		kmem_free(iget_p, ipsz);
1759 		return (B_FALSE);
1760 	}
1761 	if (iget_p->num_devs == 0) {
1762 		kmem_free(iget_p, ipsz);
1763 		return (B_FALSE);
1764 	}
1765 	inum = iget_p->num_devs;
1766 	if (iget_p->num_devs_ret < iget_p->num_devs) {
1767 		/* Reallocate */
1768 		nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
1769 
1770 		kmem_free(iget_p, ipsz);
1771 		ipsz = nipsz;
1772 		iget_p = kmem_zalloc(ipsz, KM_SLEEP);
1773 
1774 		iget_p->num_devs_ret = inum;
1775 		iget_p->cpu_id = oldcpuid;
1776 		iget_p->ino = ino;
1777 		iget_p->user_version = PCITOOL_VERSION;
1778 		err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
1779 		    FKIOCTL, kcred, NULL);
1780 		if (err != 0) {
1781 			kmem_free(iget_p, ipsz);
1782 			return (B_FALSE);
1783 		}
1784 		/* defensive */
1785 		if (iget_p->num_devs != iget_p->num_devs_ret) {
1786 			kmem_free(iget_p, ipsz);
1787 			return (B_FALSE);
1788 		}
1789 	}
1790 
1791 	if (mac_search_intrinfo(iget_p, dln)) {
1792 		kmem_free(iget_p, ipsz);
1793 		return (B_TRUE);
1794 	}
1795 	kmem_free(iget_p, ipsz);
1796 	return (B_FALSE);
1797 }
1798 
1799 /*
1800  * Get the interrupts and check each one to see if it is for our device.
1801  */
1802 static int
1803 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
1804 {
1805 	pcitool_intr_info_t	intr_info;
1806 	int			err;
1807 	int			ino;
1808 	int			oldcpuid;
1809 
1810 	err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
1811 	    FKIOCTL, kcred, NULL);
1812 	if (err != 0)
1813 		return (-1);
1814 
1815 	for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
1816 		for (ino = 0; ino < intr_info.num_intr; ino++) {
1817 			if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
1818 				if (dln->cpu_id == cpuid)
1819 					return (0);
1820 				return (1);
1821 			}
1822 		}
1823 	}
1824 	return (-1);
1825 }
1826 
1827 /*
1828  * Obtain the nexus parent node info. for mdip.
1829  */
1830 static dev_info_t *
1831 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
1832 {
1833 	struct dev_info		*tdip = (struct dev_info *)mdip;
1834 	struct ddi_minor_data	*minordata;
1835 	int			circ;
1836 	dev_info_t		*pdip;
1837 	char			pathname[MAXPATHLEN];
1838 
1839 	while (tdip != NULL) {
1840 		/*
1841 		 * The netboot code could call this function while walking the
1842 		 * device tree so we need to use ndi_devi_tryenter() here to
1843 		 * avoid deadlock.
1844 		 */
1845 		if (ndi_devi_tryenter((dev_info_t *)tdip, &circ) == 0)
1846 			break;
1847 
1848 		for (minordata = tdip->devi_minor; minordata != NULL;
1849 		    minordata = minordata->next) {
1850 			if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
1851 			    strlen(DDI_NT_INTRCTL)) == 0) {
1852 				pdip = minordata->dip;
1853 				(void) ddi_pathname(pdip, pathname);
1854 				(void) snprintf(dln->nexus_path, MAXPATHLEN,
1855 				    "/devices%s:intr", pathname);
1856 				(void) ddi_pathname_minor(minordata, pathname);
1857 				ndi_devi_exit((dev_info_t *)tdip, circ);
1858 				return (pdip);
1859 			}
1860 		}
1861 		ndi_devi_exit((dev_info_t *)tdip, circ);
1862 		tdip = tdip->devi_parent;
1863 	}
1864 	return (NULL);
1865 }
1866 
1867 /*
1868  * For a primary MAC client, if the user has set a list or CPUs or
1869  * we have obtained it implicitly, we try to retarget the interrupt
1870  * for that device on one of the CPUs in the list.
1871  * We assign the interrupt to the same CPU as the poll thread.
1872  */
1873 static boolean_t
1874 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
1875 {
1876 	ldi_handle_t		lh = NULL;
1877 	ldi_ident_t		li = NULL;
1878 	int			err;
1879 	int			ret;
1880 	mac_dladm_intr_t	dln;
1881 	dev_info_t		*dip;
1882 	struct ddi_minor_data	*minordata;
1883 
1884 	dln.nexus_path[0] = '\0';
1885 	dln.driver_path[0] = '\0';
1886 
1887 	minordata = ((struct dev_info *)mdip)->devi_minor;
1888 	while (minordata != NULL) {
1889 		if (minordata->type == DDM_MINOR)
1890 			break;
1891 		minordata = minordata->next;
1892 	}
1893 	if (minordata == NULL)
1894 		return (B_FALSE);
1895 
1896 	(void) ddi_pathname_minor(minordata, dln.driver_path);
1897 
1898 	dip = mac_get_nexus_node(mdip, &dln);
1899 	/* defensive */
1900 	if (dip == NULL)
1901 		return (B_FALSE);
1902 
1903 	err = ldi_ident_from_major(ddi_driver_major(dip), &li);
1904 	if (err != 0)
1905 		return (B_FALSE);
1906 
1907 	err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
1908 	if (err != 0)
1909 		return (B_FALSE);
1910 
1911 	ret = mac_validate_intr(lh, &dln, cpuid);
1912 	if (ret < 0) {
1913 		(void) ldi_close(lh, FREAD|FWRITE, kcred);
1914 		return (B_FALSE);
1915 	}
1916 	/* cmn_note? */
1917 	if (ret != 0)
1918 		if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
1919 		    != 0) {
1920 			(void) ldi_close(lh, FREAD|FWRITE, kcred);
1921 			return (B_FALSE);
1922 		}
1923 	(void) ldi_close(lh, FREAD|FWRITE, kcred);
1924 	return (B_TRUE);
1925 }
1926 
1927 void
1928 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
1929 {
1930 	dev_info_t		*mdip = (dev_info_t *)arg;
1931 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1932 	mac_resource_props_t	*mrp;
1933 	mac_perim_handle_t	mph;
1934 	flow_entry_t		*flent = mcip->mci_flent;
1935 	mac_soft_ring_set_t	*rx_srs;
1936 	mac_cpus_t		*srs_cpu;
1937 
1938 	if (!mac_check_interrupt_binding(mdip, cpuid))
1939 		cpuid = -1;
1940 	mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
1941 	mrp = MCIP_RESOURCE_PROPS(mcip);
1942 	mrp->mrp_rx_intr_cpu = cpuid;
1943 	if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
1944 		rx_srs = flent->fe_rx_srs[1];
1945 		srs_cpu = &rx_srs->srs_cpu;
1946 		srs_cpu->mc_rx_intr_cpu = cpuid;
1947 	}
1948 	mac_perim_exit(mph);
1949 }
1950 
1951 int32_t
1952 mac_client_intr_cpu(mac_client_handle_t mch)
1953 {
1954 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1955 	mac_cpus_t		*srs_cpu;
1956 	mac_soft_ring_set_t	*rx_srs;
1957 	flow_entry_t		*flent = mcip->mci_flent;
1958 	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
1959 	mac_ring_t		*ring;
1960 	mac_intr_t		*mintr;
1961 
1962 	/*
1963 	 * Check if we need to retarget the interrupt. We do this only
1964 	 * for the primary MAC client. We do this if we have the only
1965 	 * exclusive ring in the group.
1966 	 */
1967 	if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
1968 		rx_srs = flent->fe_rx_srs[1];
1969 		srs_cpu = &rx_srs->srs_cpu;
1970 		ring = rx_srs->srs_ring;
1971 		mintr = &ring->mr_info.mri_intr;
1972 		/*
1973 		 * If ddi_handle is present or the poll CPU is
1974 		 * already bound to the interrupt CPU, return -1.
1975 		 */
1976 		if (mintr->mi_ddi_handle != NULL ||
1977 		    ((mrp->mrp_ncpus != 0) &&
1978 		    (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
1979 			return (-1);
1980 		}
1981 		return (srs_cpu->mc_rx_pollid);
1982 	}
1983 	return (-1);
1984 }
1985 
1986 void *
1987 mac_get_devinfo(mac_handle_t mh)
1988 {
1989 	mac_impl_t	*mip = (mac_impl_t *)mh;
1990 
1991 	return ((void *)mip->mi_dip);
1992 }
1993 
/*
 * Byte-wise XOR folds over the first 2, 4 or 6 bytes of a byte array.
 * The argument is evaluated more than once, so it must be free of side
 * effects.
 */
#define	PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
#define	PKT_HASH_4BYTES(x) (PKT_HASH_2BYTES(x) ^ (x)[2] ^ (x)[3])
#define	PKT_HASH_MAC(x) (PKT_HASH_4BYTES(x) ^ (x)[4] ^ (x)[5])
1997 
1998 uint64_t
1999 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
2000 {
2001 	struct ether_header *ehp;
2002 	uint64_t hash = 0;
2003 	uint16_t sap;
2004 	uint_t skip_len;
2005 	uint8_t proto;
2006 	boolean_t ip_fragmented;
2007 
2008 	/*
2009 	 * We may want to have one of these per MAC type plugin in the
2010 	 * future. For now supports only ethernet.
2011 	 */
2012 	if (media != DL_ETHER)
2013 		return (0L);
2014 
2015 	/* for now we support only outbound packets */
2016 	ASSERT(is_outbound);
2017 	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
2018 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
2019 
2020 	/* compute L2 hash */
2021 
2022 	ehp = (struct ether_header *)mp->b_rptr;
2023 
2024 	if ((policy & MAC_PKT_HASH_L2) != 0) {
2025 		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
2026 		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
2027 		hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
2028 		policy &= ~MAC_PKT_HASH_L2;
2029 	}
2030 
2031 	if (policy == 0)
2032 		goto done;
2033 
2034 	/* skip ethernet header */
2035 
2036 	sap = ntohs(ehp->ether_type);
2037 	if (sap == ETHERTYPE_VLAN) {
2038 		struct ether_vlan_header *evhp;
2039 		mblk_t *newmp = NULL;
2040 
2041 		skip_len = sizeof (struct ether_vlan_header);
2042 		if (MBLKL(mp) < skip_len) {
2043 			/* the vlan tag is the payload, pull up first */
2044 			newmp = msgpullup(mp, -1);
2045 			if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
2046 				goto done;
2047 			}
2048 			evhp = (struct ether_vlan_header *)newmp->b_rptr;
2049 		} else {
2050 			evhp = (struct ether_vlan_header *)mp->b_rptr;
2051 		}
2052 
2053 		sap = ntohs(evhp->ether_type);
2054 		freemsg(newmp);
2055 	} else {
2056 		skip_len = sizeof (struct ether_header);
2057 	}
2058 
2059 	/* if ethernet header is in its own mblk, skip it */
2060 	if (MBLKL(mp) <= skip_len) {
2061 		skip_len -= MBLKL(mp);
2062 		mp = mp->b_cont;
2063 		if (mp == NULL)
2064 			goto done;
2065 	}
2066 
2067 	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
2068 
2069 	/* compute IP src/dst addresses hash and skip IPv{4,6} header */
2070 
2071 	switch (sap) {
2072 	case ETHERTYPE_IP: {
2073 		ipha_t *iphp;
2074 
2075 		/*
2076 		 * If the header is not aligned or the header doesn't fit
2077 		 * in the mblk, bail now. Note that this may cause packets
2078 		 * reordering.
2079 		 */
2080 		iphp = (ipha_t *)(mp->b_rptr + skip_len);
2081 		if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
2082 		    !OK_32PTR((char *)iphp))
2083 			goto done;
2084 
2085 		proto = iphp->ipha_protocol;
2086 		skip_len += IPH_HDR_LENGTH(iphp);
2087 
2088 		/* Check if the packet is fragmented. */
2089 		ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
2090 		    IPH_OFFSET;
2091 
2092 		/*
2093 		 * For fragmented packets, use addresses in addition to
2094 		 * the frag_id to generate the hash inorder to get
2095 		 * better distribution.
2096 		 */
2097 		if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
2098 			uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
2099 			uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
2100 
2101 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
2102 			    PKT_HASH_4BYTES(ip_dst));
2103 			policy &= ~MAC_PKT_HASH_L3;
2104 		}
2105 
2106 		if (ip_fragmented) {
2107 			uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
2108 			hash ^= PKT_HASH_2BYTES(identp);
2109 			goto done;
2110 		}
2111 		break;
2112 	}
2113 	case ETHERTYPE_IPV6: {
2114 		ip6_t *ip6hp;
2115 		ip6_frag_t *frag = NULL;
2116 		uint16_t hdr_length;
2117 
2118 		/*
2119 		 * If the header is not aligned or the header doesn't fit
2120 		 * in the mblk, bail now. Note that this may cause packets
2121 		 * reordering.
2122 		 */
2123 
2124 		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
2125 		if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
2126 		    !OK_32PTR((char *)ip6hp))
2127 			goto done;
2128 
2129 		if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
2130 		    &proto, &frag))
2131 			goto done;
2132 		skip_len += hdr_length;
2133 
2134 		/*
2135 		 * For fragmented packets, use addresses in addition to
2136 		 * the frag_id to generate the hash inorder to get
2137 		 * better distribution.
2138 		 */
2139 		if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
2140 			uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
2141 			uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
2142 
2143 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
2144 			    PKT_HASH_4BYTES(ip_dst));
2145 			policy &= ~MAC_PKT_HASH_L3;
2146 		}
2147 
2148 		if (frag != NULL) {
2149 			uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
2150 			hash ^= PKT_HASH_4BYTES(identp);
2151 			goto done;
2152 		}
2153 		break;
2154 	}
2155 	default:
2156 		goto done;
2157 	}
2158 
2159 	if (policy == 0)
2160 		goto done;
2161 
2162 	/* if ip header is in its own mblk, skip it */
2163 	if (MBLKL(mp) <= skip_len) {
2164 		skip_len -= MBLKL(mp);
2165 		mp = mp->b_cont;
2166 		if (mp == NULL)
2167 			goto done;
2168 	}
2169 
2170 	/* parse ULP header */
2171 again:
2172 	switch (proto) {
2173 	case IPPROTO_TCP:
2174 	case IPPROTO_UDP:
2175 	case IPPROTO_ESP:
2176 	case IPPROTO_SCTP:
2177 		/*
2178 		 * These Internet Protocols are intentionally designed
2179 		 * for hashing from the git-go.  Port numbers are in the first
2180 		 * word for transports, SPI is first for ESP.
2181 		 */
2182 		if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
2183 			goto done;
2184 		hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
2185 		break;
2186 
2187 	case IPPROTO_AH: {
2188 		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
2189 		uint_t ah_length = AH_TOTAL_LEN(ah);
2190 
2191 		if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
2192 			goto done;
2193 
2194 		proto = ah->ah_nexthdr;
2195 		skip_len += ah_length;
2196 
2197 		/* if AH header is in its own mblk, skip it */
2198 		if (MBLKL(mp) <= skip_len) {
2199 			skip_len -= MBLKL(mp);
2200 			mp = mp->b_cont;
2201 			if (mp == NULL)
2202 				goto done;
2203 		}
2204 
2205 		goto again;
2206 	}
2207 	}
2208 
2209 done:
2210 	return (hash);
2211 }
2212