xref: /illumos-gate/usr/src/uts/common/io/mac/mac_util.c (revision a386cc11a86ecb60f5a48078d22c1500e2ad003e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * MAC Services Module - misc utilities
27  */
28 
29 #include <sys/types.h>
30 #include <sys/mac.h>
31 #include <sys/mac_impl.h>
32 #include <sys/mac_client_priv.h>
33 #include <sys/mac_client_impl.h>
34 #include <sys/mac_soft_ring.h>
35 #include <sys/strsubr.h>
36 #include <sys/strsun.h>
37 #include <sys/vlan.h>
38 #include <sys/pattr.h>
39 #include <sys/pci_tools.h>
40 #include <inet/ip.h>
41 #include <inet/ip_impl.h>
42 #include <inet/ip6.h>
43 #include <sys/vtrace.h>
44 #include <sys/dlpi.h>
45 #include <sys/sunndi.h>
46 #include <inet/ipsec_impl.h>
47 #include <inet/sadb.h>
48 #include <inet/ipsecesp.h>
49 #include <inet/ipsecah.h>
50 
51 /*
52  * Copy an mblk, preserving its hardware checksum flags.
53  */
54 static mblk_t *
55 mac_copymsg_cksum(mblk_t *mp)
56 {
57 	mblk_t *mp1;
58 	uint32_t start, stuff, end, value, flags;
59 
60 	mp1 = copymsg(mp);
61 	if (mp1 == NULL)
62 		return (NULL);
63 
64 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
65 	(void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
66 	    flags, KM_NOSLEEP);
67 
68 	return (mp1);
69 }
70 
71 /*
72  * Copy an mblk chain, presenting the hardware checksum flags of the
73  * individual mblks.
74  */
75 mblk_t *
76 mac_copymsgchain_cksum(mblk_t *mp)
77 {
78 	mblk_t *nmp = NULL;
79 	mblk_t **nmpp = &nmp;
80 
81 	for (; mp != NULL; mp = mp->b_next) {
82 		if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
83 			freemsgchain(nmp);
84 			return (NULL);
85 		}
86 
87 		nmpp = &((*nmpp)->b_next);
88 	}
89 
90 	return (nmp);
91 }
92 
93 /*
94  * Process the specified mblk chain for proper handling of hardware
95  * checksum offload. This routine is invoked for loopback traffic
96  * between MAC clients.
97  * The function handles a NULL mblk chain passed as argument.
98  */
99 mblk_t *
100 mac_fix_cksum(mblk_t *mp_chain)
101 {
102 	mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
103 	uint32_t flags, start, stuff, end, value;
104 
105 	for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
106 		uint16_t len;
107 		uint32_t offset;
108 		struct ether_header *ehp;
109 		uint16_t sap;
110 
111 		hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
112 		    &flags);
113 		if (flags == 0)
114 			continue;
115 
116 		/*
117 		 * Since the processing of checksum offload for loopback
118 		 * traffic requires modification of the packet contents,
119 		 * ensure sure that we are always modifying our own copy.
120 		 */
121 		if (DB_REF(mp) > 1) {
122 			mp1 = copymsg(mp);
123 			if (mp1 == NULL)
124 				continue;
125 			mp1->b_next = mp->b_next;
126 			mp->b_next = NULL;
127 			freemsg(mp);
128 			if (prev != NULL)
129 				prev->b_next = mp1;
130 			else
131 				new_chain = mp1;
132 			mp = mp1;
133 		}
134 
135 		/*
136 		 * Ethernet, and optionally VLAN header.
137 		 */
138 		/* LINTED: improper alignment cast */
139 		ehp = (struct ether_header *)mp->b_rptr;
140 		if (ntohs(ehp->ether_type) == VLAN_TPID) {
141 			struct ether_vlan_header *evhp;
142 
143 			ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
144 			/* LINTED: improper alignment cast */
145 			evhp = (struct ether_vlan_header *)mp->b_rptr;
146 			sap = ntohs(evhp->ether_type);
147 			offset = sizeof (struct ether_vlan_header);
148 		} else {
149 			sap = ntohs(ehp->ether_type);
150 			offset = sizeof (struct ether_header);
151 		}
152 
153 		if (MBLKL(mp) <= offset) {
154 			offset -= MBLKL(mp);
155 			if (mp->b_cont == NULL) {
156 				/* corrupted packet, skip it */
157 				if (prev != NULL)
158 					prev->b_next = mp->b_next;
159 				else
160 					new_chain = mp->b_next;
161 				mp1 = mp->b_next;
162 				mp->b_next = NULL;
163 				freemsg(mp);
164 				mp = mp1;
165 				continue;
166 			}
167 			mp = mp->b_cont;
168 		}
169 
170 		if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
171 			ipha_t *ipha = NULL;
172 
173 			/*
174 			 * In order to compute the full and header
175 			 * checksums, we need to find and parse
176 			 * the IP and/or ULP headers.
177 			 */
178 
179 			sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
180 
181 			/*
182 			 * IP header.
183 			 */
184 			if (sap != ETHERTYPE_IP)
185 				continue;
186 
187 			ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
188 			/* LINTED: improper alignment cast */
189 			ipha = (ipha_t *)(mp->b_rptr + offset);
190 
191 			if (flags & HCK_FULLCKSUM) {
192 				ipaddr_t src, dst;
193 				uint32_t cksum;
194 				uint16_t *up;
195 				uint8_t proto;
196 
197 				/*
198 				 * Pointer to checksum field in ULP header.
199 				 */
200 				proto = ipha->ipha_protocol;
201 				ASSERT(ipha->ipha_version_and_hdr_length ==
202 				    IP_SIMPLE_HDR_VERSION);
203 
204 				switch (proto) {
205 				case IPPROTO_TCP:
206 					/* LINTED: improper alignment cast */
207 					up = IPH_TCPH_CHECKSUMP(ipha,
208 					    IP_SIMPLE_HDR_LENGTH);
209 					break;
210 
211 				case IPPROTO_UDP:
212 					/* LINTED: improper alignment cast */
213 					up = IPH_UDPH_CHECKSUMP(ipha,
214 					    IP_SIMPLE_HDR_LENGTH);
215 					break;
216 
217 				default:
218 					cmn_err(CE_WARN, "mac_fix_cksum: "
219 					    "unexpected protocol: %d", proto);
220 					continue;
221 				}
222 
223 				/*
224 				 * Pseudo-header checksum.
225 				 */
226 				src = ipha->ipha_src;
227 				dst = ipha->ipha_dst;
228 				len = ntohs(ipha->ipha_length) -
229 				    IP_SIMPLE_HDR_LENGTH;
230 
231 				cksum = (dst >> 16) + (dst & 0xFFFF) +
232 				    (src >> 16) + (src & 0xFFFF);
233 				cksum += htons(len);
234 
235 				/*
236 				 * The checksum value stored in the packet needs
237 				 * to be correct. Compute it here.
238 				 */
239 				*up = 0;
240 				cksum += (((proto) == IPPROTO_UDP) ?
241 				    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
242 				cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
243 				    offset, cksum);
244 				*(up) = (uint16_t)(cksum ? cksum : ~cksum);
245 
246 				/*
247 				 * Flag the packet so that it appears
248 				 * that the checksum has already been
249 				 * verified by the hardware.
250 				 */
251 				flags &= ~HCK_FULLCKSUM;
252 				flags |= HCK_FULLCKSUM_OK;
253 				value = 0;
254 			}
255 
256 			if (flags & HCK_IPV4_HDRCKSUM) {
257 				ASSERT(ipha != NULL);
258 				ipha->ipha_hdr_checksum =
259 				    (uint16_t)ip_csum_hdr(ipha);
260 				flags &= ~HCK_IPV4_HDRCKSUM;
261 				flags |= HCK_IPV4_HDRCKSUM_OK;
262 
263 			}
264 		}
265 
266 		if (flags & HCK_PARTIALCKSUM) {
267 			uint16_t *up, partial, cksum;
268 			uchar_t *ipp; /* ptr to beginning of IP header */
269 
270 			if (mp->b_cont != NULL) {
271 				mblk_t *mp1;
272 
273 				mp1 = msgpullup(mp, offset + end);
274 				if (mp1 == NULL)
275 					continue;
276 				mp1->b_next = mp->b_next;
277 				mp->b_next = NULL;
278 				freemsg(mp);
279 				if (prev != NULL)
280 					prev->b_next = mp1;
281 				else
282 					new_chain = mp1;
283 				mp = mp1;
284 			}
285 
286 			ipp = mp->b_rptr + offset;
287 			/* LINTED: cast may result in improper alignment */
288 			up = (uint16_t *)((uchar_t *)ipp + stuff);
289 			partial = *up;
290 			*up = 0;
291 
292 			cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
293 			    end - start, partial);
294 			cksum = ~cksum;
295 			*up = cksum ? cksum : ~cksum;
296 
297 			/*
298 			 * Since we already computed the whole checksum,
299 			 * indicate to the stack that it has already
300 			 * been verified by the hardware.
301 			 */
302 			flags &= ~HCK_PARTIALCKSUM;
303 			flags |= HCK_FULLCKSUM_OK;
304 			value = 0;
305 		}
306 
307 		(void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
308 		    value, flags, KM_NOSLEEP);
309 	}
310 
311 	return (new_chain);
312 }
313 
314 /*
315  * Add VLAN tag to the specified mblk.
316  */
317 mblk_t *
318 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
319 {
320 	mblk_t *hmp;
321 	struct ether_vlan_header *evhp;
322 	struct ether_header *ehp;
323 	uint32_t start, stuff, end, value, flags;
324 
325 	ASSERT(pri != 0 || vid != 0);
326 
327 	/*
328 	 * Allocate an mblk for the new tagged ethernet header,
329 	 * and copy the MAC addresses and ethertype from the
330 	 * original header.
331 	 */
332 
333 	hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
334 	if (hmp == NULL) {
335 		freemsg(mp);
336 		return (NULL);
337 	}
338 
339 	evhp = (struct ether_vlan_header *)hmp->b_rptr;
340 	ehp = (struct ether_header *)mp->b_rptr;
341 
342 	bcopy(ehp, evhp, (ETHERADDRL * 2));
343 	evhp->ether_type = ehp->ether_type;
344 	evhp->ether_tpid = htons(ETHERTYPE_VLAN);
345 
346 	hmp->b_wptr += sizeof (struct ether_vlan_header);
347 	mp->b_rptr += sizeof (struct ether_header);
348 
349 	/*
350 	 * Free the original message if it's now empty. Link the
351 	 * rest of messages to the header message.
352 	 */
353 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
354 	(void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags,
355 	    KM_NOSLEEP);
356 	if (MBLKL(mp) == 0) {
357 		hmp->b_cont = mp->b_cont;
358 		freeb(mp);
359 	} else {
360 		hmp->b_cont = mp;
361 	}
362 	ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
363 
364 	/*
365 	 * Initialize the new TCI (Tag Control Information).
366 	 */
367 	evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
368 
369 	return (hmp);
370 }
371 
372 /*
373  * Adds a VLAN tag with the specified VID and priority to each mblk of
374  * the specified chain.
375  */
376 mblk_t *
377 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
378 {
379 	mblk_t *next_mp, **prev, *mp;
380 
381 	mp = mp_chain;
382 	prev = &mp_chain;
383 
384 	while (mp != NULL) {
385 		next_mp = mp->b_next;
386 		mp->b_next = NULL;
387 		if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
388 			freemsgchain(next_mp);
389 			break;
390 		}
391 		*prev = mp;
392 		prev = &mp->b_next;
393 		mp = mp->b_next = next_mp;
394 	}
395 
396 	return (mp_chain);
397 }
398 
399 /*
400  * Strip VLAN tag
401  */
402 mblk_t *
403 mac_strip_vlan_tag(mblk_t *mp)
404 {
405 	mblk_t *newmp;
406 	struct ether_vlan_header *evhp;
407 
408 	evhp = (struct ether_vlan_header *)mp->b_rptr;
409 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
410 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
411 
412 		if (DB_REF(mp) > 1) {
413 			newmp = copymsg(mp);
414 			if (newmp == NULL)
415 				return (NULL);
416 			freemsg(mp);
417 			mp = newmp;
418 		}
419 
420 		evhp = (struct ether_vlan_header *)mp->b_rptr;
421 
422 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
423 		mp->b_rptr += VLAN_TAGSZ;
424 	}
425 	return (mp);
426 }
427 
428 /*
429  * Strip VLAN tag from each mblk of the chain.
430  */
431 mblk_t *
432 mac_strip_vlan_tag_chain(mblk_t *mp_chain)
433 {
434 	mblk_t *mp, *next_mp, **prev;
435 
436 	mp = mp_chain;
437 	prev = &mp_chain;
438 
439 	while (mp != NULL) {
440 		next_mp = mp->b_next;
441 		mp->b_next = NULL;
442 		if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
443 			freemsgchain(next_mp);
444 			break;
445 		}
446 		*prev = mp;
447 		prev = &mp->b_next;
448 		mp = mp->b_next = next_mp;
449 	}
450 
451 	return (mp_chain);
452 }
453 
454 /*
455  * Default callback function. Used when the datapath is not yet initialized.
456  */
457 /* ARGSUSED */
458 void
459 mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp,
460     boolean_t loopback)
461 {
462 	mblk_t	*mp1 = mp;
463 
464 	while (mp1 != NULL) {
465 		mp1->b_prev = NULL;
466 		mp1->b_queue = NULL;
467 		mp1 = mp1->b_next;
468 	}
469 	freemsgchain(mp);
470 }
471 
472 /*
473  * Determines the IPv6 header length accounting for all the optional IPv6
474  * headers (hop-by-hop, destination, routing and fragment). The header length
475  * and next header value (a transport header) is captured.
476  *
477  * Returns B_FALSE if all the IP headers are not in the same mblk otherwise
478  * returns B_TRUE.
479  */
480 boolean_t
481 mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
482     uint8_t *next_hdr, ip6_frag_t **fragp)
483 {
484 	uint16_t length;
485 	uint_t	ehdrlen;
486 	uint8_t *whereptr;
487 	uint8_t *nexthdrp;
488 	ip6_dest_t *desthdr;
489 	ip6_rthdr_t *rthdr;
490 	ip6_frag_t *fraghdr;
491 
492 	if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
493 		return (B_FALSE);
494 	ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
495 	length = IPV6_HDR_LEN;
496 	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
497 
498 	if (fragp != NULL)
499 		*fragp = NULL;
500 
501 	nexthdrp = &ip6h->ip6_nxt;
502 	while (whereptr < endptr) {
503 		/* Is there enough left for len + nexthdr? */
504 		if (whereptr + MIN_EHDR_LEN > endptr)
505 			break;
506 
507 		switch (*nexthdrp) {
508 		case IPPROTO_HOPOPTS:
509 		case IPPROTO_DSTOPTS:
510 			/* Assumes the headers are identical for hbh and dst */
511 			desthdr = (ip6_dest_t *)whereptr;
512 			ehdrlen = 8 * (desthdr->ip6d_len + 1);
513 			if ((uchar_t *)desthdr +  ehdrlen > endptr)
514 				return (B_FALSE);
515 			nexthdrp = &desthdr->ip6d_nxt;
516 			break;
517 		case IPPROTO_ROUTING:
518 			rthdr = (ip6_rthdr_t *)whereptr;
519 			ehdrlen =  8 * (rthdr->ip6r_len + 1);
520 			if ((uchar_t *)rthdr +  ehdrlen > endptr)
521 				return (B_FALSE);
522 			nexthdrp = &rthdr->ip6r_nxt;
523 			break;
524 		case IPPROTO_FRAGMENT:
525 			fraghdr = (ip6_frag_t *)whereptr;
526 			ehdrlen = sizeof (ip6_frag_t);
527 			if ((uchar_t *)&fraghdr[1] > endptr)
528 				return (B_FALSE);
529 			nexthdrp = &fraghdr->ip6f_nxt;
530 			if (fragp != NULL)
531 				*fragp = fraghdr;
532 			break;
533 		case IPPROTO_NONE:
534 			/* No next header means we're finished */
535 		default:
536 			*hdr_length = length;
537 			*next_hdr = *nexthdrp;
538 			return (B_TRUE);
539 		}
540 		length += ehdrlen;
541 		whereptr += ehdrlen;
542 		*hdr_length = length;
543 		*next_hdr = *nexthdrp;
544 	}
545 	switch (*nexthdrp) {
546 	case IPPROTO_HOPOPTS:
547 	case IPPROTO_DSTOPTS:
548 	case IPPROTO_ROUTING:
549 	case IPPROTO_FRAGMENT:
550 		/*
551 		 * If any know extension headers are still to be processed,
552 		 * the packet's malformed (or at least all the IP header(s) are
553 		 * not in the same mblk - and that should never happen.
554 		 */
555 		return (B_FALSE);
556 
557 	default:
558 		/*
559 		 * If we get here, we know that all of the IP headers were in
560 		 * the same mblk, even if the ULP header is in the next mblk.
561 		 */
562 		*hdr_length = length;
563 		*next_hdr = *nexthdrp;
564 		return (B_TRUE);
565 	}
566 }
567 
568 /*
569  * The following set of routines are there to take care of interrupt
570  * re-targeting for legacy (fixed) interrupts. Some older versions
571  * of the popular NICs like e1000g do not support MSI-X interrupts
572  * and they reserve fixed interrupts for RX/TX rings. To re-target
573  * these interrupts, PCITOOL ioctls need to be used.
574  */
575 typedef struct mac_dladm_intr {
576 	int	ino;
577 	int	cpu_id;
578 	char	driver_path[MAXPATHLEN];
579 	char	nexus_path[MAXPATHLEN];
580 } mac_dladm_intr_t;
581 
582 /* Bind the interrupt to cpu_num */
583 static int
584 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
585 {
586 	pcitool_intr_set_t	iset;
587 	int			err;
588 
589 	iset.old_cpu = oldcpuid;
590 	iset.ino = ino;
591 	iset.cpu_id = cpu_num;
592 	iset.user_version = PCITOOL_VERSION;
593 	err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
594 	    kcred, NULL);
595 
596 	return (err);
597 }
598 
599 /*
600  * Search interrupt information. iget is filled in with the info to search
601  */
602 static boolean_t
603 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
604 {
605 	int	i;
606 	char	driver_path[2 * MAXPATHLEN];
607 
608 	for (i = 0; i < iget_p->num_devs; i++) {
609 		(void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
610 		(void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
611 		    ":%s%d", iget_p->dev[i].driver_name,
612 		    iget_p->dev[i].dev_inst);
613 		/* Match the device path for the device path */
614 		if (strcmp(driver_path, dln->driver_path) == 0) {
615 			dln->ino = iget_p->ino;
616 			dln->cpu_id = iget_p->cpu_id;
617 			return (B_TRUE);
618 		}
619 	}
620 	return (B_FALSE);
621 }
622 
623 /*
624  * Get information about ino, i.e. if this is the interrupt for our
625  * device and where it is bound etc.
626  */
627 static boolean_t
628 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
629     mac_dladm_intr_t *dln)
630 {
631 	pcitool_intr_get_t	*iget_p;
632 	int			ipsz;
633 	int			nipsz;
634 	int			err;
635 	uint8_t			inum;
636 
637 	/*
638 	 * Check if SLEEP is OK, i.e if could come here in response to
639 	 * changing the fanout due to some callback from the driver, say
640 	 * link speed changes.
641 	 */
642 	ipsz = PCITOOL_IGET_SIZE(0);
643 	iget_p = kmem_zalloc(ipsz, KM_SLEEP);
644 
645 	iget_p->num_devs_ret = 0;
646 	iget_p->user_version = PCITOOL_VERSION;
647 	iget_p->cpu_id = oldcpuid;
648 	iget_p->ino = ino;
649 
650 	err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
651 	    FKIOCTL, kcred, NULL);
652 	if (err != 0) {
653 		kmem_free(iget_p, ipsz);
654 		return (B_FALSE);
655 	}
656 	if (iget_p->num_devs == 0) {
657 		kmem_free(iget_p, ipsz);
658 		return (B_FALSE);
659 	}
660 	inum = iget_p->num_devs;
661 	if (iget_p->num_devs_ret < iget_p->num_devs) {
662 		/* Reallocate */
663 		nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
664 
665 		kmem_free(iget_p, ipsz);
666 		ipsz = nipsz;
667 		iget_p = kmem_zalloc(ipsz, KM_SLEEP);
668 
669 		iget_p->num_devs_ret = inum;
670 		iget_p->cpu_id = oldcpuid;
671 		iget_p->ino = ino;
672 		iget_p->user_version = PCITOOL_VERSION;
673 		err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
674 		    FKIOCTL, kcred, NULL);
675 		if (err != 0) {
676 			kmem_free(iget_p, ipsz);
677 			return (B_FALSE);
678 		}
679 		/* defensive */
680 		if (iget_p->num_devs != iget_p->num_devs_ret) {
681 			kmem_free(iget_p, ipsz);
682 			return (B_FALSE);
683 		}
684 	}
685 
686 	if (mac_search_intrinfo(iget_p, dln)) {
687 		kmem_free(iget_p, ipsz);
688 		return (B_TRUE);
689 	}
690 	kmem_free(iget_p, ipsz);
691 	return (B_FALSE);
692 }
693 
694 /*
695  * Get the interrupts and check each one to see if it is for our device.
696  */
697 static int
698 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
699 {
700 	pcitool_intr_info_t	intr_info;
701 	int			err;
702 	int			ino;
703 	int			oldcpuid;
704 
705 	err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
706 	    FKIOCTL, kcred, NULL);
707 	if (err != 0)
708 		return (-1);
709 
710 	for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
711 		for (ino = 0; ino < intr_info.num_intr; ino++) {
712 			if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
713 				if (dln->cpu_id == cpuid)
714 					return (0);
715 				return (1);
716 			}
717 		}
718 	}
719 	return (-1);
720 }
721 
722 /*
723  * Obtain the nexus parent node info. for mdip.
724  */
725 static dev_info_t *
726 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
727 {
728 	struct dev_info		*tdip = (struct dev_info *)mdip;
729 	struct ddi_minor_data	*minordata;
730 	int			circ;
731 	dev_info_t		*pdip;
732 	char			pathname[MAXPATHLEN];
733 
734 	while (tdip != NULL) {
735 		/*
736 		 * The netboot code could call this function while walking the
737 		 * device tree so we need to use ndi_devi_tryenter() here to
738 		 * avoid deadlock.
739 		 */
740 		if (ndi_devi_tryenter((dev_info_t *)tdip, &circ) == 0)
741 			break;
742 
743 		for (minordata = tdip->devi_minor; minordata != NULL;
744 		    minordata = minordata->next) {
745 			if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
746 			    strlen(DDI_NT_INTRCTL)) == 0) {
747 				pdip = minordata->dip;
748 				(void) ddi_pathname(pdip, pathname);
749 				(void) snprintf(dln->nexus_path, MAXPATHLEN,
750 				    "/devices%s:intr", pathname);
751 				(void) ddi_pathname_minor(minordata, pathname);
752 				ndi_devi_exit((dev_info_t *)tdip, circ);
753 				return (pdip);
754 			}
755 		}
756 		ndi_devi_exit((dev_info_t *)tdip, circ);
757 		tdip = tdip->devi_parent;
758 	}
759 	return (NULL);
760 }
761 
762 /*
763  * For a primary MAC client, if the user has set a list or CPUs or
764  * we have obtained it implicitly, we try to retarget the interrupt
765  * for that device on one of the CPUs in the list.
766  * We assign the interrupt to the same CPU as the poll thread.
767  */
768 static boolean_t
769 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
770 {
771 	ldi_handle_t		lh = NULL;
772 	ldi_ident_t		li = NULL;
773 	int			err;
774 	int			ret;
775 	mac_dladm_intr_t	dln;
776 	dev_info_t		*dip;
777 	struct ddi_minor_data	*minordata;
778 
779 	dln.nexus_path[0] = '\0';
780 	dln.driver_path[0] = '\0';
781 
782 	minordata = ((struct dev_info *)mdip)->devi_minor;
783 	while (minordata != NULL) {
784 		if (minordata->type == DDM_MINOR)
785 			break;
786 		minordata = minordata->next;
787 	}
788 	if (minordata == NULL)
789 		return (B_FALSE);
790 
791 	(void) ddi_pathname_minor(minordata, dln.driver_path);
792 
793 	dip = mac_get_nexus_node(mdip, &dln);
794 	/* defensive */
795 	if (dip == NULL)
796 		return (B_FALSE);
797 
798 	err = ldi_ident_from_major(ddi_driver_major(dip), &li);
799 	if (err != 0)
800 		return (B_FALSE);
801 
802 	err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
803 	if (err != 0)
804 		return (B_FALSE);
805 
806 	ret = mac_validate_intr(lh, &dln, cpuid);
807 	if (ret < 0) {
808 		(void) ldi_close(lh, FREAD|FWRITE, kcred);
809 		return (B_FALSE);
810 	}
811 	/* cmn_note? */
812 	if (ret != 0)
813 		if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
814 		    != 0) {
815 			(void) ldi_close(lh, FREAD|FWRITE, kcred);
816 			return (B_FALSE);
817 		}
818 	(void) ldi_close(lh, FREAD|FWRITE, kcred);
819 	return (B_TRUE);
820 }
821 
822 void
823 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
824 {
825 	dev_info_t		*mdip = (dev_info_t *)arg;
826 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
827 	mac_resource_props_t	*mrp;
828 	mac_perim_handle_t	mph;
829 	flow_entry_t		*flent = mcip->mci_flent;
830 	mac_soft_ring_set_t	*rx_srs;
831 	mac_cpus_t		*srs_cpu;
832 
833 	if (!mac_check_interrupt_binding(mdip, cpuid))
834 		cpuid = -1;
835 	mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
836 	mrp = MCIP_RESOURCE_PROPS(mcip);
837 	mrp->mrp_rx_intr_cpu = cpuid;
838 	if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
839 		rx_srs = flent->fe_rx_srs[1];
840 		srs_cpu = &rx_srs->srs_cpu;
841 		srs_cpu->mc_rx_intr_cpu = cpuid;
842 	}
843 	mac_perim_exit(mph);
844 }
845 
846 int32_t
847 mac_client_intr_cpu(mac_client_handle_t mch)
848 {
849 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
850 	mac_cpus_t		*srs_cpu;
851 	mac_soft_ring_set_t	*rx_srs;
852 	flow_entry_t		*flent = mcip->mci_flent;
853 	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
854 	mac_ring_t		*ring;
855 	mac_intr_t		*mintr;
856 
857 	/*
858 	 * Check if we need to retarget the interrupt. We do this only
859 	 * for the primary MAC client. We do this if we have the only
860 	 * exclusive ring in the group.
861 	 */
862 	if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
863 		rx_srs = flent->fe_rx_srs[1];
864 		srs_cpu = &rx_srs->srs_cpu;
865 		ring = rx_srs->srs_ring;
866 		mintr = &ring->mr_info.mri_intr;
867 		/*
868 		 * If ddi_handle is present or the poll CPU is
869 		 * already bound to the interrupt CPU, return -1.
870 		 */
871 		if (mintr->mi_ddi_handle != NULL ||
872 		    ((mrp->mrp_ncpus != 0) &&
873 		    (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
874 			return (-1);
875 		}
876 		return (srs_cpu->mc_rx_pollid);
877 	}
878 	return (-1);
879 }
880 
881 void *
882 mac_get_devinfo(mac_handle_t mh)
883 {
884 	mac_impl_t	*mip = (mac_impl_t *)mh;
885 
886 	return ((void *)mip->mi_dip);
887 }
888 
/*
 * Fold the first 2, 4 or 6 (one MAC address) bytes found at x into a
 * single value by XOR-ing them together. x is evaluated once per byte.
 */
#define	PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
#define	PKT_HASH_4BYTES(x) (PKT_HASH_2BYTES(x) ^ (x)[2] ^ (x)[3])
#define	PKT_HASH_MAC(x) (PKT_HASH_4BYTES(x) ^ (x)[4] ^ (x)[5])
892 
893 uint64_t
894 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
895 {
896 	struct ether_header *ehp;
897 	uint64_t hash = 0;
898 	uint16_t sap;
899 	uint_t skip_len;
900 	uint8_t proto;
901 	boolean_t ip_fragmented;
902 
903 	/*
904 	 * We may want to have one of these per MAC type plugin in the
905 	 * future. For now supports only ethernet.
906 	 */
907 	if (media != DL_ETHER)
908 		return (0L);
909 
910 	/* for now we support only outbound packets */
911 	ASSERT(is_outbound);
912 	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
913 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
914 
915 	/* compute L2 hash */
916 
917 	ehp = (struct ether_header *)mp->b_rptr;
918 
919 	if ((policy & MAC_PKT_HASH_L2) != 0) {
920 		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
921 		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
922 		hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
923 		policy &= ~MAC_PKT_HASH_L2;
924 	}
925 
926 	if (policy == 0)
927 		goto done;
928 
929 	/* skip ethernet header */
930 
931 	sap = ntohs(ehp->ether_type);
932 	if (sap == ETHERTYPE_VLAN) {
933 		struct ether_vlan_header *evhp;
934 		mblk_t *newmp = NULL;
935 
936 		skip_len = sizeof (struct ether_vlan_header);
937 		if (MBLKL(mp) < skip_len) {
938 			/* the vlan tag is the payload, pull up first */
939 			newmp = msgpullup(mp, -1);
940 			if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
941 				goto done;
942 			}
943 			evhp = (struct ether_vlan_header *)newmp->b_rptr;
944 		} else {
945 			evhp = (struct ether_vlan_header *)mp->b_rptr;
946 		}
947 
948 		sap = ntohs(evhp->ether_type);
949 		freemsg(newmp);
950 	} else {
951 		skip_len = sizeof (struct ether_header);
952 	}
953 
954 	/* if ethernet header is in its own mblk, skip it */
955 	if (MBLKL(mp) <= skip_len) {
956 		skip_len -= MBLKL(mp);
957 		mp = mp->b_cont;
958 		if (mp == NULL)
959 			goto done;
960 	}
961 
962 	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
963 
964 	/* compute IP src/dst addresses hash and skip IPv{4,6} header */
965 
966 	switch (sap) {
967 	case ETHERTYPE_IP: {
968 		ipha_t *iphp;
969 
970 		/*
971 		 * If the header is not aligned or the header doesn't fit
972 		 * in the mblk, bail now. Note that this may cause packets
973 		 * reordering.
974 		 */
975 		iphp = (ipha_t *)(mp->b_rptr + skip_len);
976 		if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
977 		    !OK_32PTR((char *)iphp))
978 			goto done;
979 
980 		proto = iphp->ipha_protocol;
981 		skip_len += IPH_HDR_LENGTH(iphp);
982 
983 		/* Check if the packet is fragmented. */
984 		ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
985 		    IPH_OFFSET;
986 
987 		/*
988 		 * For fragmented packets, use addresses in addition to
989 		 * the frag_id to generate the hash inorder to get
990 		 * better distribution.
991 		 */
992 		if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
993 			uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
994 			uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
995 
996 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
997 			    PKT_HASH_4BYTES(ip_dst));
998 			policy &= ~MAC_PKT_HASH_L3;
999 		}
1000 
1001 		if (ip_fragmented) {
1002 			uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
1003 			hash ^= PKT_HASH_2BYTES(identp);
1004 			goto done;
1005 		}
1006 		break;
1007 	}
1008 	case ETHERTYPE_IPV6: {
1009 		ip6_t *ip6hp;
1010 		ip6_frag_t *frag = NULL;
1011 		uint16_t hdr_length;
1012 
1013 		/*
1014 		 * If the header is not aligned or the header doesn't fit
1015 		 * in the mblk, bail now. Note that this may cause packets
1016 		 * reordering.
1017 		 */
1018 
1019 		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
1020 		if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
1021 		    !OK_32PTR((char *)ip6hp))
1022 			goto done;
1023 
1024 		if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
1025 		    &proto, &frag))
1026 			goto done;
1027 		skip_len += hdr_length;
1028 
1029 		/*
1030 		 * For fragmented packets, use addresses in addition to
1031 		 * the frag_id to generate the hash inorder to get
1032 		 * better distribution.
1033 		 */
1034 		if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
1035 			uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
1036 			uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
1037 
1038 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
1039 			    PKT_HASH_4BYTES(ip_dst));
1040 			policy &= ~MAC_PKT_HASH_L3;
1041 		}
1042 
1043 		if (frag != NULL) {
1044 			uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
1045 			hash ^= PKT_HASH_4BYTES(identp);
1046 			goto done;
1047 		}
1048 		break;
1049 	}
1050 	default:
1051 		goto done;
1052 	}
1053 
1054 	if (policy == 0)
1055 		goto done;
1056 
1057 	/* if ip header is in its own mblk, skip it */
1058 	if (MBLKL(mp) <= skip_len) {
1059 		skip_len -= MBLKL(mp);
1060 		mp = mp->b_cont;
1061 		if (mp == NULL)
1062 			goto done;
1063 	}
1064 
1065 	/* parse ULP header */
1066 again:
1067 	switch (proto) {
1068 	case IPPROTO_TCP:
1069 	case IPPROTO_UDP:
1070 	case IPPROTO_ESP:
1071 	case IPPROTO_SCTP:
1072 		/*
1073 		 * These Internet Protocols are intentionally designed
1074 		 * for hashing from the git-go.  Port numbers are in the first
1075 		 * word for transports, SPI is first for ESP.
1076 		 */
1077 		if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
1078 			goto done;
1079 		hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
1080 		break;
1081 
1082 	case IPPROTO_AH: {
1083 		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
1084 		uint_t ah_length = AH_TOTAL_LEN(ah);
1085 
1086 		if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
1087 			goto done;
1088 
1089 		proto = ah->ah_nexthdr;
1090 		skip_len += ah_length;
1091 
1092 		/* if AH header is in its own mblk, skip it */
1093 		if (MBLKL(mp) <= skip_len) {
1094 			skip_len -= MBLKL(mp);
1095 			mp = mp->b_cont;
1096 			if (mp == NULL)
1097 				goto done;
1098 		}
1099 
1100 		goto again;
1101 	}
1102 	}
1103 
1104 done:
1105 	return (hash);
1106 }
1107