1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /*
26 * MAC Services Module - misc utilities
27 */
28
29 #include <sys/types.h>
30 #include <sys/mac.h>
31 #include <sys/mac_impl.h>
32 #include <sys/mac_client_priv.h>
33 #include <sys/mac_client_impl.h>
34 #include <sys/mac_soft_ring.h>
35 #include <sys/strsubr.h>
36 #include <sys/strsun.h>
37 #include <sys/vlan.h>
38 #include <sys/pattr.h>
39 #include <sys/pci_tools.h>
40 #include <inet/ip.h>
41 #include <inet/ip_impl.h>
42 #include <inet/ip6.h>
43 #include <sys/vtrace.h>
44 #include <sys/dlpi.h>
45 #include <sys/sunndi.h>
46 #include <inet/ipsec_impl.h>
47 #include <inet/sadb.h>
48 #include <inet/ipsecesp.h>
49 #include <inet/ipsecah.h>
50
51 /*
52 * Copy an mblk, preserving its hardware checksum flags.
53 */
54 static mblk_t *
mac_copymsg_cksum(mblk_t * mp)55 mac_copymsg_cksum(mblk_t *mp)
56 {
57 mblk_t *mp1;
58 uint32_t start, stuff, end, value, flags;
59
60 mp1 = copymsg(mp);
61 if (mp1 == NULL)
62 return (NULL);
63
64 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
65 (void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
66 flags, KM_NOSLEEP);
67
68 return (mp1);
69 }
70
71 /*
72 * Copy an mblk chain, presenting the hardware checksum flags of the
73 * individual mblks.
74 */
75 mblk_t *
mac_copymsgchain_cksum(mblk_t * mp)76 mac_copymsgchain_cksum(mblk_t *mp)
77 {
78 mblk_t *nmp = NULL;
79 mblk_t **nmpp = &nmp;
80
81 for (; mp != NULL; mp = mp->b_next) {
82 if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
83 freemsgchain(nmp);
84 return (NULL);
85 }
86
87 nmpp = &((*nmpp)->b_next);
88 }
89
90 return (nmp);
91 }
92
93 /*
94 * Process the specified mblk chain for proper handling of hardware
95 * checksum offload. This routine is invoked for loopback traffic
96 * between MAC clients.
97 * The function handles a NULL mblk chain passed as argument.
98 */
99 mblk_t *
mac_fix_cksum(mblk_t * mp_chain)100 mac_fix_cksum(mblk_t *mp_chain)
101 {
102 mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
103 uint32_t flags, start, stuff, end, value;
104
105 for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
106 uint16_t len;
107 uint32_t offset;
108 struct ether_header *ehp;
109 uint16_t sap;
110
111 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
112 &flags);
113 if (flags == 0)
114 continue;
115
116 /*
117 * Since the processing of checksum offload for loopback
118 * traffic requires modification of the packet contents,
119 * ensure sure that we are always modifying our own copy.
120 */
121 if (DB_REF(mp) > 1) {
122 mp1 = copymsg(mp);
123 if (mp1 == NULL)
124 continue;
125 mp1->b_next = mp->b_next;
126 mp->b_next = NULL;
127 freemsg(mp);
128 if (prev != NULL)
129 prev->b_next = mp1;
130 else
131 new_chain = mp1;
132 mp = mp1;
133 }
134
135 /*
136 * Ethernet, and optionally VLAN header.
137 */
138 /* LINTED: improper alignment cast */
139 ehp = (struct ether_header *)mp->b_rptr;
140 if (ntohs(ehp->ether_type) == VLAN_TPID) {
141 struct ether_vlan_header *evhp;
142
143 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
144 /* LINTED: improper alignment cast */
145 evhp = (struct ether_vlan_header *)mp->b_rptr;
146 sap = ntohs(evhp->ether_type);
147 offset = sizeof (struct ether_vlan_header);
148 } else {
149 sap = ntohs(ehp->ether_type);
150 offset = sizeof (struct ether_header);
151 }
152
153 if (MBLKL(mp) <= offset) {
154 offset -= MBLKL(mp);
155 if (mp->b_cont == NULL) {
156 /* corrupted packet, skip it */
157 if (prev != NULL)
158 prev->b_next = mp->b_next;
159 else
160 new_chain = mp->b_next;
161 mp1 = mp->b_next;
162 mp->b_next = NULL;
163 freemsg(mp);
164 mp = mp1;
165 continue;
166 }
167 mp = mp->b_cont;
168 }
169
170 if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
171 ipha_t *ipha = NULL;
172
173 /*
174 * In order to compute the full and header
175 * checksums, we need to find and parse
176 * the IP and/or ULP headers.
177 */
178
179 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
180
181 /*
182 * IP header.
183 */
184 if (sap != ETHERTYPE_IP)
185 continue;
186
187 ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
188 /* LINTED: improper alignment cast */
189 ipha = (ipha_t *)(mp->b_rptr + offset);
190
191 if (flags & HCK_FULLCKSUM) {
192 ipaddr_t src, dst;
193 uint32_t cksum;
194 uint16_t *up;
195 uint8_t proto;
196
197 /*
198 * Pointer to checksum field in ULP header.
199 */
200 proto = ipha->ipha_protocol;
201 ASSERT(ipha->ipha_version_and_hdr_length ==
202 IP_SIMPLE_HDR_VERSION);
203
204 switch (proto) {
205 case IPPROTO_TCP:
206 /* LINTED: improper alignment cast */
207 up = IPH_TCPH_CHECKSUMP(ipha,
208 IP_SIMPLE_HDR_LENGTH);
209 break;
210
211 case IPPROTO_UDP:
212 /* LINTED: improper alignment cast */
213 up = IPH_UDPH_CHECKSUMP(ipha,
214 IP_SIMPLE_HDR_LENGTH);
215 break;
216
217 default:
218 cmn_err(CE_WARN, "mac_fix_cksum: "
219 "unexpected protocol: %d", proto);
220 continue;
221 }
222
223 /*
224 * Pseudo-header checksum.
225 */
226 src = ipha->ipha_src;
227 dst = ipha->ipha_dst;
228 len = ntohs(ipha->ipha_length) -
229 IP_SIMPLE_HDR_LENGTH;
230
231 cksum = (dst >> 16) + (dst & 0xFFFF) +
232 (src >> 16) + (src & 0xFFFF);
233 cksum += htons(len);
234
235 /*
236 * The checksum value stored in the packet needs
237 * to be correct. Compute it here.
238 */
239 *up = 0;
240 cksum += (((proto) == IPPROTO_UDP) ?
241 IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
242 cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
243 offset, cksum);
244 *(up) = (uint16_t)(cksum ? cksum : ~cksum);
245
246 /*
247 * Flag the packet so that it appears
248 * that the checksum has already been
249 * verified by the hardware.
250 */
251 flags &= ~HCK_FULLCKSUM;
252 flags |= HCK_FULLCKSUM_OK;
253 value = 0;
254 }
255
256 if (flags & HCK_IPV4_HDRCKSUM) {
257 ASSERT(ipha != NULL);
258 ipha->ipha_hdr_checksum =
259 (uint16_t)ip_csum_hdr(ipha);
260 flags &= ~HCK_IPV4_HDRCKSUM;
261 flags |= HCK_IPV4_HDRCKSUM_OK;
262
263 }
264 }
265
266 if (flags & HCK_PARTIALCKSUM) {
267 uint16_t *up, partial, cksum;
268 uchar_t *ipp; /* ptr to beginning of IP header */
269
270 if (mp->b_cont != NULL) {
271 mblk_t *mp1;
272
273 mp1 = msgpullup(mp, offset + end);
274 if (mp1 == NULL)
275 continue;
276 mp1->b_next = mp->b_next;
277 mp->b_next = NULL;
278 freemsg(mp);
279 if (prev != NULL)
280 prev->b_next = mp1;
281 else
282 new_chain = mp1;
283 mp = mp1;
284 }
285
286 ipp = mp->b_rptr + offset;
287 /* LINTED: cast may result in improper alignment */
288 up = (uint16_t *)((uchar_t *)ipp + stuff);
289 partial = *up;
290 *up = 0;
291
292 cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
293 end - start, partial);
294 cksum = ~cksum;
295 *up = cksum ? cksum : ~cksum;
296
297 /*
298 * Since we already computed the whole checksum,
299 * indicate to the stack that it has already
300 * been verified by the hardware.
301 */
302 flags &= ~HCK_PARTIALCKSUM;
303 flags |= HCK_FULLCKSUM_OK;
304 value = 0;
305 }
306
307 (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
308 value, flags, KM_NOSLEEP);
309 }
310
311 return (new_chain);
312 }
313
314 /*
315 * Add VLAN tag to the specified mblk.
316 */
317 mblk_t *
mac_add_vlan_tag(mblk_t * mp,uint_t pri,uint16_t vid)318 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
319 {
320 mblk_t *hmp;
321 struct ether_vlan_header *evhp;
322 struct ether_header *ehp;
323 uint32_t start, stuff, end, value, flags;
324
325 ASSERT(pri != 0 || vid != 0);
326
327 /*
328 * Allocate an mblk for the new tagged ethernet header,
329 * and copy the MAC addresses and ethertype from the
330 * original header.
331 */
332
333 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
334 if (hmp == NULL) {
335 freemsg(mp);
336 return (NULL);
337 }
338
339 evhp = (struct ether_vlan_header *)hmp->b_rptr;
340 ehp = (struct ether_header *)mp->b_rptr;
341
342 bcopy(ehp, evhp, (ETHERADDRL * 2));
343 evhp->ether_type = ehp->ether_type;
344 evhp->ether_tpid = htons(ETHERTYPE_VLAN);
345
346 hmp->b_wptr += sizeof (struct ether_vlan_header);
347 mp->b_rptr += sizeof (struct ether_header);
348
349 /*
350 * Free the original message if it's now empty. Link the
351 * rest of messages to the header message.
352 */
353 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
354 (void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags,
355 KM_NOSLEEP);
356 if (MBLKL(mp) == 0) {
357 hmp->b_cont = mp->b_cont;
358 freeb(mp);
359 } else {
360 hmp->b_cont = mp;
361 }
362 ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
363
364 /*
365 * Initialize the new TCI (Tag Control Information).
366 */
367 evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
368
369 return (hmp);
370 }
371
372 /*
373 * Adds a VLAN tag with the specified VID and priority to each mblk of
374 * the specified chain.
375 */
376 mblk_t *
mac_add_vlan_tag_chain(mblk_t * mp_chain,uint_t pri,uint16_t vid)377 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
378 {
379 mblk_t *next_mp, **prev, *mp;
380
381 mp = mp_chain;
382 prev = &mp_chain;
383
384 while (mp != NULL) {
385 next_mp = mp->b_next;
386 mp->b_next = NULL;
387 if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
388 freemsgchain(next_mp);
389 break;
390 }
391 *prev = mp;
392 prev = &mp->b_next;
393 mp = mp->b_next = next_mp;
394 }
395
396 return (mp_chain);
397 }
398
399 /*
400 * Strip VLAN tag
401 */
402 mblk_t *
mac_strip_vlan_tag(mblk_t * mp)403 mac_strip_vlan_tag(mblk_t *mp)
404 {
405 mblk_t *newmp;
406 struct ether_vlan_header *evhp;
407
408 evhp = (struct ether_vlan_header *)mp->b_rptr;
409 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
410 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
411
412 if (DB_REF(mp) > 1) {
413 newmp = copymsg(mp);
414 if (newmp == NULL)
415 return (NULL);
416 freemsg(mp);
417 mp = newmp;
418 }
419
420 evhp = (struct ether_vlan_header *)mp->b_rptr;
421
422 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
423 mp->b_rptr += VLAN_TAGSZ;
424 }
425 return (mp);
426 }
427
428 /*
429 * Strip VLAN tag from each mblk of the chain.
430 */
431 mblk_t *
mac_strip_vlan_tag_chain(mblk_t * mp_chain)432 mac_strip_vlan_tag_chain(mblk_t *mp_chain)
433 {
434 mblk_t *mp, *next_mp, **prev;
435
436 mp = mp_chain;
437 prev = &mp_chain;
438
439 while (mp != NULL) {
440 next_mp = mp->b_next;
441 mp->b_next = NULL;
442 if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
443 freemsgchain(next_mp);
444 break;
445 }
446 *prev = mp;
447 prev = &mp->b_next;
448 mp = mp->b_next = next_mp;
449 }
450
451 return (mp_chain);
452 }
453
454 /*
455 * Default callback function. Used when the datapath is not yet initialized.
456 */
457 /* ARGSUSED */
458 void
mac_pkt_drop(void * arg,mac_resource_handle_t resource,mblk_t * mp,boolean_t loopback)459 mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp,
460 boolean_t loopback)
461 {
462 mblk_t *mp1 = mp;
463
464 while (mp1 != NULL) {
465 mp1->b_prev = NULL;
466 mp1->b_queue = NULL;
467 mp1 = mp1->b_next;
468 }
469 freemsgchain(mp);
470 }
471
472 /*
473 * Determines the IPv6 header length accounting for all the optional IPv6
474 * headers (hop-by-hop, destination, routing and fragment). The header length
475 * and next header value (a transport header) is captured.
476 *
477 * Returns B_FALSE if all the IP headers are not in the same mblk otherwise
478 * returns B_TRUE.
479 */
480 boolean_t
mac_ip_hdr_length_v6(ip6_t * ip6h,uint8_t * endptr,uint16_t * hdr_length,uint8_t * next_hdr,ip6_frag_t ** fragp)481 mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
482 uint8_t *next_hdr, ip6_frag_t **fragp)
483 {
484 uint16_t length;
485 uint_t ehdrlen;
486 uint8_t *whereptr;
487 uint8_t *nexthdrp;
488 ip6_dest_t *desthdr;
489 ip6_rthdr_t *rthdr;
490 ip6_frag_t *fraghdr;
491
492 if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
493 return (B_FALSE);
494 ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
495 length = IPV6_HDR_LEN;
496 whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
497
498 if (fragp != NULL)
499 *fragp = NULL;
500
501 nexthdrp = &ip6h->ip6_nxt;
502 while (whereptr < endptr) {
503 /* Is there enough left for len + nexthdr? */
504 if (whereptr + MIN_EHDR_LEN > endptr)
505 break;
506
507 switch (*nexthdrp) {
508 case IPPROTO_HOPOPTS:
509 case IPPROTO_DSTOPTS:
510 /* Assumes the headers are identical for hbh and dst */
511 desthdr = (ip6_dest_t *)whereptr;
512 ehdrlen = 8 * (desthdr->ip6d_len + 1);
513 if ((uchar_t *)desthdr + ehdrlen > endptr)
514 return (B_FALSE);
515 nexthdrp = &desthdr->ip6d_nxt;
516 break;
517 case IPPROTO_ROUTING:
518 rthdr = (ip6_rthdr_t *)whereptr;
519 ehdrlen = 8 * (rthdr->ip6r_len + 1);
520 if ((uchar_t *)rthdr + ehdrlen > endptr)
521 return (B_FALSE);
522 nexthdrp = &rthdr->ip6r_nxt;
523 break;
524 case IPPROTO_FRAGMENT:
525 fraghdr = (ip6_frag_t *)whereptr;
526 ehdrlen = sizeof (ip6_frag_t);
527 if ((uchar_t *)&fraghdr[1] > endptr)
528 return (B_FALSE);
529 nexthdrp = &fraghdr->ip6f_nxt;
530 if (fragp != NULL)
531 *fragp = fraghdr;
532 break;
533 case IPPROTO_NONE:
534 /* No next header means we're finished */
535 default:
536 *hdr_length = length;
537 *next_hdr = *nexthdrp;
538 return (B_TRUE);
539 }
540 length += ehdrlen;
541 whereptr += ehdrlen;
542 *hdr_length = length;
543 *next_hdr = *nexthdrp;
544 }
545 switch (*nexthdrp) {
546 case IPPROTO_HOPOPTS:
547 case IPPROTO_DSTOPTS:
548 case IPPROTO_ROUTING:
549 case IPPROTO_FRAGMENT:
550 /*
551 * If any know extension headers are still to be processed,
552 * the packet's malformed (or at least all the IP header(s) are
553 * not in the same mblk - and that should never happen.
554 */
555 return (B_FALSE);
556
557 default:
558 /*
559 * If we get here, we know that all of the IP headers were in
560 * the same mblk, even if the ULP header is in the next mblk.
561 */
562 *hdr_length = length;
563 *next_hdr = *nexthdrp;
564 return (B_TRUE);
565 }
566 }
567
/*
 * The following set of routines are there to take care of interrupt
 * re-targeting for legacy (fixed) interrupts. Some older versions
 * of the popular NICs like e1000g do not support MSI-X interrupts
 * and they reserve fixed interrupts for RX/TX rings. To re-target
 * these interrupts, PCITOOL ioctls need to be used.
 *
 * mac_dladm_intr_t is the scratch record shared by those routines: the
 * two paths identify the device and its interrupt-control nexus, and
 * ino/cpu_id are filled in once the device's interrupt has been found.
 */
typedef struct mac_dladm_intr {
	/* interrupt number reported for the matching device */
	int	ino;
	/* CPU id reported for that interrupt */
	int	cpu_id;
	/* path of the device's minor node, compared against pcitool data */
	char	driver_path[MAXPATHLEN];
	/* "/devices<nexus>:intr" path of the nexus to open for the ioctls */
	char	nexus_path[MAXPATHLEN];
} mac_dladm_intr_t;
581
582 /* Bind the interrupt to cpu_num */
583 static int
mac_set_intr(ldi_handle_t lh,processorid_t cpu_num,int oldcpuid,int ino)584 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
585 {
586 pcitool_intr_set_t iset;
587 int err;
588
589 iset.old_cpu = oldcpuid;
590 iset.ino = ino;
591 iset.cpu_id = cpu_num;
592 iset.user_version = PCITOOL_VERSION;
593 err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
594 kcred, NULL);
595
596 return (err);
597 }
598
599 /*
600 * Search interrupt information. iget is filled in with the info to search
601 */
602 static boolean_t
mac_search_intrinfo(pcitool_intr_get_t * iget_p,mac_dladm_intr_t * dln)603 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
604 {
605 int i;
606 char driver_path[2 * MAXPATHLEN];
607
608 for (i = 0; i < iget_p->num_devs; i++) {
609 (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
610 (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
611 ":%s%d", iget_p->dev[i].driver_name,
612 iget_p->dev[i].dev_inst);
613 /* Match the device path for the device path */
614 if (strcmp(driver_path, dln->driver_path) == 0) {
615 dln->ino = iget_p->ino;
616 dln->cpu_id = iget_p->cpu_id;
617 return (B_TRUE);
618 }
619 }
620 return (B_FALSE);
621 }
622
623 /*
624 * Get information about ino, i.e. if this is the interrupt for our
625 * device and where it is bound etc.
626 */
627 static boolean_t
mac_get_single_intr(ldi_handle_t lh,int oldcpuid,int ino,mac_dladm_intr_t * dln)628 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
629 mac_dladm_intr_t *dln)
630 {
631 pcitool_intr_get_t *iget_p;
632 int ipsz;
633 int nipsz;
634 int err;
635 uint8_t inum;
636
637 /*
638 * Check if SLEEP is OK, i.e if could come here in response to
639 * changing the fanout due to some callback from the driver, say
640 * link speed changes.
641 */
642 ipsz = PCITOOL_IGET_SIZE(0);
643 iget_p = kmem_zalloc(ipsz, KM_SLEEP);
644
645 iget_p->num_devs_ret = 0;
646 iget_p->user_version = PCITOOL_VERSION;
647 iget_p->cpu_id = oldcpuid;
648 iget_p->ino = ino;
649
650 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
651 FKIOCTL, kcred, NULL);
652 if (err != 0) {
653 kmem_free(iget_p, ipsz);
654 return (B_FALSE);
655 }
656 if (iget_p->num_devs == 0) {
657 kmem_free(iget_p, ipsz);
658 return (B_FALSE);
659 }
660 inum = iget_p->num_devs;
661 if (iget_p->num_devs_ret < iget_p->num_devs) {
662 /* Reallocate */
663 nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
664
665 kmem_free(iget_p, ipsz);
666 ipsz = nipsz;
667 iget_p = kmem_zalloc(ipsz, KM_SLEEP);
668
669 iget_p->num_devs_ret = inum;
670 iget_p->cpu_id = oldcpuid;
671 iget_p->ino = ino;
672 iget_p->user_version = PCITOOL_VERSION;
673 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
674 FKIOCTL, kcred, NULL);
675 if (err != 0) {
676 kmem_free(iget_p, ipsz);
677 return (B_FALSE);
678 }
679 /* defensive */
680 if (iget_p->num_devs != iget_p->num_devs_ret) {
681 kmem_free(iget_p, ipsz);
682 return (B_FALSE);
683 }
684 }
685
686 if (mac_search_intrinfo(iget_p, dln)) {
687 kmem_free(iget_p, ipsz);
688 return (B_TRUE);
689 }
690 kmem_free(iget_p, ipsz);
691 return (B_FALSE);
692 }
693
694 /*
695 * Get the interrupts and check each one to see if it is for our device.
696 */
697 static int
mac_validate_intr(ldi_handle_t lh,mac_dladm_intr_t * dln,processorid_t cpuid)698 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
699 {
700 pcitool_intr_info_t intr_info;
701 int err;
702 int ino;
703 int oldcpuid;
704
705 err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
706 FKIOCTL, kcred, NULL);
707 if (err != 0)
708 return (-1);
709
710 for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
711 for (ino = 0; ino < intr_info.num_intr; ino++) {
712 if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
713 if (dln->cpu_id == cpuid)
714 return (0);
715 return (1);
716 }
717 }
718 }
719 return (-1);
720 }
721
722 /*
723 * Obtain the nexus parent node info. for mdip.
724 */
725 static dev_info_t *
mac_get_nexus_node(dev_info_t * mdip,mac_dladm_intr_t * dln)726 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
727 {
728 struct dev_info *tdip = (struct dev_info *)mdip;
729 struct ddi_minor_data *minordata;
730 int circ;
731 dev_info_t *pdip;
732 char pathname[MAXPATHLEN];
733
734 while (tdip != NULL) {
735 /*
736 * The netboot code could call this function while walking the
737 * device tree so we need to use ndi_devi_tryenter() here to
738 * avoid deadlock.
739 */
740 if (ndi_devi_tryenter((dev_info_t *)tdip, &circ) == 0)
741 break;
742
743 for (minordata = tdip->devi_minor; minordata != NULL;
744 minordata = minordata->next) {
745 if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
746 strlen(DDI_NT_INTRCTL)) == 0) {
747 pdip = minordata->dip;
748 (void) ddi_pathname(pdip, pathname);
749 (void) snprintf(dln->nexus_path, MAXPATHLEN,
750 "/devices%s:intr", pathname);
751 (void) ddi_pathname_minor(minordata, pathname);
752 ndi_devi_exit((dev_info_t *)tdip, circ);
753 return (pdip);
754 }
755 }
756 ndi_devi_exit((dev_info_t *)tdip, circ);
757 tdip = tdip->devi_parent;
758 }
759 return (NULL);
760 }
761
762 /*
763 * For a primary MAC client, if the user has set a list or CPUs or
764 * we have obtained it implicitly, we try to retarget the interrupt
765 * for that device on one of the CPUs in the list.
766 * We assign the interrupt to the same CPU as the poll thread.
767 */
768 static boolean_t
mac_check_interrupt_binding(dev_info_t * mdip,int32_t cpuid)769 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
770 {
771 ldi_handle_t lh = NULL;
772 ldi_ident_t li = NULL;
773 int err;
774 int ret;
775 mac_dladm_intr_t dln;
776 dev_info_t *dip;
777 struct ddi_minor_data *minordata;
778
779 dln.nexus_path[0] = '\0';
780 dln.driver_path[0] = '\0';
781
782 minordata = ((struct dev_info *)mdip)->devi_minor;
783 while (minordata != NULL) {
784 if (minordata->type == DDM_MINOR)
785 break;
786 minordata = minordata->next;
787 }
788 if (minordata == NULL)
789 return (B_FALSE);
790
791 (void) ddi_pathname_minor(minordata, dln.driver_path);
792
793 dip = mac_get_nexus_node(mdip, &dln);
794 /* defensive */
795 if (dip == NULL)
796 return (B_FALSE);
797
798 err = ldi_ident_from_major(ddi_driver_major(dip), &li);
799 if (err != 0)
800 return (B_FALSE);
801
802 err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
803 if (err != 0)
804 return (B_FALSE);
805
806 ret = mac_validate_intr(lh, &dln, cpuid);
807 if (ret < 0) {
808 (void) ldi_close(lh, FREAD|FWRITE, kcred);
809 return (B_FALSE);
810 }
811 /* cmn_note? */
812 if (ret != 0)
813 if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
814 != 0) {
815 (void) ldi_close(lh, FREAD|FWRITE, kcred);
816 return (B_FALSE);
817 }
818 (void) ldi_close(lh, FREAD|FWRITE, kcred);
819 return (B_TRUE);
820 }
821
822 void
mac_client_set_intr_cpu(void * arg,mac_client_handle_t mch,int32_t cpuid)823 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
824 {
825 dev_info_t *mdip = (dev_info_t *)arg;
826 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
827 mac_resource_props_t *mrp;
828 mac_perim_handle_t mph;
829 flow_entry_t *flent = mcip->mci_flent;
830 mac_soft_ring_set_t *rx_srs;
831 mac_cpus_t *srs_cpu;
832
833 if (!mac_check_interrupt_binding(mdip, cpuid))
834 cpuid = -1;
835 mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
836 mrp = MCIP_RESOURCE_PROPS(mcip);
837 mrp->mrp_rx_intr_cpu = cpuid;
838 if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
839 rx_srs = flent->fe_rx_srs[1];
840 srs_cpu = &rx_srs->srs_cpu;
841 srs_cpu->mc_rx_intr_cpu = cpuid;
842 }
843 mac_perim_exit(mph);
844 }
845
846 int32_t
mac_client_intr_cpu(mac_client_handle_t mch)847 mac_client_intr_cpu(mac_client_handle_t mch)
848 {
849 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
850 mac_cpus_t *srs_cpu;
851 mac_soft_ring_set_t *rx_srs;
852 flow_entry_t *flent = mcip->mci_flent;
853 mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
854 mac_ring_t *ring;
855 mac_intr_t *mintr;
856
857 /*
858 * Check if we need to retarget the interrupt. We do this only
859 * for the primary MAC client. We do this if we have the only
860 * exclusive ring in the group.
861 */
862 if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
863 rx_srs = flent->fe_rx_srs[1];
864 srs_cpu = &rx_srs->srs_cpu;
865 ring = rx_srs->srs_ring;
866 mintr = &ring->mr_info.mri_intr;
867 /*
868 * If ddi_handle is present or the poll CPU is
869 * already bound to the interrupt CPU, return -1.
870 */
871 if (mintr->mi_ddi_handle != NULL ||
872 ((mrp->mrp_ncpus != 0) &&
873 (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
874 return (-1);
875 }
876 return (srs_cpu->mc_rx_pollid);
877 }
878 return (-1);
879 }
880
881 void *
mac_get_devinfo(mac_handle_t mh)882 mac_get_devinfo(mac_handle_t mh)
883 {
884 mac_impl_t *mip = (mac_impl_t *)mh;
885
886 return ((void *)mip->mi_dip);
887 }
888
889 #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
890 #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
891 #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
892
893 uint64_t
mac_pkt_hash(uint_t media,mblk_t * mp,uint8_t policy,boolean_t is_outbound)894 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
895 {
896 struct ether_header *ehp;
897 uint64_t hash = 0;
898 uint16_t sap;
899 uint_t skip_len;
900 uint8_t proto;
901 boolean_t ip_fragmented;
902
903 /*
904 * We may want to have one of these per MAC type plugin in the
905 * future. For now supports only ethernet.
906 */
907 if (media != DL_ETHER)
908 return (0L);
909
910 /* for now we support only outbound packets */
911 ASSERT(is_outbound);
912 ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
913 ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
914
915 /* compute L2 hash */
916
917 ehp = (struct ether_header *)mp->b_rptr;
918
919 if ((policy & MAC_PKT_HASH_L2) != 0) {
920 uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
921 uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
922 hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
923 policy &= ~MAC_PKT_HASH_L2;
924 }
925
926 if (policy == 0)
927 goto done;
928
929 /* skip ethernet header */
930
931 sap = ntohs(ehp->ether_type);
932 if (sap == ETHERTYPE_VLAN) {
933 struct ether_vlan_header *evhp;
934 mblk_t *newmp = NULL;
935
936 skip_len = sizeof (struct ether_vlan_header);
937 if (MBLKL(mp) < skip_len) {
938 /* the vlan tag is the payload, pull up first */
939 newmp = msgpullup(mp, -1);
940 if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
941 goto done;
942 }
943 evhp = (struct ether_vlan_header *)newmp->b_rptr;
944 } else {
945 evhp = (struct ether_vlan_header *)mp->b_rptr;
946 }
947
948 sap = ntohs(evhp->ether_type);
949 freemsg(newmp);
950 } else {
951 skip_len = sizeof (struct ether_header);
952 }
953
954 /* if ethernet header is in its own mblk, skip it */
955 if (MBLKL(mp) <= skip_len) {
956 skip_len -= MBLKL(mp);
957 mp = mp->b_cont;
958 if (mp == NULL)
959 goto done;
960 }
961
962 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
963
964 /* compute IP src/dst addresses hash and skip IPv{4,6} header */
965
966 switch (sap) {
967 case ETHERTYPE_IP: {
968 ipha_t *iphp;
969
970 /*
971 * If the header is not aligned or the header doesn't fit
972 * in the mblk, bail now. Note that this may cause packets
973 * reordering.
974 */
975 iphp = (ipha_t *)(mp->b_rptr + skip_len);
976 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
977 !OK_32PTR((char *)iphp))
978 goto done;
979
980 proto = iphp->ipha_protocol;
981 skip_len += IPH_HDR_LENGTH(iphp);
982
983 /* Check if the packet is fragmented. */
984 ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
985 IPH_OFFSET;
986
987 /*
988 * For fragmented packets, use addresses in addition to
989 * the frag_id to generate the hash inorder to get
990 * better distribution.
991 */
992 if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
993 uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
994 uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
995
996 hash ^= (PKT_HASH_4BYTES(ip_src) ^
997 PKT_HASH_4BYTES(ip_dst));
998 policy &= ~MAC_PKT_HASH_L3;
999 }
1000
1001 if (ip_fragmented) {
1002 uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
1003 hash ^= PKT_HASH_2BYTES(identp);
1004 goto done;
1005 }
1006 break;
1007 }
1008 case ETHERTYPE_IPV6: {
1009 ip6_t *ip6hp;
1010 ip6_frag_t *frag = NULL;
1011 uint16_t hdr_length;
1012
1013 /*
1014 * If the header is not aligned or the header doesn't fit
1015 * in the mblk, bail now. Note that this may cause packets
1016 * reordering.
1017 */
1018
1019 ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
1020 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
1021 !OK_32PTR((char *)ip6hp))
1022 goto done;
1023
1024 if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
1025 &proto, &frag))
1026 goto done;
1027 skip_len += hdr_length;
1028
1029 /*
1030 * For fragmented packets, use addresses in addition to
1031 * the frag_id to generate the hash inorder to get
1032 * better distribution.
1033 */
1034 if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
1035 uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
1036 uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
1037
1038 hash ^= (PKT_HASH_4BYTES(ip_src) ^
1039 PKT_HASH_4BYTES(ip_dst));
1040 policy &= ~MAC_PKT_HASH_L3;
1041 }
1042
1043 if (frag != NULL) {
1044 uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
1045 hash ^= PKT_HASH_4BYTES(identp);
1046 goto done;
1047 }
1048 break;
1049 }
1050 default:
1051 goto done;
1052 }
1053
1054 if (policy == 0)
1055 goto done;
1056
1057 /* if ip header is in its own mblk, skip it */
1058 if (MBLKL(mp) <= skip_len) {
1059 skip_len -= MBLKL(mp);
1060 mp = mp->b_cont;
1061 if (mp == NULL)
1062 goto done;
1063 }
1064
1065 /* parse ULP header */
1066 again:
1067 switch (proto) {
1068 case IPPROTO_TCP:
1069 case IPPROTO_UDP:
1070 case IPPROTO_ESP:
1071 case IPPROTO_SCTP:
1072 /*
1073 * These Internet Protocols are intentionally designed
1074 * for hashing from the git-go. Port numbers are in the first
1075 * word for transports, SPI is first for ESP.
1076 */
1077 if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
1078 goto done;
1079 hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
1080 break;
1081
1082 case IPPROTO_AH: {
1083 ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
1084 uint_t ah_length = AH_TOTAL_LEN(ah);
1085
1086 if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
1087 goto done;
1088
1089 proto = ah->ah_nexthdr;
1090 skip_len += ah_length;
1091
1092 /* if AH header is in its own mblk, skip it */
1093 if (MBLKL(mp) <= skip_len) {
1094 skip_len -= MBLKL(mp);
1095 mp = mp->b_cont;
1096 if (mp == NULL)
1097 goto done;
1098 }
1099
1100 goto again;
1101 }
1102 }
1103
1104 done:
1105 return (hash);
1106 }
1107