/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ /* * MAC Services Module - misc utilities */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Copy an mblk, preserving its hardware checksum flags. */ static mblk_t * mac_copymsg_cksum(mblk_t *mp) { mblk_t *mp1; uint32_t start, stuff, end, value, flags; mp1 = copymsg(mp); if (mp1 == NULL) return (NULL); hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); (void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value, flags, KM_NOSLEEP); return (mp1); } /* * Copy an mblk chain, presenting the hardware checksum flags of the * individual mblks. */ mblk_t * mac_copymsgchain_cksum(mblk_t *mp) { mblk_t *nmp = NULL; mblk_t **nmpp = &nmp; for (; mp != NULL; mp = mp->b_next) { if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) { freemsgchain(nmp); return (NULL); } nmpp = &((*nmpp)->b_next); } return (nmp); } /* * Process the specified mblk chain for proper handling of hardware * checksum offload. 
This routine is invoked for loopback traffic * between MAC clients. * The function handles a NULL mblk chain passed as argument. */ mblk_t * mac_fix_cksum(mblk_t *mp_chain) { mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1; uint32_t flags, start, stuff, end, value; for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) { uint16_t len; uint32_t offset; struct ether_header *ehp; uint16_t sap; hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); if (flags == 0) continue; /* * Since the processing of checksum offload for loopback * traffic requires modification of the packet contents, * ensure sure that we are always modifying our own copy. */ if (DB_REF(mp) > 1) { mp1 = copymsg(mp); if (mp1 == NULL) continue; mp1->b_next = mp->b_next; mp->b_next = NULL; freemsg(mp); if (prev != NULL) prev->b_next = mp1; else new_chain = mp1; mp = mp1; } /* * Ethernet, and optionally VLAN header. */ /* LINTED: improper alignment cast */ ehp = (struct ether_header *)mp->b_rptr; if (ntohs(ehp->ether_type) == VLAN_TPID) { struct ether_vlan_header *evhp; ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); /* LINTED: improper alignment cast */ evhp = (struct ether_vlan_header *)mp->b_rptr; sap = ntohs(evhp->ether_type); offset = sizeof (struct ether_vlan_header); } else { sap = ntohs(ehp->ether_type); offset = sizeof (struct ether_header); } if (MBLKL(mp) <= offset) { offset -= MBLKL(mp); if (mp->b_cont == NULL) { /* corrupted packet, skip it */ if (prev != NULL) prev->b_next = mp->b_next; else new_chain = mp->b_next; mp1 = mp->b_next; mp->b_next = NULL; freemsg(mp); mp = mp1; continue; } mp = mp->b_cont; } if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) { ipha_t *ipha = NULL; /* * In order to compute the full and header * checksums, we need to find and parse * the IP and/or ULP headers. */ sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; /* * IP header. 
*/ if (sap != ETHERTYPE_IP) continue; ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t)); /* LINTED: improper alignment cast */ ipha = (ipha_t *)(mp->b_rptr + offset); if (flags & HCK_FULLCKSUM) { ipaddr_t src, dst; uint32_t cksum; uint16_t *up; uint8_t proto; /* * Pointer to checksum field in ULP header. */ proto = ipha->ipha_protocol; ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION); switch (proto) { case IPPROTO_TCP: /* LINTED: improper alignment cast */ up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); break; case IPPROTO_UDP: /* LINTED: improper alignment cast */ up = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); break; default: cmn_err(CE_WARN, "mac_fix_cksum: " "unexpected protocol: %d", proto); continue; } /* * Pseudo-header checksum. */ src = ipha->ipha_src; dst = ipha->ipha_dst; len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH; cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); cksum += htons(len); /* * The checksum value stored in the packet needs * to be correct. Compute it here. */ *up = 0; cksum += (((proto) == IPPROTO_UDP) ? IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP); cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH + offset, cksum); *(up) = (uint16_t)(cksum ? cksum : ~cksum); /* * Flag the packet so that it appears * that the checksum has already been * verified by the hardware. 
*/ flags &= ~HCK_FULLCKSUM; flags |= HCK_FULLCKSUM_OK; value = 0; } if (flags & HCK_IPV4_HDRCKSUM) { ASSERT(ipha != NULL); ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha); flags &= ~HCK_IPV4_HDRCKSUM; flags |= HCK_IPV4_HDRCKSUM_OK; } } if (flags & HCK_PARTIALCKSUM) { uint16_t *up, partial, cksum; uchar_t *ipp; /* ptr to beginning of IP header */ if (mp->b_cont != NULL) { mblk_t *mp1; mp1 = msgpullup(mp, offset + end); if (mp1 == NULL) continue; mp1->b_next = mp->b_next; mp->b_next = NULL; freemsg(mp); if (prev != NULL) prev->b_next = mp1; else new_chain = mp1; mp = mp1; } ipp = mp->b_rptr + offset; /* LINTED: cast may result in improper alignment */ up = (uint16_t *)((uchar_t *)ipp + stuff); partial = *up; *up = 0; cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start, end - start, partial); cksum = ~cksum; *up = cksum ? cksum : ~cksum; /* * Since we already computed the whole checksum, * indicate to the stack that it has already * been verified by the hardware. */ flags &= ~HCK_PARTIALCKSUM; flags |= HCK_FULLCKSUM_OK; value = 0; } (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end, value, flags, KM_NOSLEEP); } return (new_chain); } /* * Add VLAN tag to the specified mblk. */ mblk_t * mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) { mblk_t *hmp; struct ether_vlan_header *evhp; struct ether_header *ehp; uint32_t start, stuff, end, value, flags; ASSERT(pri != 0 || vid != 0); /* * Allocate an mblk for the new tagged ethernet header, * and copy the MAC addresses and ethertype from the * original header. */ hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED); if (hmp == NULL) { freemsg(mp); return (NULL); } evhp = (struct ether_vlan_header *)hmp->b_rptr; ehp = (struct ether_header *)mp->b_rptr; bcopy(ehp, evhp, (ETHERADDRL * 2)); evhp->ether_type = ehp->ether_type; evhp->ether_tpid = htons(ETHERTYPE_VLAN); hmp->b_wptr += sizeof (struct ether_vlan_header); mp->b_rptr += sizeof (struct ether_header); /* * Free the original message if it's now empty. 
Link the * rest of messages to the header message. */ hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); (void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags, KM_NOSLEEP); if (MBLKL(mp) == 0) { hmp->b_cont = mp->b_cont; freeb(mp); } else { hmp->b_cont = mp; } ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header)); /* * Initialize the new TCI (Tag Control Information). */ evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid)); return (hmp); } /* * Adds a VLAN tag with the specified VID and priority to each mblk of * the specified chain. */ mblk_t * mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid) { mblk_t *next_mp, **prev, *mp; mp = mp_chain; prev = &mp_chain; while (mp != NULL) { next_mp = mp->b_next; mp->b_next = NULL; if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) { freemsgchain(next_mp); break; } *prev = mp; prev = &mp->b_next; mp = mp->b_next = next_mp; } return (mp_chain); } /* * Strip VLAN tag */ mblk_t * mac_strip_vlan_tag(mblk_t *mp) { mblk_t *newmp; struct ether_vlan_header *evhp; evhp = (struct ether_vlan_header *)mp->b_rptr; if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) { ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); if (DB_REF(mp) > 1) { newmp = copymsg(mp); if (newmp == NULL) return (NULL); freemsg(mp); mp = newmp; } evhp = (struct ether_vlan_header *)mp->b_rptr; ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL); mp->b_rptr += VLAN_TAGSZ; } return (mp); } /* * Strip VLAN tag from each mblk of the chain. */ mblk_t * mac_strip_vlan_tag_chain(mblk_t *mp_chain) { mblk_t *mp, *next_mp, **prev; mp = mp_chain; prev = &mp_chain; while (mp != NULL) { next_mp = mp->b_next; mp->b_next = NULL; if ((mp = mac_strip_vlan_tag(mp)) == NULL) { freemsgchain(next_mp); break; } *prev = mp; prev = &mp->b_next; mp = mp->b_next = next_mp; } return (mp_chain); } /* * Default callback function. Used when the datapath is not yet initialized. 
*/ /* ARGSUSED */ void mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp, boolean_t loopback) { mblk_t *mp1 = mp; while (mp1 != NULL) { mp1->b_prev = NULL; mp1->b_queue = NULL; mp1 = mp1->b_next; } freemsgchain(mp); } /* * Determines the IPv6 header length accounting for all the optional IPv6 * headers (hop-by-hop, destination, routing and fragment). The header length * and next header value (a transport header) is captured. * * Returns B_FALSE if all the IP headers are not in the same mblk otherwise * returns B_TRUE. */ boolean_t mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length, uint8_t *next_hdr, ip6_frag_t **fragp) { uint16_t length; uint_t ehdrlen; uint8_t *whereptr; uint8_t *nexthdrp; ip6_dest_t *desthdr; ip6_rthdr_t *rthdr; ip6_frag_t *fraghdr; if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr) return (B_FALSE); ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); length = IPV6_HDR_LEN; whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ if (fragp != NULL) *fragp = NULL; nexthdrp = &ip6h->ip6_nxt; while (whereptr < endptr) { /* Is there enough left for len + nexthdr? 
*/ if (whereptr + MIN_EHDR_LEN > endptr) break; switch (*nexthdrp) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: /* Assumes the headers are identical for hbh and dst */ desthdr = (ip6_dest_t *)whereptr; ehdrlen = 8 * (desthdr->ip6d_len + 1); if ((uchar_t *)desthdr + ehdrlen > endptr) return (B_FALSE); nexthdrp = &desthdr->ip6d_nxt; break; case IPPROTO_ROUTING: rthdr = (ip6_rthdr_t *)whereptr; ehdrlen = 8 * (rthdr->ip6r_len + 1); if ((uchar_t *)rthdr + ehdrlen > endptr) return (B_FALSE); nexthdrp = &rthdr->ip6r_nxt; break; case IPPROTO_FRAGMENT: fraghdr = (ip6_frag_t *)whereptr; ehdrlen = sizeof (ip6_frag_t); if ((uchar_t *)&fraghdr[1] > endptr) return (B_FALSE); nexthdrp = &fraghdr->ip6f_nxt; if (fragp != NULL) *fragp = fraghdr; break; case IPPROTO_NONE: /* No next header means we're finished */ default: *hdr_length = length; *next_hdr = *nexthdrp; return (B_TRUE); } length += ehdrlen; whereptr += ehdrlen; *hdr_length = length; *next_hdr = *nexthdrp; } switch (*nexthdrp) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_FRAGMENT: /* * If any know extension headers are still to be processed, * the packet's malformed (or at least all the IP header(s) are * not in the same mblk - and that should never happen. */ return (B_FALSE); default: /* * If we get here, we know that all of the IP headers were in * the same mblk, even if the ULP header is in the next mblk. */ *hdr_length = length; *next_hdr = *nexthdrp; return (B_TRUE); } } /* * The following set of routines are there to take care of interrupt * re-targeting for legacy (fixed) interrupts. Some older versions * of the popular NICs like e1000g do not support MSI-X interrupts * and they reserve fixed interrupts for RX/TX rings. To re-target * these interrupts, PCITOOL ioctls need to be used. 
*/ typedef struct mac_dladm_intr { int ino; int cpu_id; char driver_path[MAXPATHLEN]; char nexus_path[MAXPATHLEN]; } mac_dladm_intr_t; /* Bind the interrupt to cpu_num */ static int mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino) { pcitool_intr_set_t iset; int err; iset.old_cpu = oldcpuid; iset.ino = ino; iset.cpu_id = cpu_num; iset.user_version = PCITOOL_VERSION; err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL, kcred, NULL); return (err); } /* * Search interrupt information. iget is filled in with the info to search */ static boolean_t mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln) { int i; char driver_path[2 * MAXPATHLEN]; for (i = 0; i < iget_p->num_devs; i++) { (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN); (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN, ":%s%d", iget_p->dev[i].driver_name, iget_p->dev[i].dev_inst); /* Match the device path for the device path */ if (strcmp(driver_path, dln->driver_path) == 0) { dln->ino = iget_p->ino; dln->cpu_id = iget_p->cpu_id; return (B_TRUE); } } return (B_FALSE); } /* * Get information about ino, i.e. if this is the interrupt for our * device and where it is bound etc. */ static boolean_t mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino, mac_dladm_intr_t *dln) { pcitool_intr_get_t *iget_p; int ipsz; int nipsz; int err; uint8_t inum; /* * Check if SLEEP is OK, i.e if could come here in response to * changing the fanout due to some callback from the driver, say * link speed changes. 
*/ ipsz = PCITOOL_IGET_SIZE(0); iget_p = kmem_zalloc(ipsz, KM_SLEEP); iget_p->num_devs_ret = 0; iget_p->user_version = PCITOOL_VERSION; iget_p->cpu_id = oldcpuid; iget_p->ino = ino; err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, FKIOCTL, kcred, NULL); if (err != 0) { kmem_free(iget_p, ipsz); return (B_FALSE); } if (iget_p->num_devs == 0) { kmem_free(iget_p, ipsz); return (B_FALSE); } inum = iget_p->num_devs; if (iget_p->num_devs_ret < iget_p->num_devs) { /* Reallocate */ nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs); kmem_free(iget_p, ipsz); ipsz = nipsz; iget_p = kmem_zalloc(ipsz, KM_SLEEP); iget_p->num_devs_ret = inum; iget_p->cpu_id = oldcpuid; iget_p->ino = ino; iget_p->user_version = PCITOOL_VERSION; err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, FKIOCTL, kcred, NULL); if (err != 0) { kmem_free(iget_p, ipsz); return (B_FALSE); } /* defensive */ if (iget_p->num_devs != iget_p->num_devs_ret) { kmem_free(iget_p, ipsz); return (B_FALSE); } } if (mac_search_intrinfo(iget_p, dln)) { kmem_free(iget_p, ipsz); return (B_TRUE); } kmem_free(iget_p, ipsz); return (B_FALSE); } /* * Get the interrupts and check each one to see if it is for our device. */ static int mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid) { pcitool_intr_info_t intr_info; int err; int ino; int oldcpuid; err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info, FKIOCTL, kcred, NULL); if (err != 0) return (-1); for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) { for (ino = 0; ino < intr_info.num_intr; ino++) { if (mac_get_single_intr(lh, oldcpuid, ino, dln)) { if (dln->cpu_id == cpuid) return (0); return (1); } } } return (-1); } /* * Obtain the nexus parent node info. for mdip. 
*/
static dev_info_t *
mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
{
	struct dev_info *node;
	struct ddi_minor_data *md;
	dev_info_t *nexus = NULL;
	boolean_t found;
	int circ;
	char pathname[MAXPATHLEN];

	/*
	 * Climb from mdip towards the root, looking on each node for a
	 * minor node whose type starts with DDI_NT_INTRCTL. The dip of the
	 * first such minor node is returned and its "/devices<path>:intr"
	 * name is stored in dln->nexus_path. NULL is returned if no such
	 * node is found or if a node on the way cannot be entered.
	 */
	for (node = (struct dev_info *)mdip; node != NULL;
	    node = node->devi_parent) {
		/*
		 * The netboot code could call this function while walking the
		 * device tree so we need to use ndi_devi_tryenter() here to
		 * avoid deadlock.
		 */
		if (ndi_devi_tryenter((dev_info_t *)node, &circ) == 0)
			break;

		found = B_FALSE;
		for (md = node->devi_minor; md != NULL; md = md->next) {
			if (strncmp(md->ddm_node_type, DDI_NT_INTRCTL,
			    strlen(DDI_NT_INTRCTL)) != 0)
				continue;

			nexus = md->dip;
			(void) ddi_pathname(nexus, pathname);
			(void) snprintf(dln->nexus_path, MAXPATHLEN,
			    "/devices%s:intr", pathname);
			/*
			 * NOTE(review): the path written by this call is not
			 * read afterwards in this function - confirm whether
			 * the call is still needed.
			 */
			(void) ddi_pathname_minor(md, pathname);
			found = B_TRUE;
			break;
		}

		ndi_devi_exit((dev_info_t *)node, circ);
		if (found)
			return (nexus);
	}

	return (NULL);
}

/*
 * For a primary MAC client, if the user has set a list or CPUs or
 * we have obtained it implicitly, we try to retarget the interrupt
 * for that device on one of the CPUs in the list.
 * We assign the interrupt to the same CPU as the poll thread.
*/ static boolean_t mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid) { ldi_handle_t lh = NULL; ldi_ident_t li = NULL; int err; int ret; mac_dladm_intr_t dln; dev_info_t *dip; struct ddi_minor_data *minordata; dln.nexus_path[0] = '\0'; dln.driver_path[0] = '\0'; minordata = ((struct dev_info *)mdip)->devi_minor; while (minordata != NULL) { if (minordata->type == DDM_MINOR) break; minordata = minordata->next; } if (minordata == NULL) return (B_FALSE); (void) ddi_pathname_minor(minordata, dln.driver_path); dip = mac_get_nexus_node(mdip, &dln); /* defensive */ if (dip == NULL) return (B_FALSE); err = ldi_ident_from_major(ddi_driver_major(dip), &li); if (err != 0) return (B_FALSE); err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li); if (err != 0) return (B_FALSE); ret = mac_validate_intr(lh, &dln, cpuid); if (ret < 0) { (void) ldi_close(lh, FREAD|FWRITE, kcred); return (B_FALSE); } /* cmn_note? */ if (ret != 0) if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino))) != 0) { (void) ldi_close(lh, FREAD|FWRITE, kcred); return (B_FALSE); } (void) ldi_close(lh, FREAD|FWRITE, kcred); return (B_TRUE); } void mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid) { dev_info_t *mdip = (dev_info_t *)arg; mac_client_impl_t *mcip = (mac_client_impl_t *)mch; mac_resource_props_t *mrp; mac_perim_handle_t mph; flow_entry_t *flent = mcip->mci_flent; mac_soft_ring_set_t *rx_srs; mac_cpus_t *srs_cpu; if (!mac_check_interrupt_binding(mdip, cpuid)) cpuid = -1; mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph); mrp = MCIP_RESOURCE_PROPS(mcip); mrp->mrp_rx_intr_cpu = cpuid; if (flent != NULL && flent->fe_rx_srs_cnt == 2) { rx_srs = flent->fe_rx_srs[1]; srs_cpu = &rx_srs->srs_cpu; srs_cpu->mc_rx_intr_cpu = cpuid; } mac_perim_exit(mph); } int32_t mac_client_intr_cpu(mac_client_handle_t mch) { mac_client_impl_t *mcip = (mac_client_impl_t *)mch; mac_cpus_t *srs_cpu; mac_soft_ring_set_t *rx_srs; flow_entry_t *flent = mcip->mci_flent; 
mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); mac_ring_t *ring; mac_intr_t *mintr; /* * Check if we need to retarget the interrupt. We do this only * for the primary MAC client. We do this if we have the only * exclusive ring in the group. */ if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) { rx_srs = flent->fe_rx_srs[1]; srs_cpu = &rx_srs->srs_cpu; ring = rx_srs->srs_ring; mintr = &ring->mr_info.mri_intr; /* * If ddi_handle is present or the poll CPU is * already bound to the interrupt CPU, return -1. */ if (mintr->mi_ddi_handle != NULL || ((mrp->mrp_ncpus != 0) && (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) { return (-1); } return (srs_cpu->mc_rx_pollid); } return (-1); } void * mac_get_devinfo(mac_handle_t mh) { mac_impl_t *mip = (mac_impl_t *)mh; return ((void *)mip->mi_dip); } #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1]) #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3]) #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5]) uint64_t mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound) { struct ether_header *ehp; uint64_t hash = 0; uint16_t sap; uint_t skip_len; uint8_t proto; boolean_t ip_fragmented; /* * We may want to have one of these per MAC type plugin in the * future. For now supports only ethernet. 
*/ if (media != DL_ETHER) return (0L); /* for now we support only outbound packets */ ASSERT(is_outbound); ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))); ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); /* compute L2 hash */ ehp = (struct ether_header *)mp->b_rptr; if ((policy & MAC_PKT_HASH_L2) != 0) { uchar_t *mac_src = ehp->ether_shost.ether_addr_octet; uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet; hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst); policy &= ~MAC_PKT_HASH_L2; } if (policy == 0) goto done; /* skip ethernet header */ sap = ntohs(ehp->ether_type); if (sap == ETHERTYPE_VLAN) { struct ether_vlan_header *evhp; mblk_t *newmp = NULL; skip_len = sizeof (struct ether_vlan_header); if (MBLKL(mp) < skip_len) { /* the vlan tag is the payload, pull up first */ newmp = msgpullup(mp, -1); if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) { goto done; } evhp = (struct ether_vlan_header *)newmp->b_rptr; } else { evhp = (struct ether_vlan_header *)mp->b_rptr; } sap = ntohs(evhp->ether_type); freemsg(newmp); } else { skip_len = sizeof (struct ether_header); } /* if ethernet header is in its own mblk, skip it */ if (MBLKL(mp) <= skip_len) { skip_len -= MBLKL(mp); mp = mp->b_cont; if (mp == NULL) goto done; } sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; /* compute IP src/dst addresses hash and skip IPv{4,6} header */ switch (sap) { case ETHERTYPE_IP: { ipha_t *iphp; /* * If the header is not aligned or the header doesn't fit * in the mblk, bail now. Note that this may cause packets * reordering. */ iphp = (ipha_t *)(mp->b_rptr + skip_len); if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) || !OK_32PTR((char *)iphp)) goto done; proto = iphp->ipha_protocol; skip_len += IPH_HDR_LENGTH(iphp); /* Check if the packet is fragmented. */ ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) & IPH_OFFSET; /* * For fragmented packets, use addresses in addition to * the frag_id to generate the hash inorder to get * better distribution. 
*/ if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) { uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src); uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst); hash ^= (PKT_HASH_4BYTES(ip_src) ^ PKT_HASH_4BYTES(ip_dst)); policy &= ~MAC_PKT_HASH_L3; } if (ip_fragmented) { uint8_t *identp = (uint8_t *)&iphp->ipha_ident; hash ^= PKT_HASH_2BYTES(identp); goto done; } break; } case ETHERTYPE_IPV6: { ip6_t *ip6hp; ip6_frag_t *frag = NULL; uint16_t hdr_length; /* * If the header is not aligned or the header doesn't fit * in the mblk, bail now. Note that this may cause packets * reordering. */ ip6hp = (ip6_t *)(mp->b_rptr + skip_len); if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) || !OK_32PTR((char *)ip6hp)) goto done; if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length, &proto, &frag)) goto done; skip_len += hdr_length; /* * For fragmented packets, use addresses in addition to * the frag_id to generate the hash inorder to get * better distribution. */ if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) { uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]); uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]); hash ^= (PKT_HASH_4BYTES(ip_src) ^ PKT_HASH_4BYTES(ip_dst)); policy &= ~MAC_PKT_HASH_L3; } if (frag != NULL) { uint8_t *identp = (uint8_t *)&frag->ip6f_ident; hash ^= PKT_HASH_4BYTES(identp); goto done; } break; } default: goto done; } if (policy == 0) goto done; /* if ip header is in its own mblk, skip it */ if (MBLKL(mp) <= skip_len) { skip_len -= MBLKL(mp); mp = mp->b_cont; if (mp == NULL) goto done; } /* parse ULP header */ again: switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_ESP: case IPPROTO_SCTP: /* * These Internet Protocols are intentionally designed * for hashing from the git-go. Port numbers are in the first * word for transports, SPI is first for ESP. 
*/ if (mp->b_rptr + skip_len + 4 > mp->b_wptr) goto done; hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len)); break; case IPPROTO_AH: { ah_t *ah = (ah_t *)(mp->b_rptr + skip_len); uint_t ah_length = AH_TOTAL_LEN(ah); if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr) goto done; proto = ah->ah_nexthdr; skip_len += ah_length; /* if AH header is in its own mblk, skip it */ if (MBLKL(mp) <= skip_len) { skip_len -= MBLKL(mp); mp = mp->b_cont; if (mp == NULL) goto done; } goto again; } } done: return (hash); }