/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2019 Joyent, Inc. * Copyright 2023 Oxide Computer Company */ /* * MAC Services Module - misc utilities */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The next two functions are used for dropping packets or chains of * packets, respectively. We could use one function for both but * separating the use cases allows us to specify intent and prevent * dropping more data than intended. * * The purpose of these functions is to aid the debugging effort, * especially in production. Rather than use freemsg()/freemsgchain(), * it's preferable to use these functions when dropping a packet in * the MAC layer. These functions should only be used during * unexpected conditions. That is, any time a packet is dropped * outside of the regular, successful datapath. Consolidating all * drops on these functions allows the user to trace one location and * determine why the packet was dropped based on the msg. 
It also * allows the user to inspect the packet before it is freed. Finally, * it allows the user to avoid tracing freemsg()/freemsgchain() thus * keeping the hot path running as efficiently as possible. * * NOTE: At this time not all MAC drops are aggregated on these * functions; but that is the plan. This comment should be erased once * completed. */ /*PRINTFLIKE2*/ void mac_drop_pkt(mblk_t *mp, const char *fmt, ...) { va_list adx; char msg[128]; char *msgp = msg; ASSERT3P(mp->b_next, ==, NULL); va_start(adx, fmt); (void) vsnprintf(msgp, sizeof (msg), fmt, adx); va_end(adx); DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); freemsg(mp); } /*PRINTFLIKE2*/ void mac_drop_chain(mblk_t *chain, const char *fmt, ...) { va_list adx; char msg[128]; char *msgp = msg; va_start(adx, fmt); (void) vsnprintf(msgp, sizeof (msg), fmt, adx); va_end(adx); /* * We could use freemsgchain() for the actual freeing but * since we are already walking the chain to fire the dtrace * probe we might as well free the msg here too. */ for (mblk_t *mp = chain, *next; mp != NULL; ) { next = mp->b_next; DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); mp->b_next = NULL; freemsg(mp); mp = next; } } /* * Copy an mblk, preserving its hardware checksum flags. */ static mblk_t * mac_copymsg_cksum(mblk_t *mp) { mblk_t *mp1; mp1 = copymsg(mp); if (mp1 == NULL) return (NULL); mac_hcksum_clone(mp, mp1); return (mp1); } /* * Copy an mblk chain, presenting the hardware checksum flags of the * individual mblks. */ mblk_t * mac_copymsgchain_cksum(mblk_t *mp) { mblk_t *nmp = NULL; mblk_t **nmpp = &nmp; for (; mp != NULL; mp = mp->b_next) { if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) { freemsgchain(nmp); return (NULL); } nmpp = &((*nmpp)->b_next); } return (nmp); } /* * Calculate the ULP checksum for IPv4. Return true if the calculation * was successful, or false if an error occurred. If the later, place * an error message into '*err'. 
*/ static boolean_t mac_sw_cksum_ipv4(mblk_t *mp, uint32_t ip_hdr_offset, ipha_t *ipha, const char **err) { const uint8_t proto = ipha->ipha_protocol; size_t len; const uint32_t ip_hdr_sz = IPH_HDR_LENGTH(ipha); /* ULP offset from start of L2. */ const uint32_t ulp_offset = ip_hdr_offset + ip_hdr_sz; ipaddr_t src, dst; uint32_t cksum; uint16_t *up; /* * We need a pointer to the ULP checksum. We're assuming the * ULP checksum pointer resides in the first mblk. Our native * TCP stack should always put the headers in the first mblk, * but currently we have no way to guarantee that other * clients don't spread headers (or even header fields) across * mblks. */ switch (proto) { case IPPROTO_TCP: ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t))); if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) { *err = "mblk doesn't contain TCP header"; goto bail; } up = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_sz); cksum = IP_TCP_CSUM_COMP; break; case IPPROTO_UDP: ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t))); if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) { *err = "mblk doesn't contain UDP header"; goto bail; } up = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_sz); cksum = IP_UDP_CSUM_COMP; break; case IPPROTO_SCTP: { sctp_hdr_t *sctph; ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t))); if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) { *err = "mblk doesn't contain SCTP header"; goto bail; } sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset); sctph->sh_chksum = 0; sctph->sh_chksum = sctp_cksum(mp, ulp_offset); return (B_TRUE); } default: *err = "unexpected protocol"; goto bail; } /* Pseudo-header checksum. */ src = ipha->ipha_src; dst = ipha->ipha_dst; len = ntohs(ipha->ipha_length) - ip_hdr_sz; cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); cksum += htons(len); /* * We have already accounted for the pseudo checksum above. * Make sure the ULP checksum field is zero before computing * the rest. 
*/ *up = 0; cksum = IP_CSUM(mp, ulp_offset, cksum); *up = (uint16_t)(cksum ? cksum : ~cksum); return (B_TRUE); bail: return (B_FALSE); } /* * Calculate the ULP checksum for IPv6. Return true if the calculation * was successful, or false if an error occurred. If the later, place * an error message into '*err'. */ static boolean_t mac_sw_cksum_ipv6(mblk_t *mp, uint32_t ip_hdr_offset, const char **err) { ip6_t *ip6h = (ip6_t *)(mp->b_rptr + ip_hdr_offset); const uint8_t proto = ip6h->ip6_nxt; const uint16_t *iphs = (uint16_t *)ip6h; /* ULP offset from start of L2. */ uint32_t ulp_offset; size_t len; uint32_t cksum; uint16_t *up; uint16_t ip_hdr_sz; if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_sz, NULL)) { *err = "malformed IPv6 header"; goto bail; } ulp_offset = ip_hdr_offset + ip_hdr_sz; /* * We need a pointer to the ULP checksum. We're assuming the * ULP checksum pointer resides in the first mblk. Our native * TCP stack should always put the headers in the first mblk, * but currently we have no way to guarantee that other * clients don't spread headers (or even header fields) across * mblks. */ switch (proto) { case IPPROTO_TCP: ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t))); if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) { *err = "mblk doesn't contain TCP header"; goto bail; } up = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_sz); cksum = IP_TCP_CSUM_COMP; break; case IPPROTO_UDP: ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t))); if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) { *err = "mblk doesn't contain UDP header"; goto bail; } up = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_sz); cksum = IP_UDP_CSUM_COMP; break; case IPPROTO_SCTP: { sctp_hdr_t *sctph; ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t))); if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) { *err = "mblk doesn't contain SCTP header"; goto bail; } sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset); /* * Zero out the checksum field to ensure proper * checksum calculation. 
*/ sctph->sh_chksum = 0; sctph->sh_chksum = sctp_cksum(mp, ulp_offset); return (B_TRUE); } default: *err = "unexpected protocol"; goto bail; } /* * The payload length includes the payload and the IPv6 * extension headers; the idea is to subtract the extension * header length to get the real payload length. */ len = ntohs(ip6h->ip6_plen) - (ip_hdr_sz - IPV6_HDR_LEN); cksum += len; /* * We accumulate the pseudo header checksum in cksum; then we * call IP_CSUM to compute the checksum over the payload. */ cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + iphs[8] + iphs[9] + iphs[10] + iphs[11] + iphs[12] + iphs[13] + iphs[14] + iphs[15] + iphs[16] + iphs[17] + iphs[18] + iphs[19]; cksum = IP_CSUM(mp, ulp_offset, cksum); /* For UDP/IPv6 a zero UDP checksum is not allowed. Change to 0xffff */ if (proto == IPPROTO_UDP && cksum == 0) cksum = ~cksum; *up = (uint16_t)cksum; return (B_TRUE); bail: return (B_FALSE); } /* * Perform software checksum on a single message, if needed. The * emulation performed is determined by an intersection of the mblk's * flags and the emul flags requested. The emul flags are documented * in mac.h. */ static mblk_t * mac_sw_cksum(mblk_t *mp, mac_emul_t emul) { mblk_t *skipped_hdr = NULL; uint32_t flags, start, stuff, end, value; uint32_t ip_hdr_offset; uint16_t etype; size_t ip_hdr_sz; struct ether_header *ehp; const char *err = ""; /* * This function should only be called from mac_hw_emul() * which handles mblk chains and the shared ref case. */ ASSERT3P(mp->b_next, ==, NULL); mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL); flags = DB_CKSUMFLAGS(mp); /* Why call this if checksum emulation isn't needed? */ ASSERT3U(flags & (HCK_FLAGS), !=, 0); /* * Ethernet, and optionally VLAN header. mac_hw_emul() has * already verified we have enough data to read the L2 header. 
*/ ehp = (struct ether_header *)mp->b_rptr; if (ntohs(ehp->ether_type) == VLAN_TPID) { struct ether_vlan_header *evhp; evhp = (struct ether_vlan_header *)mp->b_rptr; etype = ntohs(evhp->ether_type); ip_hdr_offset = sizeof (struct ether_vlan_header); } else { etype = ntohs(ehp->ether_type); ip_hdr_offset = sizeof (struct ether_header); } /* * If this packet isn't IP, then leave it alone. We don't want * to affect non-IP traffic like ARP. Assume the IP header * doesn't include any options, for now. We will use the * correct size later after we know there are enough bytes to * at least fill out the basic header. */ switch (etype) { case ETHERTYPE_IP: ip_hdr_sz = sizeof (ipha_t); break; case ETHERTYPE_IPV6: ip_hdr_sz = sizeof (ip6_t); break; default: return (mp); } ASSERT3U(MBLKL(mp), >=, ip_hdr_offset); /* * If the first mblk of this packet contains only the ethernet * header, skip past it for now. Packets with their data * contained in only a single mblk can then use the fastpaths * tuned to that possibility. */ if (MBLKL(mp) == ip_hdr_offset) { ip_hdr_offset -= MBLKL(mp); /* This is guaranteed by mac_hw_emul(). */ ASSERT3P(mp->b_cont, !=, NULL); skipped_hdr = mp; mp = mp->b_cont; } /* * Both full and partial checksum rely on finding the IP * header in the current mblk. Our native TCP stack honors * this assumption but it's prudent to guard our future * clients that might not honor this contract. */ ASSERT3U(MBLKL(mp), >=, ip_hdr_offset + ip_hdr_sz); if (MBLKL(mp) < (ip_hdr_offset + ip_hdr_sz)) { err = "mblk doesn't contain IP header"; goto bail; } /* * We are about to modify the header mblk; make sure we are * modifying our own copy. The code that follows assumes that * the IP/ULP headers exist in this mblk (and drops the * message if they don't). 
*/ if (DB_REF(mp) > 1) { mblk_t *tmp = copyb(mp); if (tmp == NULL) { err = "copyb failed"; goto bail; } if (skipped_hdr != NULL) { ASSERT3P(skipped_hdr->b_cont, ==, mp); skipped_hdr->b_cont = tmp; } tmp->b_cont = mp->b_cont; freeb(mp); mp = tmp; } if (etype == ETHERTYPE_IP) { ipha_t *ipha = (ipha_t *)(mp->b_rptr + ip_hdr_offset); if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { if (!mac_sw_cksum_ipv4(mp, ip_hdr_offset, ipha, &err)) goto bail; } /* We always update the ULP checksum flags. */ if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { flags &= ~HCK_FULLCKSUM; flags |= HCK_FULLCKSUM_OK; value = 0; } /* * While unlikely, it's possible to write code that * might end up calling mac_sw_cksum() twice on the * same mblk (performing both LSO and checksum * emualtion in a single mblk chain loop -- the LSO * emulation inserts a new chain into the existing * chain and then the loop iterates back over the new * segments and emulates the checksum a second time). * Normally this wouldn't be a problem, because the * HCK_*_OK flags are supposed to indicate that we * don't need to do peform the work. But * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the * same value; so we cannot use these flags to * determine if the IP header checksum has already * been calculated or not. For this reason, we zero * out the the checksum first. In the future, we * should fix the HCK_* flags. */ if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { ipha->ipha_hdr_checksum = 0; ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha); flags &= ~HCK_IPV4_HDRCKSUM; flags |= HCK_IPV4_HDRCKSUM_OK; } } else if (etype == ETHERTYPE_IPV6) { /* There is no IP header checksum for IPv6. */ if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { if (!mac_sw_cksum_ipv6(mp, ip_hdr_offset, &err)) goto bail; flags &= ~HCK_FULLCKSUM; flags |= HCK_FULLCKSUM_OK; value = 0; } } /* * Partial checksum is the same for both IPv4 and IPv6. 
*/ if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { uint16_t *up, partial, cksum; uchar_t *ipp; /* ptr to beginning of IP header */ ipp = mp->b_rptr + ip_hdr_offset; up = (uint16_t *)((uchar_t *)ipp + stuff); partial = *up; *up = 0; ASSERT3S(end, >, start); cksum = ~IP_CSUM_PARTIAL(mp, ip_hdr_offset + start, partial); *up = cksum != 0 ? cksum : ~cksum; } /* We always update the ULP checksum flags. */ if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { flags &= ~HCK_PARTIALCKSUM; flags |= HCK_FULLCKSUM_OK; value = 0; } mac_hcksum_set(mp, start, stuff, end, value, flags); /* Don't forget to reattach the header. */ if (skipped_hdr != NULL) { ASSERT3P(skipped_hdr->b_cont, ==, mp); /* * Duplicate the HCKSUM data into the header mblk. * This mimics mac_add_vlan_tag which ensures that * both the first mblk _and_ the first data bearing * mblk possess the HCKSUM information. Consumers like * IP will end up discarding the ether_header mblk, so * for now, it is important that the data be available * in both places. */ mac_hcksum_clone(mp, skipped_hdr); mp = skipped_hdr; } return (mp); bail: if (skipped_hdr != NULL) { ASSERT3P(skipped_hdr->b_cont, ==, mp); mp = skipped_hdr; } mac_drop_pkt(mp, err); return (NULL); } /* * Build a single data segment from an LSO packet. The mblk chain * returned, seg_head, represents the data segment and is always * exactly seg_len bytes long. The lso_mp and offset input/output * parameters track our position in the LSO packet. This function * exists solely as a helper to mac_sw_lso(). * * Case A * * The current lso_mp is larger than the requested seg_len. The * beginning of seg_head may start at the beginning of lso_mp or * offset into it. In either case, a single mblk is returned, and * *offset is updated to reflect our new position in the current * lso_mp. 
*
 *          +----------------------------+
 *          |  in *lso_mp / out *lso_mp  |
 *          +----------------------------+
 *          ^                        ^
 *          |                        |
 *          |                        |
 *          |                        |
 *          +------------------------+
 *          |        seg_head        |
 *          +------------------------+
 *          ^                        ^
 *          |                        |
 *   in *offset = 0        out *offset = seg_len
 *
 *          |------   seg_len    ----|
 *
 *
 *       +------------------------------+
 *       |   in *lso_mp / out *lso_mp   |
 *       +------------------------------+
 *          ^                        ^
 *          |                        |
 *          |                        |
 *          |                        |
 *          +------------------------+
 *          |        seg_head        |
 *          +------------------------+
 *          ^                        ^
 *          |                        |
 *   in *offset = N        out *offset = N + seg_len
 *
 *          |------   seg_len    ----|
 *
 *
 *
 * Case B
 *
 *    The requested seg_len consumes exactly the rest of the lso_mp.
 *    I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr.
 *    The seg_head may start at the beginning of the lso_mp or at some
 *    offset into it. In either case we return a single mblk, reset
 *    *offset to zero, and walk to the next lso_mp.
 *
 *          +------------------------+           +------------------------+
 *          |       in *lso_mp       |---------->|      out *lso_mp       |
 *          +------------------------+           +------------------------+
 *          ^                        ^           ^
 *          |                        |           |
 *          |                        |    out *offset = 0
 *          |                        |
 *          +------------------------+
 *          |        seg_head        |
 *          +------------------------+
 *          ^
 *          |
 *    in *offset = 0
 *
 *          |------   seg_len    ----|
 *
 *
 *
 *      +----------------------------+           +------------------------+
 *      |         in *lso_mp         |---------->|      out *lso_mp       |
 *      +----------------------------+           +------------------------+
 *          ^                        ^           ^
 *          |                        |           |
 *          |                        |    out *offset = 0
 *          |                        |
 *          +------------------------+
 *          |        seg_head        |
 *          +------------------------+
 *          ^
 *          |
 *    in *offset = N
 *
 *          |------   seg_len    ----|
 *
 *
 * Case C
 *
 *    The requested seg_len is greater than the current lso_mp. In
 *    this case we must consume LSO mblks until we have enough data to
 *    satisfy either case (A) or (B) above. We will return multiple
 *    mblks linked via b_cont, offset will be set based on the cases
 *    above, and lso_mp will walk forward at least one mblk, but maybe
 *    more.
 *
 *    N.B. This diagram is not exhaustive. The seg_head may start on
 *    the beginning of an lso_mp.
The seg_tail may end exactly on the * boundary of an lso_mp. And there may be two (in this case the * middle block wouldn't exist), three, or more mblks in the * seg_head chain. This is meant as one example of what might * happen. The main thing to remember is that the seg_tail mblk * must be one of case (A) or (B) above. * * +------------------+ +----------------+ +------------------+ * | in *lso_mp |--->| *lso_mp |--->| out *lso_mp | * +------------------+ +----------------+ +------------------+ * ^ ^ ^ ^ ^ ^ * | | | | | | * | | | | | | * | | | | | | * | | | | | | * +------------+ +----------------+ +------------+ * | seg_head |--->| |--->| seg_tail | * +------------+ +----------------+ +------------+ * ^ ^ * | | * in *offset = N out *offset = MBLKL(seg_tail) * * |------------------- seg_len -------------------| * */ static mblk_t * build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len) { mblk_t *seg_head, *seg_tail, *seg_mp; ASSERT3P(*lso_mp, !=, NULL); ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr); seg_mp = dupb(*lso_mp); if (seg_mp == NULL) return (NULL); seg_head = seg_mp; seg_tail = seg_mp; /* Continue where we left off from in the lso_mp. */ seg_mp->b_rptr += *offset; last_mblk: /* Case (A) */ if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) { *offset += seg_len; seg_mp->b_wptr = seg_mp->b_rptr + seg_len; return (seg_head); } /* Case (B) */ if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) { *offset = 0; *lso_mp = (*lso_mp)->b_cont; return (seg_head); } /* Case (C) */ ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr); /* * The current LSO mblk doesn't have enough data to satisfy * seg_len -- continue peeling off LSO mblks to build the new * segment message. If allocation fails we free the previously * allocated segment mblks and return NULL. 
*/ while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) { ASSERT3U(MBLKL(seg_mp), <=, seg_len); seg_len -= MBLKL(seg_mp); *offset = 0; *lso_mp = (*lso_mp)->b_cont; seg_mp = dupb(*lso_mp); if (seg_mp == NULL) { freemsgchain(seg_head); return (NULL); } seg_tail->b_cont = seg_mp; seg_tail = seg_mp; } /* * We've walked enough LSO mblks that we can now satisfy the * remaining seg_len. At this point we need to jump back to * determine if we have arrived at case (A) or (B). */ /* Just to be paranoid that we didn't underflow. */ ASSERT3U(seg_len, <, IP_MAXPACKET); ASSERT3U(seg_len, >, 0); goto last_mblk; } /* * Perform software segmentation of a single LSO message. Take an LSO * message as input and return head/tail pointers as output. This * function should not be invoked directly but instead through * mac_hw_emul(). * * The resulting chain is comprised of multiple (nsegs) MSS sized * segments. Each segment will consist of two or more mblks joined by * b_cont: a header and one or more data mblks. The header mblk is * allocated anew for each message. The first segment's header is used * as a template for the rest with adjustments made for things such as * ID, sequence, length, TCP flags, etc. The data mblks reference into * the existing LSO mblk (passed in as omp) by way of dupb(). Their * b_rptr/b_wptr values are adjusted to reference only the fraction of * the LSO message they are responsible for. At the successful * completion of this function the original mblk (omp) is freed, * leaving the newely created segment chain as the only remaining * reference to the data. 
*/
static void
mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail,
    uint_t *count)
{
	/* Only meaningful (and only read) when HCK_PARTIALCKSUM is set. */
	uint32_t ocsum_flags, ocsum_start = 0, ocsum_stuff = 0;
	uint32_t mss;
	uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen;
	uint32_t oleft;
	uint_t nsegs, seg;
	int len;

	struct ether_vlan_header *oevh;
	const ipha_t *oiph;
	const tcph_t *otcph;
	ipha_t *niph;
	tcph_t *ntcph;
	uint16_t ip_id;
	/* tcp_sum/otcp_sum are only used when HCK_PARTIALCKSUM is set. */
	uint32_t tcp_seq, tcp_sum = 0, otcp_sum = 0;
	uint32_t offset;
	mblk_t *odatamp;
	mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp;
	mblk_t *tmptail;

	ASSERT3P(head, !=, NULL);
	ASSERT3P(tail, !=, NULL);
	ASSERT3P(count, !=, NULL);
	ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0);

	/* Assume we are dealing with a single LSO message. */
	ASSERT3P(omp->b_next, ==, NULL);

	/*
	 * XXX: This is a hack to deal with mac_add_vlan_tag().
	 *
	 * When VLANs are in play, mac_add_vlan_tag() creates a new
	 * mblk with just the ether_vlan_header and tacks it onto the
	 * front of 'omp'. This breaks the assumptions made below;
	 * namely that the TCP/IP headers are in the first mblk. In
	 * this case, since we already have to pay the cost of LSO
	 * emulation, we simply pull up everything. While this might
	 * seem irksome, keep in mind this will only apply in a couple
	 * of scenarios: a) an LSO-capable VLAN client sending to a
	 * non-LSO-capable client over the "MAC/bridge loopback"
	 * datapath or b) an LSO-capable VLAN client is sending to a
	 * client that, for whatever reason, doesn't have DLS-bypass
	 * enabled. Finally, we have to check for both a tagged and
	 * untagged sized mblk depending on if the mblk came via
	 * mac_promisc_dispatch() or mac_rx_deliver().
	 *
	 * In the future, two things should be done:
	 *
	 * 1. This function should make use of some yet to be
	 *    implemented "mblk helpers". These helper functions would
	 *    perform all the b_cont walking for us and guarantee safe
	 *    access to the mblk data.
	 *
	 * 2. We should add some slop to the mblks so that
	 *    mac_add_vlan_tag() can just edit the first mblk instead
	 *    of allocating on the hot path.
	 */
	if (MBLKL(omp) == sizeof (struct ether_vlan_header) ||
	    MBLKL(omp) == sizeof (struct ether_header)) {
		mblk_t *tmp = msgpullup(omp, -1);

		if (tmp == NULL) {
			mac_drop_pkt(omp, "failed to pull up");
			goto fail;
		}

		mac_hcksum_clone(omp, tmp);
		freemsg(omp);
		omp = tmp;
	}

	mss = DB_LSOMSS(omp);
	/* The segment count below divides by the MSS. */
	if (mss == 0) {
		mac_drop_pkt(omp, "LSO packet has zero MSS");
		goto fail;
	}

	ASSERT3U(msgsize(omp), <=, IP_MAXPACKET +
	    sizeof (struct ether_vlan_header));
	opktlen = msgsize(omp);

	/*
	 * First, get references to the IP and TCP headers and
	 * determine the total TCP length (header + data).
	 *
	 * Thanks to mac_hw_emul() we know that the first mblk must
	 * contain (at minimum) the full L2 header. However, this
	 * function assumes more than that. It assumes the L2/L3/L4
	 * headers are all contained in the first mblk of a message
	 * (i.e., no b_cont walking for headers). While this is a
	 * current reality (our native TCP stack and viona both
	 * enforce this) things may become more nuanced in the future
	 * (e.g. when introducing encap support or adding new
	 * clients). For now we guard against this case by dropping
	 * the packet.
	 */
	oevh = (struct ether_vlan_header *)omp->b_rptr;
	if (oevh->ether_tpid == htons(ETHERTYPE_VLAN))
		oehlen = sizeof (struct ether_vlan_header);
	else
		oehlen = sizeof (struct ether_header);

	ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t)));
	if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) {
		mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers");
		goto fail;
	}

	oiph = (ipha_t *)(omp->b_rptr + oehlen);
	oiphlen = IPH_HDR_LENGTH(oiph);
	otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen);
	otcphlen = TCP_HDR_LENGTH(otcph);

	/*
	 * Currently we only support LSO for TCP/IPv4.
	 */
	if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) {
		mac_drop_pkt(omp, "LSO unsupported IP version: %u",
		    IPH_HDR_VERSION(oiph));
		goto fail;
	}

	if (oiph->ipha_protocol != IPPROTO_TCP) {
		mac_drop_pkt(omp, "LSO unsupported protocol: %u",
		    oiph->ipha_protocol);
		goto fail;
	}

	if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) {
		mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set");
		goto fail;
	}

	ohdrslen = oehlen + oiphlen + otcphlen;
	if ((len = MBLKL(omp)) < ohdrslen) {
		mac_drop_pkt(omp, "LSO packet too short: %d < %u", len,
		    ohdrslen);
		goto fail;
	}

	/*
	 * Either we have data in the first mblk or it's just the
	 * header. In either case, we need to set rptr to the start of
	 * the TCP data.
	 */
	if (len > ohdrslen) {
		odatamp = omp;
		offset = ohdrslen;
	} else {
		ASSERT3U(len, ==, ohdrslen);
		odatamp = omp->b_cont;
		offset = 0;
	}

	/* Make sure we still have enough data. */
	ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen);

	/*
	 * If a MAC negotiated LSO then it must negotiate both
	 * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or
	 * HCKSUM_INET_PARTIAL; because both the IP and TCP headers
	 * change during LSO segmentation (only the 3 fields of the
	 * pseudo header checksum don't change: src, dst, proto). Thus
	 * we would expect these flags (HCK_IPV4_HDRCKSUM |
	 * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this
	 * function to emulate those checksums in software. However,
	 * that assumes a world where we only expose LSO if the
	 * underlying hardware exposes LSO. Moving forward the plan is
	 * to assume LSO in the upper layers and have MAC perform
	 * software LSO when the underlying provider doesn't support
	 * it. In such a world, if the provider doesn't support LSO
	 * but does support hardware checksum offload, then we could
	 * simply perform the segmentation and allow the hardware to
	 * calculate the checksums. To the hardware it's just another
	 * chain of non-LSO packets.
	 */
	ASSERT3S(DB_TYPE(omp), ==, M_DATA);
	ocsum_flags = DB_CKSUMFLAGS(omp);
	ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0);
	ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0);

	/*
	 * If hardware only provides partial checksum then software
	 * must supply the pseudo-header checksum. In the case of LSO
	 * we leave the TCP length at zero to be filled in by
	 * hardware. This function must handle two scenarios.
	 *
	 * 1. Being called by a MAC client on the Rx path to segment
	 *    an LSO packet and calculate the checksum.
	 *
	 * 2. Being called by a MAC provider to segment an LSO packet.
	 *    In this case the LSO segmentation is performed in
	 *    software (by this routine) but the MAC provider should
	 *    still calculate the TCP/IP checksums in hardware.
	 *
	 *  To elaborate on the second case: we cannot have the
	 *  scenario where IP sends LSO packets but the underlying HW
	 *  doesn't support checksum offload -- because in that case
	 *  TCP/IP would calculate the checksum in software (for the
	 *  LSO packet) but then MAC would segment the packet and have
	 *  to redo all the checksum work. So IP should never do LSO
	 *  if HW doesn't support both IP and TCP checksum.
	 */
	if (ocsum_flags & HCK_PARTIALCKSUM) {
		ocsum_start = (uint32_t)DB_CKSUMSTART(omp);
		ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp);
	}

	odatalen = opktlen - ohdrslen;

	/*
	 * With no payload the unsigned "odatalen - 1" below would wrap
	 * and produce an enormous segment count.
	 */
	if (odatalen == 0) {
		mac_drop_pkt(omp, "LSO packet has no data");
		goto fail;
	}

	/*
	 * Subtract one to account for the case where the data length
	 * is evenly divisible by the MSS. Add one to account for the
	 * fact that the division will always result in one less
	 * segment than needed.
	 */
	nsegs = ((odatalen - 1) / mss) + 1;
	if (nsegs < 2) {
		mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs);
		goto fail;
	}

	DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph,
	    __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t,
	    nsegs);

	seg_chain = NULL;
	tmptail = seg_chain;
	oleft = odatalen;

	for (uint_t i = 0; i < nsegs; i++) {
		boolean_t last_seg = ((i + 1) == nsegs);
		uint32_t seg_len;

		/*
		 * If we fail to allocate, then drop the partially
		 * allocated chain as well as the LSO packet. Let the
		 * sender deal with the fallout.
		 */
		if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) {
			freemsgchain(seg_chain);
			mac_drop_pkt(omp, "failed to alloc segment header");
			goto fail;
		}
		ASSERT3P(nhdrmp->b_cont, ==, NULL);

		if (seg_chain == NULL) {
			seg_chain = nhdrmp;
		} else {
			ASSERT3P(tmptail, !=, NULL);
			tmptail->b_next = nhdrmp;
		}

		tmptail = nhdrmp;

		/*
		 * Calculate this segment's length. It's either the MSS
		 * or whatever remains for the last segment.
		 */
		seg_len = last_seg ? oleft : mss;
		ASSERT3U(seg_len, <=, mss);
		ndatamp = build_data_seg(&odatamp, &offset, seg_len);

		if (ndatamp == NULL) {
			freemsgchain(seg_chain);
			mac_drop_pkt(omp, "LSO failed to segment data");
			goto fail;
		}

		/* Attach data mblk to header mblk. */
		nhdrmp->b_cont = ndatamp;
		DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO;
		ASSERT3U(seg_len, <=, oleft);
		oleft -= seg_len;
	}

	/* We should have consumed entire LSO msg. */
	ASSERT3S(oleft, ==, 0);
	ASSERT3P(odatamp, ==, NULL);

	/*
	 * All seg data mblks are referenced by the header mblks, null
	 * out this pointer to catch any bad derefs.
	 */
	ndatamp = NULL;

	/*
	 * Set headers and checksum for first segment.
	 */
	nhdrmp = seg_chain;
	bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen);
	nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
	niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
	ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);
	niph->ipha_length = htons(oiphlen + otcphlen + mss);
	niph->ipha_hdr_checksum = 0;
	ip_id = ntohs(niph->ipha_ident);
	ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
	tcp_seq = BE32_TO_U32(ntcph->th_seq);
	tcp_seq += mss;

	/*
	 * The first segment shouldn't:
	 *
	 *	o indicate end of data transmission (FIN),
	 *	o indicate immediate handling of the data (PUSH).
	 */
	ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
	DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);

	/*
	 * If the underlying HW provides partial checksum, then make
	 * sure to correct the pseudo header checksum before calling
	 * mac_sw_cksum(). The native TCP stack doesn't include the
	 * length field in the pseudo header when LSO is in play -- so
	 * we need to calculate it here.
	 */
	if (ocsum_flags & HCK_PARTIALCKSUM) {
		DB_CKSUMSTART(nhdrmp) = ocsum_start;
		DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
		DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
		tcp_sum = BE16_TO_U16(ntcph->th_sum);
		otcp_sum = tcp_sum;
		tcp_sum += mss + otcphlen;
		tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
		U16_TO_BE16(tcp_sum, ntcph->th_sum);
	}

	if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
	    (emul & MAC_HWCKSUM_EMULS)) {
		next_nhdrmp = nhdrmp->b_next;
		nhdrmp->b_next = NULL;
		nhdrmp = mac_sw_cksum(nhdrmp, emul);

		/*
		 * mac_sw_cksum() may hand back a different mblk, or
		 * NULL if it had to drop (and free) the segment.
		 */
		if (nhdrmp == NULL) {
			/*
			 * The freed segment was the head of seg_chain;
			 * only the remaining segments are left to free.
			 */
			freemsgchain(next_nhdrmp);
			mac_drop_pkt(omp, "LSO cksum emulation failed");
			goto fail;
		}

		nhdrmp->b_next = next_nhdrmp;
		next_nhdrmp = NULL;

		/*
		 * We may have freed the nhdrmp argument during
		 * checksum emulation, make sure that seg_chain
		 * references a valid mblk.
		 */
		seg_chain = nhdrmp;
	}

	ASSERT3P(nhdrmp, !=, NULL);

	seg = 1;
	DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
	    (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
	    (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss,
	    uint_t, seg);
	seg++;

	/* There better be at least 2 segs. */
	ASSERT3P(nhdrmp->b_next, !=, NULL);
	prev_nhdrmp = nhdrmp;
	nhdrmp = nhdrmp->b_next;

	/*
	 * Now adjust the headers of the middle segments. For each
	 * header we need to adjust the following.
	 *
	 *	o IP ID
	 *	o IP length
	 *	o TCP sequence
	 *	o TCP flags
	 *	o cksum flags
	 *	o cksum values (if MAC_HWCKSUM_EMUL is set)
	 */
	for (; seg < nsegs; seg++) {
		/*
		 * We use seg_chain as a reference to the first seg
		 * header mblk -- this first header is a template for
		 * the rest of the segments. This copy will include
		 * the now updated checksum values from the first
		 * header. We must reset these checksum values to
		 * their original to make sure we produce the correct
		 * value.
		 */
		bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
		nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
		niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
		niph->ipha_ident = htons(++ip_id);
		ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);
		niph->ipha_length = htons(oiphlen + otcphlen + mss);
		niph->ipha_hdr_checksum = 0;
		ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
		U32_TO_BE32(tcp_seq, ntcph->th_seq);
		tcp_seq += mss;
		/*
		 * Just like the first segment, the middle segments
		 * shouldn't have these flags set.
		 */
		ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
		DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);

		if (ocsum_flags & HCK_PARTIALCKSUM) {
			/*
			 * First and middle segs have same
			 * pseudo-header checksum.
			 */
			U16_TO_BE16(tcp_sum, ntcph->th_sum);
			DB_CKSUMSTART(nhdrmp) = ocsum_start;
			DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
			DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
		}

		if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
		    (emul & MAC_HWCKSUM_EMULS)) {
			next_nhdrmp = nhdrmp->b_next;
			nhdrmp->b_next = NULL;
			nhdrmp = mac_sw_cksum(nhdrmp, emul);

			/*
			 * As above: the segment may have been replaced
			 * or dropped by mac_sw_cksum().
			 */
			if (nhdrmp == NULL) {
				freemsgchain(next_nhdrmp);
				/*
				 * De-link the freed segment before
				 * freeing the segments that precede it.
				 */
				prev_nhdrmp->b_next = NULL;
				freemsgchain(seg_chain);
				mac_drop_pkt(omp,
				    "LSO cksum emulation failed");
				goto fail;
			}

			nhdrmp->b_next = next_nhdrmp;
			next_nhdrmp = NULL;
			/* We may have freed the original nhdrmp. */
			prev_nhdrmp->b_next = nhdrmp;
		}

		DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
		    (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
		    (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen),
		    uint_t, mss, uint_t, seg);

		ASSERT3P(nhdrmp->b_next, !=, NULL);
		prev_nhdrmp = nhdrmp;
		nhdrmp = nhdrmp->b_next;
	}

	/* Make sure we are on the last segment. */
	ASSERT3U(seg, ==, nsegs);
	ASSERT3P(nhdrmp->b_next, ==, NULL);

	/*
	 * Now we set the last segment header. The difference being
	 * that FIN/PSH/RST flags are allowed.
	 */
	bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
	nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
	niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
	niph->ipha_ident = htons(++ip_id);
	len = msgsize(nhdrmp->b_cont);
	ASSERT3S(len, >, 0);
	niph->ipha_length = htons(oiphlen + otcphlen + len);
	niph->ipha_hdr_checksum = 0;
	ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
	U32_TO_BE32(tcp_seq, ntcph->th_seq);

	DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
	if (ocsum_flags & HCK_PARTIALCKSUM) {
		DB_CKSUMSTART(nhdrmp) = ocsum_start;
		DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
		DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
		/* Start over from the original pseudo-header sum. */
		tcp_sum = otcp_sum;
		tcp_sum += len + otcphlen;
		tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
		U16_TO_BE16(tcp_sum, ntcph->th_sum);
	}

	if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
	    (emul & MAC_HWCKSUM_EMULS)) {
		/* This should be the last mblk. */
		ASSERT3P(nhdrmp->b_next, ==, NULL);
		nhdrmp = mac_sw_cksum(nhdrmp, emul);

		if (nhdrmp == NULL) {
			/* De-link the freed segment, then free the rest. */
			prev_nhdrmp->b_next = NULL;
			freemsgchain(seg_chain);
			mac_drop_pkt(omp, "LSO cksum emulation failed");
			goto fail;
		}

		prev_nhdrmp->b_next = nhdrmp;
	}

	DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
	    (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
	    (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len,
	    uint_t, seg);

	/*
	 * Free the reference to the original LSO message as it is
	 * being replaced by seg_chain.
	 */
	freemsg(omp);
	*head = seg_chain;
	*tail = nhdrmp;
	*count = nsegs;
	return;

fail:
	/* Every path here has already disposed of omp and any segments. */
	*head = NULL;
	*tail = NULL;
	*count = 0;
}

#define	HCK_NEEDED	(HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM)

/*
 * Emulate various hardware offload features in software. Take a chain
 * of packets as input and emulate the hardware features specified in
 * 'emul'. The resulting chain's head pointer replaces the 'mp_chain'
 * pointer given as input, and its tail pointer is written to
 * '*otail'. The number of packets in the new chain is written to
 * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus
 * may be NULL. The 'mp_chain' argument may point to a NULL chain; in
 * which case 'mp_chain' will simply stay a NULL chain.
 *
 * While unlikely, it is technically possible that this function could
 * receive a non-NULL chain as input and return a NULL chain as output
 * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be
 * zero). This could happen if all the packets in the chain are
 * dropped or if we fail to allocate new mblks. In this case, there is
 * nothing for the caller to free. In any event, the caller shouldn't
 * assume that '*mp_chain' is non-NULL on return.
 *
 * This function was written with three main use cases in mind.
 *
 * 1. To emulate hardware offloads when traveling mac-loopback (two
 *    clients on the same mac). This is wired up in mac_tx_send().
 *
 * 2. To provide hardware offloads to the client when the underlying
 *    provider cannot. This is currently wired up in mac_tx() but we
 *    still only negotiate offloads when the underlying provider
 *    supports them.
 *
 * 3. To emulate real hardware in simnet.
*/ void mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul) { mblk_t *head = NULL, *tail = NULL; uint_t count = 0; ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0); ASSERT3P(mp_chain, !=, NULL); for (mblk_t *mp = *mp_chain; mp != NULL; ) { mblk_t *tmp, *next, *tmphead, *tmptail; struct ether_header *ehp; uint32_t flags; uint_t len = MBLKL(mp), l2len; /* Perform LSO/cksum one message at a time. */ next = mp->b_next; mp->b_next = NULL; /* * For our sanity the first mblk should contain at * least the full L2 header. */ if (len < sizeof (struct ether_header)) { mac_drop_pkt(mp, "packet too short (A): %u", len); mp = next; continue; } ehp = (struct ether_header *)mp->b_rptr; if (ntohs(ehp->ether_type) == VLAN_TPID) l2len = sizeof (struct ether_vlan_header); else l2len = sizeof (struct ether_header); /* * If the first mblk is solely the L2 header, then * there better be more data. */ if (len < l2len || (len == l2len && mp->b_cont == NULL)) { mac_drop_pkt(mp, "packet too short (C): %u", len); mp = next; continue; } DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul); /* * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) * because we don't want to mask-out the LSO flag. */ flags = DB_CKSUMFLAGS(mp); if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) { uint_t tmpcount = 0; /* * LSO fix-up handles checksum emulation * inline (if requested). It also frees mp. */ mac_sw_lso(mp, emul, &tmphead, &tmptail, &tmpcount); if (tmphead == NULL) { /* mac_sw_lso() freed the mp. */ mp = next; continue; } count += tmpcount; } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) { tmp = mac_sw_cksum(mp, emul); if (tmp == NULL) { /* mac_sw_cksum() freed the mp. */ mp = next; continue; } tmphead = tmp; tmptail = tmp; count++; } else { /* There is nothing to emulate. */ tmp = mp; tmphead = tmp; tmptail = tmp; count++; } /* * The tmp mblk chain is either the start of the new * chain or added to the tail of the new chain. 
*/ if (head == NULL) { head = tmphead; tail = tmptail; } else { /* Attach the new mblk to the end of the new chain. */ tail->b_next = tmphead; tail = tmptail; } mp = next; } *mp_chain = head; if (otail != NULL) *otail = tail; if (ocount != NULL) *ocount = count; } /* * Add VLAN tag to the specified mblk. */ mblk_t * mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) { mblk_t *hmp; struct ether_vlan_header *evhp; struct ether_header *ehp; ASSERT(pri != 0 || vid != 0); /* * Allocate an mblk for the new tagged ethernet header, * and copy the MAC addresses and ethertype from the * original header. */ hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED); if (hmp == NULL) { freemsg(mp); return (NULL); } evhp = (struct ether_vlan_header *)hmp->b_rptr; ehp = (struct ether_header *)mp->b_rptr; bcopy(ehp, evhp, (ETHERADDRL * 2)); evhp->ether_type = ehp->ether_type; evhp->ether_tpid = htons(ETHERTYPE_VLAN); hmp->b_wptr += sizeof (struct ether_vlan_header); mp->b_rptr += sizeof (struct ether_header); /* * Free the original message if it's now empty. Link the * rest of messages to the header message. */ mac_hcksum_clone(mp, hmp); if (MBLKL(mp) == 0) { hmp->b_cont = mp->b_cont; freeb(mp); } else { hmp->b_cont = mp; } ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header)); /* * Initialize the new TCI (Tag Control Information). */ evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid)); return (hmp); } /* * Adds a VLAN tag with the specified VID and priority to each mblk of * the specified chain. 
*/ mblk_t * mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid) { mblk_t *next_mp, **prev, *mp; mp = mp_chain; prev = &mp_chain; while (mp != NULL) { next_mp = mp->b_next; mp->b_next = NULL; if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) { freemsgchain(next_mp); break; } *prev = mp; prev = &mp->b_next; mp = mp->b_next = next_mp; } return (mp_chain); } /* * Strip VLAN tag */ mblk_t * mac_strip_vlan_tag(mblk_t *mp) { mblk_t *newmp; struct ether_vlan_header *evhp; evhp = (struct ether_vlan_header *)mp->b_rptr; if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) { ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); if (DB_REF(mp) > 1) { newmp = copymsg(mp); if (newmp == NULL) return (NULL); freemsg(mp); mp = newmp; } evhp = (struct ether_vlan_header *)mp->b_rptr; ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL); mp->b_rptr += VLAN_TAGSZ; } return (mp); } /* * Strip VLAN tag from each mblk of the chain. */ mblk_t * mac_strip_vlan_tag_chain(mblk_t *mp_chain) { mblk_t *mp, *next_mp, **prev; mp = mp_chain; prev = &mp_chain; while (mp != NULL) { next_mp = mp->b_next; mp->b_next = NULL; if ((mp = mac_strip_vlan_tag(mp)) == NULL) { freemsgchain(next_mp); break; } *prev = mp; prev = &mp->b_next; mp = mp->b_next = next_mp; } return (mp_chain); } /* * Default callback function. Used when the datapath is not yet initialized. */ /* ARGSUSED */ void mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain, boolean_t loopback) { freemsgchain(mp_chain); } /* * Determines the IPv6 header length accounting for all the optional IPv6 * headers (hop-by-hop, destination, routing and fragment). The header length * and next header value (a transport header) is captured. * * Returns B_FALSE if all the IP headers are not in the same mblk otherwise * returns B_TRUE. 
*/ boolean_t mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length, uint8_t *next_hdr, ip6_frag_t **fragp) { uint16_t length; uint_t ehdrlen; uint8_t *whereptr; uint8_t *nexthdrp; ip6_dest_t *desthdr; ip6_rthdr_t *rthdr; ip6_frag_t *fraghdr; if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr) return (B_FALSE); ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); length = IPV6_HDR_LEN; whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ if (fragp != NULL) *fragp = NULL; nexthdrp = &ip6h->ip6_nxt; while (whereptr < endptr) { /* Is there enough left for len + nexthdr? */ if (whereptr + MIN_EHDR_LEN > endptr) break; switch (*nexthdrp) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: /* Assumes the headers are identical for hbh and dst */ desthdr = (ip6_dest_t *)whereptr; ehdrlen = 8 * (desthdr->ip6d_len + 1); if ((uchar_t *)desthdr + ehdrlen > endptr) return (B_FALSE); nexthdrp = &desthdr->ip6d_nxt; break; case IPPROTO_ROUTING: rthdr = (ip6_rthdr_t *)whereptr; ehdrlen = 8 * (rthdr->ip6r_len + 1); if ((uchar_t *)rthdr + ehdrlen > endptr) return (B_FALSE); nexthdrp = &rthdr->ip6r_nxt; break; case IPPROTO_FRAGMENT: fraghdr = (ip6_frag_t *)whereptr; ehdrlen = sizeof (ip6_frag_t); if ((uchar_t *)&fraghdr[1] > endptr) return (B_FALSE); nexthdrp = &fraghdr->ip6f_nxt; if (fragp != NULL) *fragp = fraghdr; break; case IPPROTO_NONE: /* No next header means we're finished */ default: *hdr_length = length; *next_hdr = *nexthdrp; return (B_TRUE); } length += ehdrlen; whereptr += ehdrlen; *hdr_length = length; *next_hdr = *nexthdrp; } switch (*nexthdrp) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_FRAGMENT: /* * If any know extension headers are still to be processed, * the packet's malformed (or at least all the IP header(s) are * not in the same mblk - and that should never happen. */ return (B_FALSE); default: /* * If we get here, we know that all of the IP headers were in * the same mblk, even if the ULP header is in the next mblk. 
*/ *hdr_length = length; *next_hdr = *nexthdrp; return (B_TRUE); } } /* * The following set of routines are there to take care of interrupt * re-targeting for legacy (fixed) interrupts. Some older versions * of the popular NICs like e1000g do not support MSI-X interrupts * and they reserve fixed interrupts for RX/TX rings. To re-target * these interrupts, PCITOOL ioctls need to be used. */ typedef struct mac_dladm_intr { int ino; int cpu_id; char driver_path[MAXPATHLEN]; char nexus_path[MAXPATHLEN]; } mac_dladm_intr_t; /* Bind the interrupt to cpu_num */ static int mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino) { pcitool_intr_set_t iset; int err; iset.old_cpu = oldcpuid; iset.ino = ino; iset.cpu_id = cpu_num; iset.user_version = PCITOOL_VERSION; err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL, kcred, NULL); return (err); } /* * Search interrupt information. iget is filled in with the info to search */ static boolean_t mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln) { int i; char driver_path[2 * MAXPATHLEN]; for (i = 0; i < iget_p->num_devs; i++) { (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN); (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN, ":%s%d", iget_p->dev[i].driver_name, iget_p->dev[i].dev_inst); /* Match the device path for the device path */ if (strcmp(driver_path, dln->driver_path) == 0) { dln->ino = iget_p->ino; dln->cpu_id = iget_p->cpu_id; return (B_TRUE); } } return (B_FALSE); } /* * Get information about ino, i.e. if this is the interrupt for our * device and where it is bound etc. */ static boolean_t mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino, mac_dladm_intr_t *dln) { pcitool_intr_get_t *iget_p; int ipsz; int nipsz; int err; uint8_t inum; /* * Check if SLEEP is OK, i.e if could come here in response to * changing the fanout due to some callback from the driver, say * link speed changes. 
*/ ipsz = PCITOOL_IGET_SIZE(0); iget_p = kmem_zalloc(ipsz, KM_SLEEP); iget_p->num_devs_ret = 0; iget_p->user_version = PCITOOL_VERSION; iget_p->cpu_id = oldcpuid; iget_p->ino = ino; err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, FKIOCTL, kcred, NULL); if (err != 0) { kmem_free(iget_p, ipsz); return (B_FALSE); } if (iget_p->num_devs == 0) { kmem_free(iget_p, ipsz); return (B_FALSE); } inum = iget_p->num_devs; if (iget_p->num_devs_ret < iget_p->num_devs) { /* Reallocate */ nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs); kmem_free(iget_p, ipsz); ipsz = nipsz; iget_p = kmem_zalloc(ipsz, KM_SLEEP); iget_p->num_devs_ret = inum; iget_p->cpu_id = oldcpuid; iget_p->ino = ino; iget_p->user_version = PCITOOL_VERSION; err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, FKIOCTL, kcred, NULL); if (err != 0) { kmem_free(iget_p, ipsz); return (B_FALSE); } /* defensive */ if (iget_p->num_devs != iget_p->num_devs_ret) { kmem_free(iget_p, ipsz); return (B_FALSE); } } if (mac_search_intrinfo(iget_p, dln)) { kmem_free(iget_p, ipsz); return (B_TRUE); } kmem_free(iget_p, ipsz); return (B_FALSE); } /* * Get the interrupts and check each one to see if it is for our device. */ static int mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid) { pcitool_intr_info_t intr_info; int err; int ino; int oldcpuid; err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info, FKIOCTL, kcred, NULL); if (err != 0) return (-1); for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) { for (ino = 0; ino < intr_info.num_intr; ino++) { if (mac_get_single_intr(lh, oldcpuid, ino, dln)) { if (dln->cpu_id == cpuid) return (0); return (1); } } } return (-1); } /* * Obtain the nexus parent node info. for mdip. 
*/
static dev_info_t *
mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
{
	struct dev_info *node;
	char devpath[MAXPATHLEN];

	/*
	 * Walk from mdip up through its ancestors, looking for the first
	 * node that exposes an interrupt-control (DDI_NT_INTRCTL) minor.
	 */
	for (node = (struct dev_info *)mdip; node != NULL;
	    node = node->devi_parent) {
		struct ddi_minor_data *md;
		int circ;

		/*
		 * The netboot code could call this function while walking
		 * the device tree, so only try to enter the node; blocking
		 * here could deadlock. Give up if the node is busy.
		 */
		if (ndi_devi_tryenter((dev_info_t *)node, &circ) == 0)
			break;

		for (md = node->devi_minor; md != NULL; md = md->next) {
			dev_info_t *nexus;

			if (strncmp(md->ddm_node_type, DDI_NT_INTRCTL,
			    strlen(DDI_NT_INTRCTL)) != 0)
				continue;

			/* Found it: record the path used to open the nexus. */
			nexus = md->dip;
			(void) ddi_pathname(nexus, devpath);
			(void) snprintf(dln->nexus_path, MAXPATHLEN,
			    "/devices%s:intr", devpath);
			/* The result of this call is not used below. */
			(void) ddi_pathname_minor(md, devpath);
			ndi_devi_exit((dev_info_t *)node, circ);
			return (nexus);
		}

		ndi_devi_exit((dev_info_t *)node, circ);
	}

	return (NULL);
}

/*
 * For a primary MAC client, if the user has set a list of CPUs or
 * we have obtained it implicitly, we try to retarget the interrupt
 * for that device on one of the CPUs in the list.
 * We assign the interrupt to the same CPU as the poll thread.
*/ static boolean_t mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid) { ldi_handle_t lh = NULL; ldi_ident_t li = NULL; int err; int ret; mac_dladm_intr_t dln; dev_info_t *dip; struct ddi_minor_data *minordata; dln.nexus_path[0] = '\0'; dln.driver_path[0] = '\0'; minordata = ((struct dev_info *)mdip)->devi_minor; while (minordata != NULL) { if (minordata->type == DDM_MINOR) break; minordata = minordata->next; } if (minordata == NULL) return (B_FALSE); (void) ddi_pathname_minor(minordata, dln.driver_path); dip = mac_get_nexus_node(mdip, &dln); /* defensive */ if (dip == NULL) return (B_FALSE); err = ldi_ident_from_major(ddi_driver_major(dip), &li); if (err != 0) return (B_FALSE); err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li); if (err != 0) return (B_FALSE); ret = mac_validate_intr(lh, &dln, cpuid); if (ret < 0) { (void) ldi_close(lh, FREAD|FWRITE, kcred); return (B_FALSE); } /* cmn_note? */ if (ret != 0) if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino))) != 0) { (void) ldi_close(lh, FREAD|FWRITE, kcred); return (B_FALSE); } (void) ldi_close(lh, FREAD|FWRITE, kcred); return (B_TRUE); } void mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid) { dev_info_t *mdip = (dev_info_t *)arg; mac_client_impl_t *mcip = (mac_client_impl_t *)mch; mac_resource_props_t *mrp; mac_perim_handle_t mph; flow_entry_t *flent = mcip->mci_flent; mac_soft_ring_set_t *rx_srs; mac_cpus_t *srs_cpu; if (!mac_check_interrupt_binding(mdip, cpuid)) cpuid = -1; mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph); mrp = MCIP_RESOURCE_PROPS(mcip); mrp->mrp_rx_intr_cpu = cpuid; if (flent != NULL && flent->fe_rx_srs_cnt == 2) { rx_srs = flent->fe_rx_srs[1]; srs_cpu = &rx_srs->srs_cpu; srs_cpu->mc_rx_intr_cpu = cpuid; } mac_perim_exit(mph); } int32_t mac_client_intr_cpu(mac_client_handle_t mch) { mac_client_impl_t *mcip = (mac_client_impl_t *)mch; mac_cpus_t *srs_cpu; mac_soft_ring_set_t *rx_srs; flow_entry_t *flent = mcip->mci_flent; 
mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); mac_ring_t *ring; mac_intr_t *mintr; /* * Check if we need to retarget the interrupt. We do this only * for the primary MAC client. We do this if we have the only * exclusive ring in the group. */ if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) { rx_srs = flent->fe_rx_srs[1]; srs_cpu = &rx_srs->srs_cpu; ring = rx_srs->srs_ring; mintr = &ring->mr_info.mri_intr; /* * If ddi_handle is present or the poll CPU is * already bound to the interrupt CPU, return -1. */ if (mintr->mi_ddi_handle != NULL || ((mrp->mrp_ncpus != 0) && (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) { return (-1); } return (srs_cpu->mc_rx_pollid); } return (-1); } void * mac_get_devinfo(mac_handle_t mh) { mac_impl_t *mip = (mac_impl_t *)mh; return ((void *)mip->mi_dip); } #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1]) #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3]) #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5]) uint64_t mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound) { struct ether_header *ehp; uint64_t hash = 0; uint16_t sap; uint_t skip_len; uint8_t proto; boolean_t ip_fragmented; /* * We may want to have one of these per MAC type plugin in the * future. For now supports only ethernet. 
*/ if (media != DL_ETHER) return (0L); /* for now we support only outbound packets */ ASSERT(is_outbound); ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))); ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); /* compute L2 hash */ ehp = (struct ether_header *)mp->b_rptr; if ((policy & MAC_PKT_HASH_L2) != 0) { uchar_t *mac_src = ehp->ether_shost.ether_addr_octet; uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet; hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst); policy &= ~MAC_PKT_HASH_L2; } if (policy == 0) goto done; /* skip ethernet header */ sap = ntohs(ehp->ether_type); if (sap == ETHERTYPE_VLAN) { struct ether_vlan_header *evhp; mblk_t *newmp = NULL; skip_len = sizeof (struct ether_vlan_header); if (MBLKL(mp) < skip_len) { /* the vlan tag is the payload, pull up first */ newmp = msgpullup(mp, -1); if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) { goto done; } evhp = (struct ether_vlan_header *)newmp->b_rptr; } else { evhp = (struct ether_vlan_header *)mp->b_rptr; } sap = ntohs(evhp->ether_type); freemsg(newmp); } else { skip_len = sizeof (struct ether_header); } /* if ethernet header is in its own mblk, skip it */ if (MBLKL(mp) <= skip_len) { skip_len -= MBLKL(mp); mp = mp->b_cont; if (mp == NULL) goto done; } sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; /* compute IP src/dst addresses hash and skip IPv{4,6} header */ switch (sap) { case ETHERTYPE_IP: { ipha_t *iphp; /* * If the header is not aligned or the header doesn't fit * in the mblk, bail now. Note that this may cause packets * reordering. */ iphp = (ipha_t *)(mp->b_rptr + skip_len); if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) || !OK_32PTR((char *)iphp)) goto done; proto = iphp->ipha_protocol; skip_len += IPH_HDR_LENGTH(iphp); /* Check if the packet is fragmented. */ ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) & IPH_OFFSET; /* * For fragmented packets, use addresses in addition to * the frag_id to generate the hash inorder to get * better distribution. 
*/ if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) { uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src); uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst); hash ^= (PKT_HASH_4BYTES(ip_src) ^ PKT_HASH_4BYTES(ip_dst)); policy &= ~MAC_PKT_HASH_L3; } if (ip_fragmented) { uint8_t *identp = (uint8_t *)&iphp->ipha_ident; hash ^= PKT_HASH_2BYTES(identp); goto done; } break; } case ETHERTYPE_IPV6: { ip6_t *ip6hp; ip6_frag_t *frag = NULL; uint16_t hdr_length; /* * If the header is not aligned or the header doesn't fit * in the mblk, bail now. Note that this may cause packets * reordering. */ ip6hp = (ip6_t *)(mp->b_rptr + skip_len); if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) || !OK_32PTR((char *)ip6hp)) goto done; if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length, &proto, &frag)) goto done; skip_len += hdr_length; /* * For fragmented packets, use addresses in addition to * the frag_id to generate the hash inorder to get * better distribution. */ if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) { uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]); uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]); hash ^= (PKT_HASH_4BYTES(ip_src) ^ PKT_HASH_4BYTES(ip_dst)); policy &= ~MAC_PKT_HASH_L3; } if (frag != NULL) { uint8_t *identp = (uint8_t *)&frag->ip6f_ident; hash ^= PKT_HASH_4BYTES(identp); goto done; } break; } default: goto done; } if (policy == 0) goto done; /* if ip header is in its own mblk, skip it */ if (MBLKL(mp) <= skip_len) { skip_len -= MBLKL(mp); mp = mp->b_cont; if (mp == NULL) goto done; } /* parse ULP header */ again: switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_ESP: case IPPROTO_SCTP: /* * These Internet Protocols are intentionally designed * for hashing from the git-go. Port numbers are in the first * word for transports, SPI is first for ESP. 
*/ if (mp->b_rptr + skip_len + 4 > mp->b_wptr) goto done; hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len)); break; case IPPROTO_AH: { ah_t *ah = (ah_t *)(mp->b_rptr + skip_len); uint_t ah_length = AH_TOTAL_LEN(ah); if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr) goto done; proto = ah->ah_nexthdr; skip_len += ah_length; /* if AH header is in its own mblk, skip it */ if (MBLKL(mp) <= skip_len) { skip_len -= MBLKL(mp); mp = mp->b_cont; if (mp == NULL) goto done; } goto again; } } done: return (hash); }