/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 1990 Mentat Inc. */ #include #include #include #include #include #include #include #include #define _SUN_TPI_VERSION 2 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Temporary; for CR 6451644 work-around */ #include /* * Naming conventions: * These rules should be judiciously applied * if there is a need to identify something as IPv6 versus IPv4 * IPv6 funcions will end with _v6 in the ip module. * IPv6 funcions will end with _ipv6 in the transport modules. * IPv6 macros: * Some macros end with _V6; e.g. ILL_FRAG_HASH_V6 * Some macros start with V6_; e.g. 
V6_OR_V4_INADDR_ANY
 *		And then there are ..V4_PART_OF_V6.
 *	The intent is that macros in the ip module end with _V6.
 * IPv6 global variables will start with ipv6_
 * IPv6 structures will start with ipv6
 * IPv6 defined constants should start with IPV6_
 *	(but then there are NDP_DEFAULT_VERS_PRI_AND_FLOW, etc)
 */

/*
 * ip6opt_ls is used to enable IPv6 (via /etc/system on TX systems).
 * We need to do this because we didn't obtain the IP6OPT_LS (0x0a)
 * from IANA. This mechanism will remain in effect until an official
 * number is obtained.
 */
uchar_t ip6opt_ls;

/*
 * Well-known IPv6 addresses.
 *
 * Each initializer lists four 32-bit words.  Where the value is not
 * byte-order neutral, a big-endian and a little-endian variant are given;
 * the little-endian words are the byte-swapped big-endian ones, so the
 * bytes in memory are the same on both kinds of machine.  The textual
 * addresses in the comments below are read off the big-endian words.
 */

/* All bits set; used in this file as a full-length (host) netmask. */
const in6_addr_t ipv6_all_ones =
	{ 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU };

/* All bits clear (::). */
const in6_addr_t ipv6_all_zeros = { 0, 0, 0, 0 };

/* ff00:: */
#ifdef	_BIG_ENDIAN
const in6_addr_t ipv6_unspecified_group = { 0xff000000U, 0, 0, 0 };
#else	/* _BIG_ENDIAN */
const in6_addr_t ipv6_unspecified_group = { 0x000000ffU, 0, 0, 0 };
#endif	/* _BIG_ENDIAN */

/* ::1 */
#ifdef	_BIG_ENDIAN
const in6_addr_t ipv6_loopback = { 0, 0, 0, 0x00000001U };
#else	/* _BIG_ENDIAN */
const in6_addr_t ipv6_loopback = { 0, 0, 0, 0x01000000U };
#endif	/* _BIG_ENDIAN */

/* ff02::1 */
#ifdef	_BIG_ENDIAN
const in6_addr_t ipv6_all_hosts_mcast = { 0xff020000U, 0, 0, 0x00000001U };
#else	/* _BIG_ENDIAN */
const in6_addr_t ipv6_all_hosts_mcast = { 0x000002ffU, 0, 0, 0x01000000U };
#endif	/* _BIG_ENDIAN */

/* ff02::2 */
#ifdef	_BIG_ENDIAN
const in6_addr_t ipv6_all_rtrs_mcast = { 0xff020000U, 0, 0, 0x00000002U };
#else	/* _BIG_ENDIAN */
const in6_addr_t ipv6_all_rtrs_mcast = { 0x000002ffU, 0, 0, 0x02000000U };
#endif	/* _BIG_ENDIAN */

/* ff02::16 */
#ifdef	_BIG_ENDIAN
const in6_addr_t ipv6_all_v2rtrs_mcast = { 0xff020000U, 0, 0, 0x00000016U };
#else	/* _BIG_ENDIAN */
const in6_addr_t ipv6_all_v2rtrs_mcast = { 0x000002ffU, 0, 0, 0x16000000U };
#endif	/* _BIG_ENDIAN */

/* ff02::1:ff00:0 (prefix; the low 24 bits are zero here) */
#ifdef	_BIG_ENDIAN
const in6_addr_t ipv6_solicited_node_mcast =
	{ 0xff020000U, 0, 0x00000001U, 0xff000000U };
#else	/* _BIG_ENDIAN */
const in6_addr_t ipv6_solicited_node_mcast =
	{ 0x000002ffU, 0, 0x01000000U, 0x000000ffU };
#endif	/* _BIG_ENDIAN */

/* Forward declarations of the file-local ICMPv6 helpers. */
static boolean_t icmp_inbound_verify_v6(mblk_t *, icmp6_t *,
    ip_recv_attr_t *);
static void	icmp_inbound_too_big_v6(icmp6_t *, ip_recv_attr_t *);
static void	icmp_pkt_v6(mblk_t *, void *, size_t, const in6_addr_t *,
    ip_recv_attr_t *);
static void	icmp_redirect_v6(mblk_t *, ip6_t *, nd_redirect_t *,
    ip_recv_attr_t *);
static void	icmp_send_redirect_v6(mblk_t *, in6_addr_t *,
    in6_addr_t *, ip_recv_attr_t *);
static void	icmp_send_reply_v6(mblk_t *, ip6_t *, icmp6_t *,
    ip_recv_attr_t *);
static boolean_t ip_source_routed_v6(ip6_t *, mblk_t *, ip_stack_t *);

/*
 * icmp_inbound_v6 deals with ICMP messages that are handled by IP.
 * If the ICMP message is consumed by IP, i.e., it should not be delivered
 * to any IPPROTO_ICMP raw sockets, then it returns NULL.
 * Likewise, if the ICMP error is malformed (too short, etc), then it
 * returns NULL. The caller uses this to determine whether or not to send
 * to raw sockets.
 *
 * All error messages are passed to the matching transport stream.
 *
 * See comment for icmp_inbound_v4() on how IPsec is handled.
*/
/*
 * Arguments and return value, as used by the code below:
 *	mp	the received message; always either consumed here or
 *		handed back to the caller through the return value.
 *	ira	receive attributes; supplies the ill, the outer header
 *		length (ira_ip_hdr_length) and the packet length.
 *
 * Returns NULL when the message was dropped or fully consumed, "mp" itself
 * when IP has no use for it but a raw ICMPv6 client exists, the result of
 * mld_input() for MLD messages, and otherwise a copy of the message (or
 * NULL if there was no raw client or the copy failed) for the caller to
 * deliver to raw sockets.
 */
mblk_t *
icmp_inbound_v6(mblk_t *mp, ip_recv_attr_t *ira)
{
	icmp6_t		*icmp6;
	ip6_t		*ip6h;		/* Outer header */
	int		ip_hdr_length;	/* Outer header length */
	boolean_t	interested;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	mblk_t		*mp_ret = NULL;

	ip6h = (ip6_t *)mp->b_rptr;

	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs);

	/* Make sure ira_l2src is set for ndp_input */
	if (!(ira->ira_flags & IRAF_L2SRC_SET))
		ip_setl2src(mp, ira, ira->ira_rill);

	/*
	 * The first mblk must hold the outer header plus the minimal ICMPv6
	 * header.  If the packet as a whole is too short it is truncated;
	 * otherwise pull up just enough to look at the type and code.
	 */
	ip_hdr_length = ira->ira_ip_hdr_length;
	if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMP6_MINLEN)) {
		if (ira->ira_pktlen < (ip_hdr_length + ICMP6_MINLEN)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
			freemsg(mp);
			return (NULL);
		}
		ip6h = ip_pullup(mp, ip_hdr_length + ICMP6_MINLEN, ira);
		if (ip6h == NULL) {
			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
			freemsg(mp);
			return (NULL);
		}
	}

	icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
	DTRACE_PROBE2(icmp__inbound__v6, ip6_t *, ip6h, icmp6_t *, icmp6);
	ip2dbg(("icmp_inbound_v6: type %d code %d\n", icmp6->icmp6_type,
	    icmp6->icmp6_code));

	/*
	 * We will set "interested" to "true" if we should pass a copy to
	 * the transport i.e., if it is an error message.
	 */
	interested = !(icmp6->icmp6_type & ICMP6_INFOMSG_MASK);

	/*
	 * First pass over the type: count it in the MIB and deal with the
	 * messages that are fully handled right here (echo request,
	 * neighbor solicit/advert, MLD).  Those cases return directly.
	 */
	switch (icmp6->icmp6_type) {
	case ICMP6_DST_UNREACH:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInDestUnreachs);
		if (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN)
			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInAdminProhibs);
		break;

	case ICMP6_TIME_EXCEEDED:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInTimeExcds);
		break;

	case ICMP6_PARAM_PROB:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInParmProblems);
		break;

	case ICMP6_PACKET_TOO_BIG:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInPktTooBigs);
		break;

	case ICMP6_ECHO_REQUEST:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchos);
		/*
		 * Multicast echo requests are only answered when the
		 * ips_ipv6_resp_echo_mcast setting allows it; otherwise
		 * fall out to the raw-socket handling below.
		 */
		if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
		    !ipst->ips_ipv6_resp_echo_mcast)
			break;

		/*
		 * We must have exclusive use of the mblk to convert it to
		 * a response.
		 * If not, we copy it.
		 */
		if (mp->b_datap->db_ref > 1) {
			mblk_t	*mp1;

			mp1 = copymsg(mp);
			if (mp1 == NULL) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				ip_drop_input("ipIfStatsInDiscards - copymsg",
				    mp, ill);
				freemsg(mp);
				return (NULL);
			}
			freemsg(mp);
			mp = mp1;
			/* Re-derive the header pointers in the copy. */
			ip6h = (ip6_t *)mp->b_rptr;
			icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
		}

		/* The request is turned into the reply in place. */
		icmp6->icmp6_type = ICMP6_ECHO_REPLY;
		icmp_send_reply_v6(mp, ip6h, icmp6, ira);
		return (NULL);

	case ICMP6_ECHO_REPLY:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchoReplies);
		break;

	case ND_ROUTER_SOLICIT:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRouterSolicits);
		break;

	case ND_ROUTER_ADVERT:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRouterAdvertisements);
		break;

	case ND_NEIGHBOR_SOLICIT:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInNeighborSolicits);
		ndp_input(mp, ira);
		return (NULL);

	case ND_NEIGHBOR_ADVERT:
		BUMP_MIB(ill->ill_icmp6_mib,
		    ipv6IfIcmpInNeighborAdvertisements);
		ndp_input(mp, ira);
		return (NULL);

	case ND_REDIRECT:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRedirects);

		/*
		 * When redirects are ignored, "interested" keeps the value
		 * computed above, so IP does no redirect processing below.
		 */
		if (ipst->ips_ipv6_ignore_redirect)
			break;

		/* We now allow a RAW socket to receive this. */
		interested = B_TRUE;
		break;

	/*
	 * The next three icmp messages will be handled by MLD.
	 * Pass all valid MLD packets up to any process(es)
	 * listening on a raw ICMP socket.
	 */
	case MLD_LISTENER_QUERY:
	case MLD_LISTENER_REPORT:
	case MLD_LISTENER_REDUCTION:
		mp = mld_input(mp, ira);
		return (mp);
	default:
		break;
	}
	/*
	 * See if there is an ICMP client to avoid an extra copymsg/freemsg
	 * if there isn't one.
	 */
	if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_ICMPV6].connf_head != NULL) {
		/* If there is an ICMP client and we want one too, copy it. */

		if (!interested) {
			/* Caller will deliver to RAW sockets */
			return (mp);
		}
		/*
		 * A failed copy is counted as a discard but is not fatal:
		 * IP still processes the original and the caller simply
		 * gets NULL back (nothing for the raw sockets).
		 */
		mp_ret = copymsg(mp);
		if (mp_ret == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
		}
	} else if (!interested) {
		/* Neither we nor raw sockets are interested. Drop packet now */
		freemsg(mp);
		return (NULL);
	}

	/*
	 * ICMP error or redirect packet. Make sure we have enough of
	 * the header and that db_ref == 1 since we might end up modifying
	 * the packet.
	 */
	if (mp->b_cont != NULL) {
		if (ip_pullup(mp, -1, ira) == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - ip_pullup",
			    mp, ill);
			freemsg(mp);
			return (mp_ret);
		}
	}

	if (mp->b_datap->db_ref > 1) {
		mblk_t	*mp1;

		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
			freemsg(mp);
			return (mp_ret);
		}
		freemsg(mp);
		mp = mp1;
	}

	/*
	 * In case mp has changed, verify the message before any further
	 * processes.
	 */
	ip6h = (ip6_t *)mp->b_rptr;
	icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
	if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
		freemsg(mp);
		return (mp_ret);
	}

	/*
	 * Second pass over the type: act on the verified message.  Each of
	 * the callees below is handed mp; mp is not touched again here.
	 */
	switch (icmp6->icmp6_type) {
	case ND_REDIRECT:
		icmp_redirect_v6(mp, ip6h, (nd_redirect_t *)icmp6, ira);
		break;
	case ICMP6_PACKET_TOO_BIG:
		/* Update DCE and adjust MTU in icmp header if needed */
		icmp_inbound_too_big_v6(icmp6, ira);
		/* FALLTHRU */
	default:
		icmp_inbound_error_fanout_v6(mp, icmp6, ira);
		break;
	}

	return (mp_ret);
}

/*
 * Send an ICMP echo reply.
* The caller has already updated the payload part of the packet.
 * We handle the ICMP checksum, IP source address selection and feed
 * the packet into ip_output_simple.
 */
/*
 * Arguments, as used by the code below:
 *	mp	the echo request, already retyped as a reply; handed to
 *		ip_output_simple() (or to ipsec_in_to_out() on failure).
 *	ip6h	outer IPv6 header inside mp; rewritten in place.
 *	icmp6	ICMPv6 header inside mp.
 *	ira	receive attributes of the request.
 */
static void
icmp_send_reply_v6(mblk_t *mp, ip6_t *ip6h, icmp6_t *icmp6,
    ip_recv_attr_t *ira)
{
	uint_t		ip_hdr_length = ira->ira_ip_hdr_length;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	ip_xmit_attr_t	ixas;
	in6_addr_t	origsrc;

	/*
	 * Remove any extension headers (do not reverse a source route)
	 * and clear the flow id (keep traffic class for now).
	 */
	if (ip_hdr_length != IPV6_HDR_LEN) {
		int	i;

		/*
		 * Slide the fixed header up against the ICMPv6 header,
		 * overwriting the extension headers.  The copy runs from
		 * the last byte backwards, so an overlapping source and
		 * destination are handled.
		 */
		for (i = 0; i < IPV6_HDR_LEN; i++) {
			mp->b_rptr[ip_hdr_length - i - 1] =
			    mp->b_rptr[IPV6_HDR_LEN - i - 1];
		}
		mp->b_rptr += (ip_hdr_length - IPV6_HDR_LEN);
		ip6h = (ip6_t *)mp->b_rptr;
		ip6h->ip6_nxt = IPPROTO_ICMPV6;
		/* The payload length no longer includes the removed bytes. */
		i = ntohs(ip6h->ip6_plen);
		i -= (ip_hdr_length - IPV6_HDR_LEN);
		ip6h->ip6_plen = htons(i);
		ip_hdr_length = IPV6_HDR_LEN;
		ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == msgdsize(mp));
	}
	ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;

	/* Reverse the source and destination addresses. */
	origsrc = ip6h->ip6_src;
	ip6h->ip6_src = ip6h->ip6_dst;
	ip6h->ip6_dst = origsrc;

	/* set the hop limit */
	ip6h->ip6_hops = ipst->ips_ipv6_def_hops;

	/*
	 * Prepare for checksum by putting icmp length in the icmp
	 * checksum field. The checksum is calculated in ip_output
	 */
	icmp6->icmp6_cksum = ip6h->ip6_plen;

	/* Build a stack-local transmit attribute structure for the reply. */
	bzero(&ixas, sizeof (ixas));
	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
	ixas.ixa_zoneid = ira->ira_zoneid;
	ixas.ixa_cred = kcred;
	ixas.ixa_cpid = NOPID;
	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
	ixas.ixa_ifindex = 0;
	ixas.ixa_ipst = ipst;
	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;

	if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
		/*
		 * This packet should go out the same way as it
		 * came in i.e in clear, independent of the IPsec
		 * policy for transmitting packets.
		 */
		ixas.ixa_flags |= IXAF_NO_IPSEC;
	} else {
		if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			/* Note: mp already consumed and ip_drop_packet done */
			return;
		}
	}

	/* Was the destination (now source) link-local? Send out same group */
	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
		ixas.ixa_flags |= IXAF_SCOPEID_SET;
		if (IS_UNDER_IPMP(ill))
			ixas.ixa_scopeid = ill_get_upper_ifindex(ill);
		else
			ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex;
	}

	if (ira->ira_flags & IRAF_MULTIBROADCAST) {
		/*
		 * Not one of our addresses (IRE_LOCALs), thus we let
		 * ip_output_simple pick the source.
		 */
		ip6h->ip6_src = ipv6_all_zeros;
		ixas.ixa_flags |= IXAF_SET_SOURCE;
	}

	/* Should we send using dce_pmtu? */
	if (ipst->ips_ipv6_icmp_return_pmtu)
		ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;

	(void) ip_output_simple(mp, &ixas);
	ixa_cleanup(&ixas);

}

/*
 * Verify an inbound ICMP error or redirect packet.
 * The caller should have fully pulled up the message. If it's a redirect
 * packet, only basic checks on IP header will be done; otherwise, verify
 * the packet by looking at the included ULP header.
 *
 * Called before icmp_inbound_error_fanout_v6 is called.
 *
 * Returns B_TRUE if the message may be processed further.  Returns B_FALSE
 * after bumping ipIfStatsInDiscards (bogus message) or ipv6IfIcmpInErrors
 * (truncated message); mp is not freed here in either case.
 */
static boolean_t
icmp_inbound_verify_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira)
{
	ill_t		*ill = ira->ira_ill;
	uint16_t	hdr_length;
	uint8_t		*nexthdrp;
	uint8_t		nexthdr;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*connp;
	ip6_t		*ip6h;	/* Inner header */

	/* Whatever follows the ICMPv6 header must at least fit an ip6_t. */
	ip6h = (ip6_t *)&icmp6[1];
	if ((uchar_t *)ip6h + IPV6_HDR_LEN > mp->b_wptr)
		goto truncated;

	if (icmp6->icmp6_type == ND_REDIRECT) {
		hdr_length = sizeof (nd_redirect_t);
	} else {
		if ((IPH_HDR_VERSION(ip6h) != IPV6_VERSION))
			goto discard_pkt;
		hdr_length = IPV6_HDR_LEN;
	}

	if ((uchar_t *)ip6h + hdr_length > mp->b_wptr)
		goto truncated;

	/*
	 * Stop here for ICMP_REDIRECT.
	 */
	if (icmp6->icmp6_type == ND_REDIRECT)
		return (B_TRUE);

	/*
	 * ICMP errors only.
	 */
	/* Find the inner packet's total header length and its ULP type. */
	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp))
		goto discard_pkt;
	nexthdr = *nexthdrp;

	/* Try to pass the ICMP message to clients who need it */
	switch (nexthdr) {
	case IPPROTO_UDP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr)
			goto truncated;
		break;
	case IPPROTO_TCP: {
		tcpha_t		*tcpha;

		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr)
			goto truncated;

		tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length);
		/*
		 * With IPMP we need to match across group, which we do
		 * since we have the upper ill from ira_ill.
		 */
		connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha, TCPS_LISTEN,
		    ill->ill_phyint->phyint_ifindex, ipst);
		if (connp == NULL)
			goto discard_pkt;

		/*
		 * Let the connection, if it registered a verifier, decide
		 * whether this error is acceptable.  The reference obtained
		 * by the lookup is dropped on both outcomes.
		 */
		if ((connp->conn_verifyicmp != NULL) &&
		    !connp->conn_verifyicmp(connp, tcpha, NULL, icmp6, ira)) {
			CONN_DEC_REF(connp);
			goto discard_pkt;
		}
		CONN_DEC_REF(connp);
		break;
	}
	case IPPROTO_SCTP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr)
			goto truncated;
		break;
	case IPPROTO_ESP:
	case IPPROTO_AH:
		/* No additional length check is done for ESP/AH here. */
		break;
	case IPPROTO_ENCAP:
	case IPPROTO_IPV6: {
		/* Look for self-encapsulated packets that caused an error */
		ip6_t *in_ip6h;

		/*
		 * The encapsulated header (IPv4 for IPPROTO_ENCAP, IPv6
		 * otherwise) must be completely present.
		 */
		in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length);

		if ((uint8_t *)in_ip6h + (nexthdr == IPPROTO_ENCAP ?
		    sizeof (ipha_t) : sizeof (ip6_t)) > mp->b_wptr)
			goto truncated;
		break;
	}
	default:
		break;
	}

	return (B_TRUE);

discard_pkt:
	/* Bogus ICMP error. */
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
	return (B_FALSE);

truncated:
	/* We pulled up everything already. Must be truncated */
	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
	return (B_FALSE);
}

/*
 * Process received IPv6 ICMP Packet too big.
* The caller is responsible for validating the packet before passing it in * and also to fanout the ICMP error to any matching transport conns. Assumes * the message has been fully pulled up. * * Before getting here, the caller has called icmp_inbound_verify_v6() * that should have verified with ULP to prevent undoing the changes we're * going to make to DCE. For example, TCP might have verified that the packet * which generated error is in the send window. * * In some cases modified this MTU in the ICMP header packet; the caller * should pass to the matching ULP after this returns. */ static void icmp_inbound_too_big_v6(icmp6_t *icmp6, ip_recv_attr_t *ira) { uint32_t mtu; dce_t *dce; ill_t *ill = ira->ira_ill; /* Upper ill if IPMP */ ip_stack_t *ipst = ill->ill_ipst; int old_max_frag; in6_addr_t final_dst; ip6_t *ip6h; /* Inner IP header */ /* Caller has already pulled up everything. */ ip6h = (ip6_t *)&icmp6[1]; final_dst = ip_get_dst_v6(ip6h, NULL, NULL); /* * For link local destinations matching simply on address is not * sufficient. Same link local addresses for different ILL's is * possible. */ if (IN6_IS_ADDR_LINKSCOPE(&final_dst)) { dce = dce_lookup_and_add_v6(&final_dst, ill->ill_phyint->phyint_ifindex, ipst); } else { dce = dce_lookup_and_add_v6(&final_dst, 0, ipst); } if (dce == NULL) { /* Couldn't add a unique one - ENOMEM */ if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg("icmp_inbound_too_big_v6:" "no dce for dst %s\n", AF_INET6, &final_dst); } return; } mtu = ntohl(icmp6->icmp6_mtu); mutex_enter(&dce->dce_lock); if (dce->dce_flags & DCEF_PMTU) old_max_frag = dce->dce_pmtu; else old_max_frag = ill->ill_mtu; if (mtu < IPV6_MIN_MTU) { ip1dbg(("Received mtu less than IPv6 " "min mtu %d: %d\n", IPV6_MIN_MTU, mtu)); mtu = IPV6_MIN_MTU; /* * If an mtu less than IPv6 min mtu is received, * we must include a fragment header in * subsequent packets. 
*/ dce->dce_flags |= DCEF_TOO_SMALL_PMTU; } else { dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU; } ip1dbg(("Received mtu from router: %d\n", mtu)); dce->dce_pmtu = MIN(old_max_frag, mtu); /* Prepare to send the new max frag size for the ULP. */ if (dce->dce_flags & DCEF_TOO_SMALL_PMTU) { /* * If we need a fragment header in every packet * (above case or multirouting), make sure the * ULP takes it into account when computing the * payload size. */ icmp6->icmp6_mtu = htonl(dce->dce_pmtu - sizeof (ip6_frag_t)); } else { icmp6->icmp6_mtu = htonl(dce->dce_pmtu); } /* We now have a PMTU for sure */ dce->dce_flags |= DCEF_PMTU; dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64()); mutex_exit(&dce->dce_lock); /* * After dropping the lock the new value is visible to everyone. * Then we bump the generation number so any cached values reinspect * the dce_t. */ dce_increment_generation(dce); dce_refrele(dce); } /* * Fanout received ICMPv6 error packets to the transports. * Assumes the IPv6 plus ICMPv6 headers have been pulled up but nothing else. * * The caller must have called icmp_inbound_verify_v6. */ void icmp_inbound_error_fanout_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira) { uint16_t *up; /* Pointer to ports in ULP header */ uint32_t ports; /* reversed ports for fanout */ ip6_t rip6h; /* With reversed addresses */ ip6_t *ip6h; /* Inner IP header */ uint16_t hdr_length; /* Inner IP header length */ uint8_t *nexthdrp; uint8_t nexthdr; tcpha_t *tcpha; conn_t *connp; ill_t *ill = ira->ira_ill; /* Upper in the case of IPMP */ ip_stack_t *ipst = ill->ill_ipst; ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; /* Caller has already pulled up everything. 
*/ ip6h = (ip6_t *)&icmp6[1]; ASSERT(mp->b_cont == NULL); ASSERT((uchar_t *)&ip6h[1] <= mp->b_wptr); if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp)) goto drop_pkt; nexthdr = *nexthdrp; ira->ira_protocol = nexthdr; /* * We need a separate IP header with the source and destination * addresses reversed to do fanout/classification because the ip6h in * the ICMPv6 error is in the form we sent it out. */ rip6h.ip6_src = ip6h->ip6_dst; rip6h.ip6_dst = ip6h->ip6_src; rip6h.ip6_nxt = nexthdr; /* Try to pass the ICMP message to clients who need it */ switch (nexthdr) { case IPPROTO_UDP: { /* Attempt to find a client stream based on port. */ up = (uint16_t *)((uchar_t *)ip6h + hdr_length); /* Note that we send error to all matches. */ ira->ira_flags |= IRAF_ICMP_ERROR; ip_fanout_udp_multi_v6(mp, &rip6h, up[0], up[1], ira); ira->ira_flags &= ~IRAF_ICMP_ERROR; return; } case IPPROTO_TCP: { /* * Attempt to find a client stream based on port. * Note that we do a reverse lookup since the header is * in the form we sent it out. */ tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length); /* * With IPMP we need to match across group, which we do * since we have the upper ill from ira_ill. 
*/ connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha, TCPS_LISTEN, ill->ill_phyint->phyint_ifindex, ipst); if (connp == NULL) { goto drop_pkt; } if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || (ira->ira_flags & IRAF_IPSEC_SECURE)) { mp = ipsec_check_inbound_policy(mp, connp, NULL, ip6h, ira); if (mp == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); /* Note that mp is NULL */ ip_drop_input("ipIfStatsInDiscards", mp, ill); CONN_DEC_REF(connp); return; } } ira->ira_flags |= IRAF_ICMP_ERROR; if (IPCL_IS_TCP(connp)) { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recvicmp, connp, ira, SQ_FILL, SQTAG_TCP6_INPUT_ICMP_ERR); } else { /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ ill_t *rill = ira->ira_rill; ira->ira_ill = ira->ira_rill = NULL; (connp->conn_recv)(connp, mp, NULL, ira); CONN_DEC_REF(connp); ira->ira_ill = ill; ira->ira_rill = rill; } ira->ira_flags &= ~IRAF_ICMP_ERROR; return; } case IPPROTO_SCTP: up = (uint16_t *)((uchar_t *)ip6h + hdr_length); /* Find a SCTP client stream for this packet. */ ((uint16_t *)&ports)[0] = up[1]; ((uint16_t *)&ports)[1] = up[0]; ira->ira_flags |= IRAF_ICMP_ERROR; ip_fanout_sctp(mp, NULL, &rip6h, ports, ira); ira->ira_flags &= ~IRAF_ICMP_ERROR; return; case IPPROTO_ESP: case IPPROTO_AH: if (!ipsec_loaded(ipss)) { ip_proto_not_sup(mp, ira); return; } if (nexthdr == IPPROTO_ESP) mp = ipsecesp_icmp_error(mp, ira); else mp = ipsecah_icmp_error(mp, ira); if (mp == NULL) return; /* Just in case ipsec didn't preserve the NULL b_cont */ if (mp->b_cont != NULL) { if (!pullupmsg(mp, -1)) goto drop_pkt; } /* * If succesful, the mp has been modified to not include * the ESP/AH header so we can fanout to the ULP's icmp * error handler. */ if (mp->b_wptr - mp->b_rptr < IPV6_HDR_LEN) goto drop_pkt; ip6h = (ip6_t *)mp->b_rptr; /* Don't call hdr_length_v6() unless you have to. 
*/ if (ip6h->ip6_nxt != IPPROTO_ICMPV6) hdr_length = ip_hdr_length_v6(mp, ip6h); else hdr_length = IPV6_HDR_LEN; /* Verify the modified message before any further processes. */ icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); if (!icmp_inbound_verify_v6(mp, icmp6, ira)) { freemsg(mp); return; } icmp_inbound_error_fanout_v6(mp, icmp6, ira); return; case IPPROTO_IPV6: { /* Look for self-encapsulated packets that caused an error */ ip6_t *in_ip6h; in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length); if (IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_src, &ip6h->ip6_src) && IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_dst, &ip6h->ip6_dst)) { /* * Self-encapsulated case. As in the ipv4 case, * we need to strip the 2nd IP header. Since mp * is already pulled-up, we can simply bcopy * the 3rd header + data over the 2nd header. */ uint16_t unused_len; /* * Make sure we don't do recursion more than once. */ if (!ip_hdr_length_nexthdr_v6(mp, in_ip6h, &unused_len, &nexthdrp) || *nexthdrp == IPPROTO_IPV6) { goto drop_pkt; } /* * Copy the 3rd header + remaining data on top * of the 2nd header. */ bcopy(in_ip6h, ip6h, mp->b_wptr - (uchar_t *)in_ip6h); /* * Subtract length of the 2nd header. */ mp->b_wptr -= hdr_length; ip6h = (ip6_t *)mp->b_rptr; /* Don't call hdr_length_v6() unless you have to. */ if (ip6h->ip6_nxt != IPPROTO_ICMPV6) hdr_length = ip_hdr_length_v6(mp, ip6h); else hdr_length = IPV6_HDR_LEN; /* * Verify the modified message before any further * processes. */ icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); if (!icmp_inbound_verify_v6(mp, icmp6, ira)) { freemsg(mp); return; } /* * Now recurse, and see what I _really_ should be * doing here. 
*/ icmp_inbound_error_fanout_v6(mp, icmp6, ira); return; } /* FALLTHRU */ } case IPPROTO_ENCAP: if ((connp = ipcl_iptun_classify_v6(&rip6h.ip6_src, &rip6h.ip6_dst, ipst)) != NULL) { ira->ira_flags |= IRAF_ICMP_ERROR; connp->conn_recvicmp(connp, mp, NULL, ira); CONN_DEC_REF(connp); ira->ira_flags &= ~IRAF_ICMP_ERROR; return; } /* * No IP tunnel is interested, fallthrough and see * if a raw socket will want it. */ /* FALLTHRU */ default: ira->ira_flags |= IRAF_ICMP_ERROR; ASSERT(ira->ira_protocol == nexthdr); ip_fanout_proto_v6(mp, &rip6h, ira); ira->ira_flags &= ~IRAF_ICMP_ERROR; return; } /* NOTREACHED */ drop_pkt: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); ip1dbg(("icmp_inbound_error_fanout_v6: drop pkt\n")); freemsg(mp); } /* * Process received IPv6 ICMP Redirect messages. * Assumes the caller has verified that the headers are in the pulled up mblk. * Consumes mp. */ /* ARGSUSED */ static void icmp_redirect_v6(mblk_t *mp, ip6_t *ip6h, nd_redirect_t *rd, ip_recv_attr_t *ira) { ire_t *ire, *nire; ire_t *prev_ire = NULL; ire_t *redir_ire; in6_addr_t *src, *dst, *gateway; nd_opt_hdr_t *opt; nce_t *nce; int ncec_flags = 0; int err = 0; boolean_t redirect_to_router = B_FALSE; int len; int optlen; ill_t *ill = ira->ira_rill; ill_t *rill = ira->ira_rill; ip_stack_t *ipst = ill->ill_ipst; /* * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill * and make it be the IPMP upper so avoid being confused by a packet * addressed to a unicast address on a different ill. 
*/ if (IS_UNDER_IPMP(rill)) { rill = ipmp_ill_hold_ipmp_ill(rill); if (rill == NULL) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); ip_drop_input("ipv6IfIcmpInBadRedirects - IPMP ill", mp, ill); freemsg(mp); return; } ASSERT(rill != ira->ira_rill); } len = mp->b_wptr - (uchar_t *)rd; src = &ip6h->ip6_src; dst = &rd->nd_rd_dst; gateway = &rd->nd_rd_target; /* Verify if it is a valid redirect */ if (!IN6_IS_ADDR_LINKLOCAL(src) || (ip6h->ip6_hops != IPV6_MAX_HOPS) || (rd->nd_rd_code != 0) || (len < sizeof (nd_redirect_t)) || (IN6_IS_ADDR_V4MAPPED(dst)) || (IN6_IS_ADDR_MULTICAST(dst))) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); ip_drop_input("ipv6IfIcmpInBadRedirects - addr/len", mp, ill); goto fail_redirect; } if (!(IN6_IS_ADDR_LINKLOCAL(gateway) || IN6_ARE_ADDR_EQUAL(gateway, dst))) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); ip_drop_input("ipv6IfIcmpInBadRedirects - bad gateway", mp, ill); goto fail_redirect; } optlen = len - sizeof (nd_redirect_t); if (optlen != 0) { if (!ndp_verify_optlen((nd_opt_hdr_t *)&rd[1], optlen)) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); ip_drop_input("ipv6IfIcmpInBadRedirects - options", mp, ill); goto fail_redirect; } } if (!IN6_ARE_ADDR_EQUAL(gateway, dst)) { redirect_to_router = B_TRUE; ncec_flags |= NCE_F_ISROUTER; } else { gateway = dst; /* Add nce for dst */ } /* * Verify that the IP source address of the redirect is * the same as the current first-hop router for the specified * ICMP destination address. * Also, Make sure we had a route for the dest in question and * that route was pointing to the old gateway (the source of the * redirect packet.) * We do longest match and then compare ire_gateway_addr_v6 below. 
*/ prev_ire = ire_ftable_lookup_v6(dst, 0, 0, 0, rill, ALL_ZONES, NULL, MATCH_IRE_ILL, 0, ipst, NULL); /* * Check that * the redirect was not from ourselves * old gateway is still directly reachable */ if (prev_ire == NULL || (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) || (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || !IN6_ARE_ADDR_EQUAL(src, &prev_ire->ire_gateway_addr_v6)) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); ip_drop_input("ipv6IfIcmpInBadRedirects - ire", mp, ill); goto fail_redirect; } ASSERT(prev_ire->ire_ill != NULL); if (prev_ire->ire_ill->ill_flags & ILLF_NONUD) ncec_flags |= NCE_F_NONUD; opt = (nd_opt_hdr_t *)&rd[1]; opt = ndp_get_option(opt, optlen, ND_OPT_TARGET_LINKADDR); if (opt != NULL) { err = nce_lookup_then_add_v6(rill, (uchar_t *)&opt[1], /* Link layer address */ rill->ill_phys_addr_length, gateway, ncec_flags, ND_STALE, &nce); switch (err) { case 0: nce_refrele(nce); break; case EEXIST: /* * Check to see if link layer address has changed and * process the ncec_state accordingly. */ nce_process(nce->nce_common, (uchar_t *)&opt[1], 0, B_FALSE); nce_refrele(nce); break; default: ip1dbg(("icmp_redirect_v6: NCE create failed %d\n", err)); goto fail_redirect; } } if (redirect_to_router) { ASSERT(IN6_IS_ADDR_LINKLOCAL(gateway)); /* * Create a Route Association. This will allow us to remember * a router told us to use the particular gateway. */ ire = ire_create_v6( dst, &ipv6_all_ones, /* mask */ gateway, /* gateway addr */ IRE_HOST, prev_ire->ire_ill, ALL_ZONES, (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), NULL, ipst); } else { ipif_t *ipif; in6_addr_t gw; /* * Just create an on link entry, i.e. interface route. * The gateway field is our link-local on the ill. */ mutex_enter(&rill->ill_lock); for (ipif = rill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (!(ipif->ipif_state_flags & IPIF_CONDEMNED) && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) break; } if (ipif == NULL) { /* We have no link-local address! 
*/ mutex_exit(&rill->ill_lock); goto fail_redirect; } gw = ipif->ipif_v6lcl_addr; mutex_exit(&rill->ill_lock); ire = ire_create_v6( dst, /* gateway == dst */ &ipv6_all_ones, /* mask */ &gw, /* gateway addr */ rill->ill_net_type, /* IF_[NO]RESOLVER */ prev_ire->ire_ill, ALL_ZONES, (RTF_DYNAMIC | RTF_HOST), NULL, ipst); } if (ire == NULL) goto fail_redirect; nire = ire_add(ire); /* Check if it was a duplicate entry */ if (nire != NULL && nire != ire) { ASSERT(nire->ire_identical_ref > 1); ire_delete(nire); ire_refrele(nire); nire = NULL; } ire = nire; if (ire != NULL) { ire_refrele(ire); /* Held in ire_add */ /* tell routing sockets that we received a redirect */ ip_rts_change_v6(RTM_REDIRECT, &rd->nd_rd_dst, &rd->nd_rd_target, &ipv6_all_ones, 0, src, (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0, (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst); /* * Delete any existing IRE_HOST type ires for this destination. * This together with the added IRE has the effect of * modifying an existing redirect. */ redir_ire = ire_ftable_lookup_v6(dst, 0, src, IRE_HOST, prev_ire->ire_ill, ALL_ZONES, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL); if (redir_ire != NULL) { if (redir_ire->ire_flags & RTF_DYNAMIC) ire_delete(redir_ire); ire_refrele(redir_ire); } } ire_refrele(prev_ire); prev_ire = NULL; fail_redirect: if (prev_ire != NULL) ire_refrele(prev_ire); freemsg(mp); if (rill != ira->ira_rill) ill_refrele(rill); } /* * Build and ship an IPv6 ICMP message using the packet data in mp, * and the ICMP header pointed to by "stuff". (May be called as * writer.) * Note: assumes that icmp_pkt_err_ok_v6 has been called to * verify that an icmp error packet can be sent. * * If v6src_ptr is set use it as a source. Otherwise select a reasonable * source address (see above function). 
*/
static void
icmp_pkt_v6(mblk_t *mp, void *stuff, size_t len,
    const in6_addr_t *v6src_ptr, ip_recv_attr_t *ira)
{
	ip6_t		*ip6h;
	in6_addr_t	v6dst;
	size_t		len_needed;
	size_t		msg_len;
	mblk_t		*mp1;
	icmp6_t		*icmp6;
	in6_addr_t	v6src;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	ip_xmit_attr_t	ixas;

	ip6h = (ip6_t *)mp->b_rptr;

	/*
	 * Build a private, stack-resident transmit attribute structure.
	 * Everything not set explicitly below starts out as zero/NULL.
	 */
	bzero(&ixas, sizeof (ixas));
	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
	ixas.ixa_zoneid = ira->ira_zoneid;
	ixas.ixa_ifindex = 0;
	ixas.ixa_ipst = ipst;
	ixas.ixa_cred = kcred;
	ixas.ixa_cpid = NOPID;
	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;

	/*
	 * If the source of the original packet was link-local, then
	 * make sure we send on the same ill (group) as we received it on.
	 */
	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
		ixas.ixa_flags |= IXAF_SCOPEID_SET;
		if (IS_UNDER_IPMP(ill))
			ixas.ixa_scopeid = ill_get_upper_ifindex(ill);
		else
			ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex;
	}

	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
		/*
		 * Apply IPsec based on how IPsec was applied to
		 * the packet that had the error.
		 *
		 * If it was an outbound packet that caused the ICMP
		 * error, then the caller will have setup the IRA
		 * appropriately.
		 */
		if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			/* Note: mp already consumed and ip_drop_packet done */
			return;
		}
	} else {
		/*
		 * This is in clear. The icmp message we are building
		 * here should go out in clear, independent of our policy.
		 */
		ixas.ixa_flags |= IXAF_NO_IPSEC;
	}

	/*
	 * If the caller specified the source we use that.
	 * Otherwise, if the packet was for one of our unicast addresses, make
	 * sure we respond with that as the source. Otherwise
	 * have ip_output_simple pick the source address.
	 */
	if (v6src_ptr != NULL) {
		v6src = *v6src_ptr;
	} else {
		ire_t		*ire;
		uint_t		match_flags = MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY;

		if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src) ||
		    IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst))
			match_flags |= MATCH_IRE_ILL;

		ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0,
		    (IRE_LOCAL|IRE_LOOPBACK), ill, ira->ira_zoneid, NULL,
		    match_flags, 0, ipst, NULL);
		if (ire != NULL) {
			v6src = ip6h->ip6_dst;
			ire_refrele(ire);
		} else {
			v6src = ipv6_all_zeros;
			ixas.ixa_flags |= IXAF_SET_SOURCE;
		}
	}
	v6dst = ip6h->ip6_src;

	/* Trim the offending packet so the reply fits ips_ipv6_icmp_return. */
	len_needed = ipst->ips_ipv6_icmp_return - IPV6_HDR_LEN - len;
	msg_len = msgdsize(mp);
	if (msg_len > len_needed) {
		if (!adjmsg(mp, len_needed - msg_len)) {
			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
			freemsg(mp);
			/*
			 * ipsec_in_to_out() may have stored references in
			 * ixas; release them just as the normal exit does.
			 */
			ixa_cleanup(&ixas);
			return;
		}
		msg_len = len_needed;
	}
	mp1 = allocb(IPV6_HDR_LEN + len, BPRI_MED);
	if (mp1 == NULL) {
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
		freemsg(mp);
		/* Same as above: do not leave anything held in ixas. */
		ixa_cleanup(&ixas);
		return;
	}
	/* The new header block goes in front of the (trimmed) original. */
	mp1->b_cont = mp;
	mp = mp1;

	/*
	 * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this
	 * node generates be accepted in peace by all on-host destinations.
	 * If we do NOT assume that all on-host destinations trust
	 * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
	 * (Look for IXAF_TRUSTED_ICMP).
	 */
	ixas.ixa_flags |= IXAF_TRUSTED_ICMP;

	ip6h = (ip6_t *)mp->b_rptr;
	mp1->b_wptr = (uchar_t *)ip6h + (IPV6_HDR_LEN + len);

	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
	ip6h->ip6_nxt = IPPROTO_ICMPV6;
	ip6h->ip6_hops = ipst->ips_ipv6_def_hops;
	ip6h->ip6_dst = v6dst;
	ip6h->ip6_src = v6src;
	msg_len += IPV6_HDR_LEN + len;
	if (msg_len > IP_MAXPACKET + IPV6_HDR_LEN) {
		(void) adjmsg(mp, IP_MAXPACKET + IPV6_HDR_LEN - msg_len);
		msg_len = IP_MAXPACKET + IPV6_HDR_LEN;
	}
	ip6h->ip6_plen = htons((uint16_t)(msgdsize(mp) - IPV6_HDR_LEN));
	icmp6 = (icmp6_t *)&ip6h[1];
	bcopy(stuff, (char *)icmp6, len);
	/*
	 * Prepare for checksum by putting icmp length in the icmp
	 * checksum field. The checksum is calculated in ip_output_wire_v6.
	 */
	icmp6->icmp6_cksum = ip6h->ip6_plen;
	if (icmp6->icmp6_type == ND_REDIRECT) {
		/* Redirects are sent with the maximum hop limit. */
		ip6h->ip6_hops = IPV6_MAX_HOPS;
	}

	(void) ip_output_simple(mp, &ixas);
	ixa_cleanup(&ixas);
}

/*
 * Update the output mib when ICMPv6 packets are sent.
 *
 * ipv6IfIcmpOutMsgs is bumped for every message; the per-type counter is
 * bumped in addition when icmp6->icmp6_type is one of the types listed
 * below.  Other types only count towards ipv6IfIcmpOutMsgs.
 */
void
icmp_update_out_mib_v6(ill_t *ill, icmp6_t *icmp6)
{
	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutMsgs);

	switch (icmp6->icmp6_type) {
	case ICMP6_DST_UNREACH:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutDestUnreachs);
		if (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN)
			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutAdminProhibs);
		break;
	case ICMP6_TIME_EXCEEDED:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutTimeExcds);
		break;
	case ICMP6_PARAM_PROB:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutParmProblems);
		break;
	case ICMP6_PACKET_TOO_BIG:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutPktTooBigs);
		break;
	case ICMP6_ECHO_REQUEST:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutEchos);
		break;
	case ICMP6_ECHO_REPLY:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutEchoReplies);
		break;
	case ND_ROUTER_SOLICIT:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutRouterSolicits);
		break;
	case ND_ROUTER_ADVERT:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutRouterAdvertisements);
		break;
	case ND_NEIGHBOR_SOLICIT:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutNeighborSolicits);
		break;
	case ND_NEIGHBOR_ADVERT:
		BUMP_MIB(ill->ill_icmp6_mib,
		    ipv6IfIcmpOutNeighborAdvertisements);
		break;
	case ND_REDIRECT:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutRedirects);
		break;
	case MLD_LISTENER_QUERY:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutGroupMembQueries);
		break;
	case MLD_LISTENER_REPORT:
	case MLD_V2_LISTENER_REPORT:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutGroupMembResponses);
		break;
	case MLD_LISTENER_REDUCTION:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutGroupMembReductions);
		break;
	}
}

/*
 * Check if it is ok to send an ICMPv6 error packet in
 * response to the IP packet in mp.
 * Free the message and return null if no
 * ICMP error packet should be sent.
*/
static mblk_t *
icmp_pkt_err_ok_v6(mblk_t *mp, boolean_t mcast_ok, ip_recv_attr_t *ira)
{
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	ip6_t		*ip6h;
	boolean_t	l2_multi;

	if (mp == NULL)
		return (NULL);

	ip6h = (ip6_t *)mp->b_rptr;

	/* Link-layer multicast and broadcast are treated alike. */
	l2_multi = (ira->ira_flags &
	    (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) != 0;

	/* The source address must uniquely identify the sending host. */
	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src))
		goto drop;
	if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_src))
		goto drop;
	if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src))
		goto drop;

	if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
		size_t		len_needed = IPV6_HDR_LEN + ICMP6_MINLEN;
		icmp6_t		*icmp6;

		/* Make the ICMPv6 header contiguous before looking at it. */
		if (mp->b_wptr - mp->b_rptr < len_needed) {
			if (!pullupmsg(mp, len_needed)) {
				BUMP_MIB(ill->ill_icmp6_mib,
				    ipv6IfIcmpInErrors);
				goto drop;
			}
			ip6h = (ip6_t *)mp->b_rptr;
		}
		icmp6 = (icmp6_t *)&ip6h[1];

		/* Never answer an ICMPv6 error with another error ... */
		if (ICMP6_IS_ERROR(icmp6->icmp6_type))
			goto drop;
		/* ... and explicitly not a redirect either. */
		if (icmp6->icmp6_type == ND_REDIRECT)
			goto drop;
	}

	/*
	 * Unless mcast_ok is set (the Packet Too Big exception), refuse
	 * packets that were addressed to a multicast destination or that
	 * arrived as link-layer broadcast/multicast.
	 */
	if (!mcast_ok) {
		if (l2_multi || IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
			goto drop;
	}

	/*
	 * On a labeled system, only reply when we are allowed to
	 * send a response to this particular sender.
	 */
	if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
		goto drop;
	}

	/*
	 * Only send ICMP error packets every so often.
	 * This should be done on a per port/source basis,
	 * but for now this will suffice.
	 */
	if (icmp_err_rate_limit(ipst))
		goto drop;

	return (mp);

drop:
	freemsg(mp);
	return (NULL);
}

/*
 * Called when a packet was sent out the same link that it arrived on.
* Check if it is ok to send a redirect and then send it.
 */
void
ip_send_potential_redirect_v6(mblk_t *mp, ip6_t *ip6h, ire_t *ire,
    ip_recv_attr_t *ira)
{
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	ire_t		*nexthop = NULL;
	ire_t		*onlink_src = NULL;
	in6_addr_t	*target;
	mblk_t		*copy;

	/* No redirects while forwarding a source routed packet. */
	if (ip_source_routed_v6(ip6h, mp, ipst))
		return;

	if (!(ire->ire_type & IRE_ONLINK)) {
		/* Find the most specific IRE used to send the packets. */
		nexthop = ire_nexthop(ire);
		if (nexthop == NULL)
			return;

		/*
		 * A router without a link-local address is still used
		 * for forwarding, but is never advertised in a redirect.
		 */
		if (!IN6_IS_ADDR_LINKLOCAL(&nexthop->ire_addr_v6)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
			ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
			goto done;
		}
		target = &nexthop->ire_addr_v6;
	} else {
		/* The destination itself is directly connected. */
		target = &ip6h->ip6_dst;
	}

	/* Only redirect senders that have an interface route on this ill. */
	onlink_src = ire_ftable_lookup_v6(&ip6h->ip6_src,
	    NULL, NULL, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
	    MATCH_IRE_ILL | MATCH_IRE_TYPE, 0, ipst, NULL);
	if (onlink_src == NULL)
		goto done;

	/* The redirect is built from a copy; mp stays with the caller. */
	copy = copymsg(mp);
	if (copy != NULL)
		icmp_send_redirect_v6(copy, target, &ip6h->ip6_dst, ira);

done:
	if (nexthop != NULL)
		ire_refrele(nexthop);
	if (onlink_src != NULL)
		ire_refrele(onlink_src);
}

/*
 * Generate an ICMPv6 redirect message.
 * Include target link layer address option if it exists.
 * Always include redirect header.
*/ static void icmp_send_redirect_v6(mblk_t *mp, in6_addr_t *targetp, in6_addr_t *dest, ip_recv_attr_t *ira) { nd_redirect_t *rd; nd_opt_rd_hdr_t *rdh; uchar_t *buf; ncec_t *ncec = NULL; nd_opt_hdr_t *opt; int len; int ll_opt_len = 0; int max_redir_hdr_data_len; int pkt_len; in6_addr_t *srcp; ill_t *ill; boolean_t need_refrele; ip_stack_t *ipst = ira->ira_ill->ill_ipst; mp = icmp_pkt_err_ok_v6(mp, B_FALSE, ira); if (mp == NULL) return; if (IS_UNDER_IPMP(ira->ira_ill)) { ill = ipmp_ill_hold_ipmp_ill(ira->ira_ill); if (ill == NULL) { ill = ira->ira_ill; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); ip_drop_output("no IPMP ill for sending redirect", mp, ill); freemsg(mp); return; } need_refrele = B_TRUE; } else { ill = ira->ira_ill; need_refrele = B_FALSE; } ncec = ncec_lookup_illgrp_v6(ill, targetp); if (ncec != NULL && ncec->ncec_state != ND_INCOMPLETE && ncec->ncec_lladdr != NULL) { ll_opt_len = (sizeof (nd_opt_hdr_t) + ill->ill_phys_addr_length + 7)/8 * 8; } len = sizeof (nd_redirect_t) + sizeof (nd_opt_rd_hdr_t) + ll_opt_len; ASSERT(len % 4 == 0); buf = kmem_alloc(len, KM_NOSLEEP); if (buf == NULL) { if (ncec != NULL) ncec_refrele(ncec); if (need_refrele) ill_refrele(ill); freemsg(mp); return; } rd = (nd_redirect_t *)buf; rd->nd_rd_type = (uint8_t)ND_REDIRECT; rd->nd_rd_code = 0; rd->nd_rd_reserved = 0; rd->nd_rd_target = *targetp; rd->nd_rd_dst = *dest; opt = (nd_opt_hdr_t *)(buf + sizeof (nd_redirect_t)); if (ncec != NULL && ll_opt_len != 0) { opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; opt->nd_opt_len = ll_opt_len/8; bcopy((char *)ncec->ncec_lladdr, &opt[1], ill->ill_phys_addr_length); } if (ncec != NULL) ncec_refrele(ncec); rdh = (nd_opt_rd_hdr_t *)(buf + sizeof (nd_redirect_t) + ll_opt_len); rdh->nd_opt_rh_type = (uint8_t)ND_OPT_REDIRECTED_HEADER; /* max_redir_hdr_data_len and nd_opt_rh_len must be multiple of 8 */ max_redir_hdr_data_len = (ipst->ips_ipv6_icmp_return - IPV6_HDR_LEN - len)/8*8; pkt_len = msgdsize(mp); /* Make sure mp is 8 byte 
aligned */ if (pkt_len > max_redir_hdr_data_len) { rdh->nd_opt_rh_len = (max_redir_hdr_data_len + sizeof (nd_opt_rd_hdr_t))/8; (void) adjmsg(mp, max_redir_hdr_data_len - pkt_len); } else { rdh->nd_opt_rh_len = (pkt_len + sizeof (nd_opt_rd_hdr_t))/8; (void) adjmsg(mp, -(pkt_len % 8)); } rdh->nd_opt_rh_reserved1 = 0; rdh->nd_opt_rh_reserved2 = 0; /* ipif_v6lcl_addr contains the link-local source address */ srcp = &ill->ill_ipif->ipif_v6lcl_addr; /* Redirects sent by router, and router is global zone */ ASSERT(ira->ira_zoneid == ALL_ZONES); ira->ira_zoneid = GLOBAL_ZONEID; icmp_pkt_v6(mp, buf, len, srcp, ira); kmem_free(buf, len); if (need_refrele) ill_refrele(ill); } /* Generate an ICMP time exceeded message. (May be called as writer.) */ void icmp_time_exceeded_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok, ip_recv_attr_t *ira) { icmp6_t icmp6; mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira); if (mp == NULL) return; bzero(&icmp6, sizeof (icmp6_t)); icmp6.icmp6_type = ICMP6_TIME_EXCEEDED; icmp6.icmp6_code = code; icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira); } /* * Generate an ICMP unreachable message. * When called from ip_output side a minimal ip_recv_attr_t needs to be * constructed by the caller. */ void icmp_unreachable_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok, ip_recv_attr_t *ira) { icmp6_t icmp6; mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira); if (mp == NULL) return; bzero(&icmp6, sizeof (icmp6_t)); icmp6.icmp6_type = ICMP6_DST_UNREACH; icmp6.icmp6_code = code; icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira); } /* * Generate an ICMP pkt too big message. * When called from ip_output side a minimal ip_recv_attr_t needs to be * constructed by the caller. 
*/ void icmp_pkt2big_v6(mblk_t *mp, uint32_t mtu, boolean_t mcast_ok, ip_recv_attr_t *ira) { icmp6_t icmp6; mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira); if (mp == NULL) return; bzero(&icmp6, sizeof (icmp6_t)); icmp6.icmp6_type = ICMP6_PACKET_TOO_BIG; icmp6.icmp6_code = 0; icmp6.icmp6_mtu = htonl(mtu); icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira); } /* * Generate an ICMP parameter problem message. (May be called as writer.) * 'offset' is the offset from the beginning of the packet in error. * When called from ip_output side a minimal ip_recv_attr_t needs to be * constructed by the caller. */ static void icmp_param_problem_v6(mblk_t *mp, uint8_t code, uint32_t offset, boolean_t mcast_ok, ip_recv_attr_t *ira) { icmp6_t icmp6; mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira); if (mp == NULL) return; bzero((char *)&icmp6, sizeof (icmp6_t)); icmp6.icmp6_type = ICMP6_PARAM_PROB; icmp6.icmp6_code = code; icmp6.icmp6_pptr = htonl(offset); icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira); } void icmp_param_problem_nexthdr_v6(mblk_t *mp, boolean_t mcast_ok, ip_recv_attr_t *ira) { ip6_t *ip6h = (ip6_t *)mp->b_rptr; uint16_t hdr_length; uint8_t *nexthdrp; uint32_t offset; ill_t *ill = ira->ira_ill; /* Determine the offset of the bad nexthdr value */ if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp)) { /* Malformed packet */ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); ip_drop_input("ipIfStatsInDiscards", mp, ill); freemsg(mp); return; } offset = nexthdrp - mp->b_rptr; icmp_param_problem_v6(mp, ICMP6_PARAMPROB_NEXTHEADER, offset, mcast_ok, ira); } /* * Verify whether or not the IP address is a valid local address. * Could be a unicast, including one for a down interface. * If allow_mcbc then a multicast or broadcast address is also * acceptable. 
*
 * In the case of a multicast address, however, the
 * upper protocol is expected to reset the src address
 * to zero when we return IPVL_MCAST so that
 * no packets are emitted with multicast address as
 * source address.
 * The addresses valid for bind are:
 *	(1) - in6addr_any
 *	(2) - IP address of an UP interface
 *	(3) - IP address of a DOWN interface
 *	(4) - a multicast address. In this case
 *	the conn will only receive packets destined to
 *	the specified multicast address. Note: the
 *	application still has to issue an
 *	IPV6_JOIN_GROUP socket option.
 *
 * In all the above cases, the bound address must be valid in the current zone.
 * When the address is loopback or multicast, there might be many matching IREs
 * so bind has to look up based on the zone.
 */
ip_laddr_t
ip_laddr_verify_v6(const in6_addr_t *v6src, zoneid_t zoneid,
    ip_stack_t *ipst, boolean_t allow_mcbc, uint_t scopeid)
{
	uint_t		match_flags = MATCH_IRE_ZONEONLY;
	ill_t		*scope_ill = NULL;
	ire_t		*src_ire;
	ipif_t		*ipif;
	boolean_t	local_up = B_FALSE;
	boolean_t	unusable;

	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6src));
	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(v6src));

	/* A non-zero scopeid restricts the lookup to that interface. */
	if (scopeid != 0) {
		scope_ill = ill_lookup_on_ifindex(scopeid, B_TRUE, ipst);
		if (scope_ill == NULL)
			return (IPVL_BAD);
		match_flags |= MATCH_IRE_ILL;
	}

	src_ire = ire_ftable_lookup_v6(v6src, NULL, NULL, 0,
	    scope_ill, zoneid, NULL, match_flags, 0, ipst, NULL);
	if (scope_ill != NULL)
		ill_refrele(scope_ill);

	/* Classify the route we found, then drop it; only the type matters. */
	if (src_ire != NULL) {
		local_up = (src_ire->ire_type &
		    (IRE_LOCAL|IRE_LOOPBACK)) != 0;
		ire_refrele(src_ire);
	}

	/* (2) Bind to address of local UP interface */
	if (local_up)
		return (IPVL_UNICAST_UP);

	/*
	 * (4) bind to multicast address.
	 * Note: caller should take IPV6_MULTICAST_IF
	 * into account when selecting a real source address.
	 */
	if (IN6_IS_ADDR_MULTICAST(v6src))
		return (allow_mcbc ? IPVL_MCAST : IPVL_BAD);

	/*
	 * (3) Bind to address of local DOWN interface?
	 * (ipif_lookup_addr() looks up all interfaces
	 * but we do not get here for UP interfaces
	 * - case (2) above)
	 */
	ipif = ipif_lookup_addr_v6(v6src, NULL, zoneid, ipst);
	if (ipif == NULL)
		return (IPVL_BAD);

	/* NOLOCAL and ANYCAST addresses are not useful as a source. */
	unusable = (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) != 0;
	ipif_refrele(ipif);

	return (unusable ? IPVL_BAD : IPVL_UNICAST_DOWN);
}

/*
 * Verify that both the source and destination addresses are valid. If
 * IPDF_VERIFY_DST is not set, then the destination address may be unreachable,
 * i.e. have no route to it.  Protocols like TCP want to verify destination
 * reachability, while tunnels do not.
 *
 * Determine the route, the interface, and (optionally) the source address
 * to use to reach a given destination.
 * Note that we allow connect to broadcast and multicast addresses when
 * IPDF_ALLOW_MCBC is set.
 * first_hop and dst_addr are normally the same, but if source routing
 * they will differ; in that case the first_hop is what we'll use for the
 * routing lookup but the dce and label checks will be done on dst_addr.
 *
 * If uinfo is set, then we fill in the best available information
 * we have for the destination. This is based on (in priority order) any
 * metrics and path MTU stored in a dce_t, route metrics, and finally the
 * ill_mtu.
 *
 * Tsol note: If we have a source route then dst_addr != firsthop. But we
 * always do the label check on dst_addr.
 *
 * Assumes that the caller has set ixa_scopeid for link-local communication.
*/
int
ip_set_destination_v6(in6_addr_t *src_addrp, const in6_addr_t *dst_addr,
    const in6_addr_t *firsthop, ip_xmit_attr_t *ixa, iulp_t *uinfo,
    uint32_t flags, uint_t mac_mode)
{
	/*
	 * Returns 0 on success, otherwise an errno:
	 *  - whatever tsol_check_dest(), ip_select_route_v6() or
	 *    ip_select_source_v6() report,
	 *  - ENETUNREACH/EHOSTUNREACH for a reject/blackhole route when
	 *    IPDF_VERIFY_DST is set, or ENETUNREACH for a broadcast/multicast
	 *    route without IPDF_ALLOW_MCBC (in these cases ixa is still
	 *    filled in before returning),
	 *  - EADDRNOTAVAIL when a loopback source does not fit the route.
	 * *src_addrp is only written when IPDF_SELECT_SRC is set.
	 */
	ire_t		*ire;
	int		error = 0;
	in6_addr_t	setsrc;				/* RTF_SETSRC */
	zoneid_t	zoneid = ixa->ixa_zoneid;	/* Honors SO_ALLZONES */
	ip_stack_t	*ipst = ixa->ixa_ipst;
	dce_t		*dce;
	uint_t		pmtu;
	uint_t		ifindex;
	uint_t		generation;
	nce_t		*nce;
	ill_t		*ill = NULL;
	boolean_t	multirt = B_FALSE;

	ASSERT(!IN6_IS_ADDR_V4MAPPED(dst_addr));

	ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));

	/*
	 * We never send to zero; the ULPs map it to the loopback address.
	 * We can't allow it since we use zero to mean uninitialized in some
	 * places.
	 */
	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(dst_addr));

	if (is_system_labeled()) {
		ts_label_t *tsl = NULL;

		error = tsol_check_dest(ixa->ixa_tsl, dst_addr, IPV6_VERSION,
		    mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl);
		if (error != 0)
			return (error);
		if (tsl != NULL) {
			/* Update the label */
			ip_xmit_attr_replace_tsl(ixa, tsl);
		}
	}

	setsrc = ipv6_all_zeros;
	/*
	 * Select a route; For IPMP interfaces, we would only select
	 * a "hidden" route (i.e., going through a specific under_ill)
	 * if ixa_ifindex has been specified.
	 */
	ire = ip_select_route_v6(firsthop, *src_addrp, ixa, &generation,
	    &setsrc, &error, &multirt);
	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
	if (error != 0)
		goto bad_addr;

	/*
	 * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set.
	 * If IPDF_VERIFY_DST is set, the destination must be reachable.
	 * Otherwise the destination needn't be reachable.
	 *
	 * If we match on a reject or black hole, then we've got a
	 * local failure.  May as well fail out the connect() attempt,
	 * since it's never going to succeed.
	 */
	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
		/*
		 * If we're verifying destination reachability, we always want
		 * to complain here.
		 *
		 * If we're not verifying destination reachability but the
		 * destination has a route, we still want to fail on the
		 * temporary address and broadcast address tests.
		 *
		 * In both cases we let the code continue so some reasonable
		 * information is returned to the caller. That enables the
		 * caller to use (and even cache) the IRE. conn_ip_output will
		 * use the generation mismatch path to check for the unreachable
		 * case thereby avoiding any specific check in the main path.
		 */
		ASSERT(generation == IRE_GENERATION_VERIFY);
		if (flags & IPDF_VERIFY_DST) {
			/*
			 * Set errno but continue to set up ixa_ire to be
			 * the RTF_REJECT|RTF_BLACKHOLE IRE.
			 * That allows callers to use ip_output to get an
			 * ICMP error back.
			 */
			if (!(ire->ire_type & IRE_HOST))
				error = ENETUNREACH;
			else
				error = EHOSTUNREACH;
		}
	}

	/*
	 * Broadcast/multicast route without permission: swap in the reject
	 * IRE and remember the error, but keep going so ixa is filled in.
	 */
	if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) &&
	    !(flags & IPDF_ALLOW_MCBC)) {
		ire_refrele(ire);
		ire = ire_reject(ipst, B_FALSE);
		generation = IRE_GENERATION_VERIFY;
		error = ENETUNREACH;
	}

	/*
	 * Cache things.  From here on the ire is stored in ixa->ixa_ire,
	 * which is why later error paths set ire to NULL before jumping
	 * to bad_addr.
	 */
	if (ixa->ixa_ire != NULL)
		ire_refrele_notr(ixa->ixa_ire);
#ifdef DEBUG
	ire_refhold_notr(ire);
	ire_refrele(ire);
#endif
	ixa->ixa_ire = ire;
	ixa->ixa_ire_generation = generation;

	/*
	 * For multicast with multirt we have a flag passed back from
	 * ire_lookup_multi_ill_v6 since we don't have an IRE for each
	 * possible multicast address.
	 * We also need a flag for multicast since we can't check
	 * whether RTF_MULTIRT is set in ixa_ire for multicast.
	 */
	if (multirt) {
		ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
	} else {
		ixa->ixa_postfragfn = ire->ire_postfragfn;
		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
	}
	if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
		/* Get an nce to cache. */
		nce = ire_to_nce(ire, NULL, firsthop);
		if (nce == NULL) {
			/* Allocation failure? */
			ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
		} else {
			if (ixa->ixa_nce != NULL)
				nce_refrele(ixa->ixa_nce);
			ixa->ixa_nce = nce;
		}
	}

	/*
	 * If the source address is a loopback address, the
	 * destination had best be local or multicast.
	 * If we are sending to an IRE_LOCAL using a loopback source then
	 * it had better be the same zoneid.
	 */
	if (IN6_IS_ADDR_LOOPBACK(src_addrp)) {
		if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) {
			ire = NULL;	/* Stored in ixa_ire */
			error = EADDRNOTAVAIL;
			goto bad_addr;
		}
		if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) {
			ire = NULL;	/* Stored in ixa_ire */
			error = EADDRNOTAVAIL;
			goto bad_addr;
		}
	}

	/*
	 * Does the caller want us to pick a source address?
	 */
	if (flags & IPDF_SELECT_SRC) {
		in6_addr_t	src_addr;

		/*
		 * We use ire_nexthop_ill to avoid the under ipmp
		 * interface for source address selection. Note that for ipmp
		 * probe packets, ixa_ifindex would have been specified, and
		 * the ip_select_route() invocation would have picked an ire
		 * with ire_ill pointing at an under interface.
		 */
		ill = ire_nexthop_ill(ire);

		/* If unreachable we have no ill but need some source */
		if (ill == NULL) {
			src_addr = ipv6_loopback;
			/* Make sure we look for a better source address */
			generation = SRC_GENERATION_VERIFY;
		} else {
			error = ip_select_source_v6(ill, &setsrc, dst_addr,
			    zoneid, ipst, B_FALSE, ixa->ixa_src_preferences,
			    &src_addr, &generation, NULL);
			if (error != 0) {
				ire = NULL;	/* Stored in ixa_ire */
				goto bad_addr;
			}
		}

		/*
		 * We allow the source address to be down.
		 * However, we check that we don't use the loopback address
		 * as a source when sending out on the wire.
		 */
		if (IN6_IS_ADDR_LOOPBACK(&src_addr) &&
		    !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) &&
		    !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
			ire = NULL;	/* Stored in ixa_ire */
			error = EADDRNOTAVAIL;
			goto bad_addr;
		}

		*src_addrp = src_addr;
		ixa->ixa_src_generation = generation;
	}

	/*
	 * Make sure we don't leave an unreachable ixa_nce in place
	 * since ip_select_route is used when we unplumb i.e., remove
	 * references on ixa_ire, ixa_nce, and ixa_dce.
	 */
	nce = ixa->ixa_nce;
	if (nce != NULL && nce->nce_is_condemned) {
		nce_refrele(nce);
		ixa->ixa_nce = NULL;
		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
	}

	/* ill is only non-NULL here if IPDF_SELECT_SRC found a nexthop ill. */
	ifindex = 0;
	if (IN6_IS_ADDR_LINKSCOPE(dst_addr)) {
		/* If we are creating a DCE we'd better have an ifindex */
		if (ill != NULL)
			ifindex = ill->ill_phyint->phyint_ifindex;
		else
			flags &= ~IPDF_UNIQUE_DCE;
	}

	if (flags & IPDF_UNIQUE_DCE) {
		/* Fallback to the default dce if allocation fails */
		dce = dce_lookup_and_add_v6(dst_addr, ifindex, ipst);
		if (dce != NULL) {
			generation = dce->dce_generation;
		} else {
			dce = dce_lookup_v6(dst_addr, ifindex, ipst,
			    &generation);
		}
	} else {
		dce = dce_lookup_v6(dst_addr, ifindex, ipst, &generation);
	}
	ASSERT(dce != NULL);
	/* Replace any previously cached dce with the one just looked up. */
	if (ixa->ixa_dce != NULL)
		dce_refrele_notr(ixa->ixa_dce);
#ifdef DEBUG
	dce_refhold_notr(dce);
	dce_refrele(dce);
#endif
	ixa->ixa_dce = dce;
	ixa->ixa_dce_generation = generation;

	/*
	 * Note that IPv6 multicast supports PMTU discovery unlike IPv4
	 * multicast. But pmtu discovery is only enabled for connected
	 * sockets in general.
	 */

	/*
	 * Set initial value for fragmentation limit.  Either conn_ip_output
	 * or ULP might update it when there are routing changes.
	 * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT.
	 */
	pmtu = ip_get_pmtu(ixa);
	ixa->ixa_fragsize = pmtu;
	/* Make sure ixa_fragsize and ixa_pmtu remain identical */
	if (ixa->ixa_flags & IXAF_VERIFY_PMTU)
		ixa->ixa_pmtu = pmtu;

	/*
	 * Extract information useful for some transports.
	 * First we look for DCE metrics. Then we take what we have in
	 * the metrics in the route, where the offlink is used if we have
	 * one.
	 */
	if (uinfo != NULL) {
		bzero(uinfo, sizeof (*uinfo));

		if (dce->dce_flags & DCEF_UINFO)
			*uinfo = dce->dce_uinfo;

		rts_merge_metrics(uinfo, &ire->ire_metrics);

		/* Allow ire_metrics to decrease the path MTU from above */
		if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu)
			uinfo->iulp_mtu = pmtu;

		uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0;
		uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0;
		uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0;
	}

	if (ill != NULL)
		ill_refrele(ill);

	/* error may be non-zero here; see the reject/mcbc handling above. */
	return (error);

bad_addr:
	/* ire is NULL when its reference already lives in ixa->ixa_ire. */
	if (ire != NULL)
		ire_refrele(ire);

	if (ill != NULL)
		ill_refrele(ill);

	/*
	 * Make sure we don't leave an unreachable ixa_nce in place
	 * since ip_select_route is used when we unplumb i.e., remove
	 * references on ixa_ire, ixa_nce, and ixa_dce.
	 */
	nce = ixa->ixa_nce;
	if (nce != NULL && nce->nce_is_condemned) {
		nce_refrele(nce);
		ixa->ixa_nce = NULL;
		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
	}

	return (error);
}

/*
 * Handle protocols with which IP is less intimate.  There
 * can be more than one stream bound to a particular
 * protocol.  When this is the case, normally each one gets a copy
 * of any incoming packets.
 *
 * Zones notes:
 * Packets will be distributed to conns in all zones. This is really only
 * useful for ICMPv6 as only applications in the global zone can create raw
 * sockets for other protocols.
*/ void ip_fanout_proto_v6(mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira) { mblk_t *mp1; in6_addr_t laddr = ip6h->ip6_dst; conn_t *connp, *first_connp, *next_connp; connf_t *connfp; ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; connfp = &ipst->ips_ipcl_proto_fanout_v6[ira->ira_protocol]; mutex_enter(&connfp->connf_lock); connp = connfp->connf_head; for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { /* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */ if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) && (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp))) break; } if (connp == NULL) { /* * No one bound to this port. Is * there a client that wants all * unclaimed datagrams? */ mutex_exit(&connfp->connf_lock); ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, ira); return; } ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL); CONN_INC_REF(connp); first_connp = connp; /* * XXX: Fix the multiple protocol listeners case. We should not * be walking the conn->conn_next list here. */ connp = connp->conn_next; for (;;) { while (connp != NULL) { /* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */ if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) && (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp))) break; connp = connp->conn_next; } if (connp == NULL) { /* No more interested clients */ connp = first_connp; break; } if (((mp1 = dupmsg(mp)) == NULL) && ((mp1 = copymsg(mp)) == NULL)) { /* Memory allocation failed */ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); ip_drop_input("ipIfStatsInDiscards", mp, ill); connp = first_connp; break; } CONN_INC_REF(connp); mutex_exit(&connfp->connf_lock); ip_fanout_proto_conn(connp, mp1, NULL, (ip6_t *)mp1->b_rptr, ira); mutex_enter(&connfp->connf_lock); /* Follow the next pointer before releasing the conn. 
*/ next_connp = connp->conn_next; CONN_DEC_REF(connp); connp = next_connp; } /* Last one. Send it upstream. */ mutex_exit(&connfp->connf_lock); ip_fanout_proto_conn(connp, mp, NULL, ip6h, ira); CONN_DEC_REF(connp); } /* * Called when it is conceptually a ULP that would sent the packet * e.g., port unreachable and nexthdr unknown. Check that the packet * would have passed the IPsec global policy before sending the error. * * Send an ICMP error after patching up the packet appropriately. * Uses ip_drop_input and bumps the appropriate MIB. * For ICMP6_PARAMPROB_NEXTHEADER we determine the offset to use. */ void ip_fanout_send_icmp_v6(mblk_t *mp, uint_t icmp_type, uint8_t icmp_code, ip_recv_attr_t *ira) { ip6_t *ip6h; boolean_t secure; ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; netstack_t *ns = ipst->ips_netstack; ipsec_stack_t *ipss = ns->netstack_ipsec; secure = ira->ira_flags & IRAF_IPSEC_SECURE; /* * We are generating an icmp error for some inbound packet. * Called from all ip_fanout_(udp, tcp, proto) functions. * Before we generate an error, check with global policy * to see whether this is allowed to enter the system. As * there is no "conn", we are checking with global policy. 
*/ ip6h = (ip6_t *)mp->b_rptr; if (secure || ipss->ipsec_inbound_v6_policy_present) { mp = ipsec_check_global_policy(mp, NULL, NULL, ip6h, ira, ns); if (mp == NULL) return; } /* We never send errors for protocols that we do implement */ if (ira->ira_protocol == IPPROTO_ICMPV6) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); ip_drop_input("ip_fanout_send_icmp_v6", mp, ill); freemsg(mp); return; } switch (icmp_type) { case ICMP6_DST_UNREACH: ASSERT(icmp_code == ICMP6_DST_UNREACH_NOPORT); BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts); ip_drop_input("ipIfStatsNoPorts", mp, ill); icmp_unreachable_v6(mp, icmp_code, B_FALSE, ira); break; case ICMP6_PARAM_PROB: ASSERT(icmp_code == ICMP6_PARAMPROB_NEXTHEADER); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos); ip_drop_input("ipIfStatsInUnknownProtos", mp, ill); /* Let the system determine the offset for this one */ icmp_param_problem_nexthdr_v6(mp, B_FALSE, ira); break; default: #ifdef DEBUG panic("ip_fanout_send_icmp_v6: wrong type"); /*NOTREACHED*/ #else freemsg(mp); break; #endif } } /* * Fanout for UDP packets that are multicast or ICMP errors. * (Unicast fanout is handled in ip_input_v6.) * * If SO_REUSEADDR is set all multicast packets * will be delivered to all conns bound to the same port. * * Fanout for UDP packets. * The caller puts in the ports parameter. * ire_type must be IRE_BROADCAST for multicast and broadcast packets. * * If SO_REUSEADDR is set all multicast and broadcast packets * will be delivered to all conns bound to the same port. * * Zones notes: * Earlier in ip_input on a system with multiple shared-IP zones we * duplicate the multicast and broadcast packets and send them up * with each explicit zoneid that exists on that ill. * This means that here we can match the zoneid with SO_ALLZONES being special. 
*/
void
ip_fanout_udp_multi_v6(mblk_t *mp, ip6_t *ip6h, uint16_t lport, uint16_t fport,
    ip_recv_attr_t *ira)
{
	in6_addr_t	laddr;
	conn_t		*connp;
	connf_t		*connfp;
	in6_addr_t	faddr;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/* Only multicast/broadcast packets and ICMP errors come this way. */
	ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR));

	laddr = ip6h->ip6_dst;
	faddr = ip6h->ip6_src;

	/* Attempt to find a client stream based on destination port. */
	connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
	mutex_enter(&connfp->connf_lock);
	connp = connfp->connf_head;
	while (connp != NULL) {
		if ((IPCL_UDP_MATCH_V6(connp, lport, laddr, fport, faddr)) &&
		    conn_wantpacket_v6(connp, ira, ip6h) &&
		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
		    tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp)))
			break;
		connp = connp->conn_next;
	}

	if (connp == NULL)
		goto notfound;

	/* Hold the first match; it is released after the final delivery. */
	CONN_INC_REF(connp);

	/* Further matches are only looked for if the first has reuseaddr. */
	if (connp->conn_reuseaddr) {
		conn_t		*first_connp = connp;
		conn_t		*next_connp;
		mblk_t		*mp1;

		connp = connp->conn_next;
		for (;;) {
			while (connp != NULL) {
				if (IPCL_UDP_MATCH_V6(connp, lport, laddr,
				    fport, faddr) &&
				    conn_wantpacket_v6(connp, ira, ip6h) &&
				    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
				    tsol_receive_local(mp, &laddr,
				    IPV6_VERSION, ira, connp)))
					break;
				connp = connp->conn_next;
			}
			if (connp == NULL) {
				/* No more interested clients */
				connp = first_connp;
				break;
			}
			/*
			 * Each additional conn gets its own copy.  If no copy
			 * can be made we stop looking; mp itself is still
			 * delivered to first_connp below.
			 */
			if (((mp1 = dupmsg(mp)) == NULL) &&
			    ((mp1 = copymsg(mp)) == NULL)) {
				/* Memory allocation failed */
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				ip_drop_input("ipIfStatsInDiscards", mp, ill);
				connp = first_connp;
				break;
			}

			/* Hold the conn so the lock can be dropped. */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);

			IP6_STAT(ipst, ip6_udp_fanmb);
			ip_fanout_udp_conn(connp, mp1, NULL,
			    (ip6_t *)mp1->b_rptr, ira);

			mutex_enter(&connfp->connf_lock);
			/* Follow the next pointer before releasing the conn. */
			next_connp = connp->conn_next;
			/*
			 * NOTE(review): ip6_udp_fanmb was already bumped just
			 * before the delivery above, so each copy delivered in
			 * this loop counts twice while the final delivery
			 * below counts once -- confirm whether this second
			 * bump is intentional.
			 */
			IP6_STAT(ipst, ip6_udp_fanmb);
			CONN_DEC_REF(connp);
			connp = next_connp;
		}
	}

	/* Last one.  Send it upstream. */
	mutex_exit(&connfp->connf_lock);

	IP6_STAT(ipst, ip6_udp_fanmb);
	ip_fanout_udp_conn(connp, mp, NULL, ip6h, ira);
	CONN_DEC_REF(connp);
	return;

notfound:
	mutex_exit(&connfp->connf_lock);
	/*
	 * No one bound to this port.  Is
	 * there a client that wants all
	 * unclaimed datagrams?
	 * If the IPPROTO_UDP protocol fanout list is non-empty, hand the
	 * packet to it; otherwise answer with a port unreachable.
	 */
	if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_UDP].connf_head != NULL) {
		ASSERT(ira->ira_protocol == IPPROTO_UDP);
		ip_fanout_proto_v6(mp, ip6h, ira);
	} else {
		ip_fanout_send_icmp_v6(mp, ICMP6_DST_UNREACH,
		    ICMP6_DST_UNREACH_NOPORT, ira);
	}
}

/*
 * int ip_find_hdr_v6()
 *
 * This routine is used by the upper layer protocols, iptun, and IPsec:
 * - Set extension header pointers to appropriate locations
 * - Determine IPv6 header length and return it
 * - Return a pointer to the last nexthdr value
 *
 * The caller must initialize ipp_fields.
 * The upper layer protocols normally set label_separate which makes the
 * routine put the TX label in ipp_label_v6. If this is not set then
 * the hop-by-hop options including the label are placed in ipp_hopopts.
 *
 * NOTE: If multiple extension headers of the same type are present,
 * ip_find_hdr_v6() will set the respective extension header pointers
 * to the first one that it encounters in the IPv6 header.  It also
 * skips fragment headers.  This routine deals with malformed packets
 * of various sorts in which case the returned length is up to the
 * malformed part.
*/ int ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, boolean_t label_separate, ip_pkt_t *ipp, uint8_t *nexthdrp) { uint_t length, ehdrlen; uint8_t nexthdr; uint8_t *whereptr, *endptr; ip6_dest_t *tmpdstopts; ip6_rthdr_t *tmprthdr; ip6_hbh_t *tmphopopts; ip6_frag_t *tmpfraghdr; ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR; ipp->ipp_hoplimit = ip6h->ip6_hops; ipp->ipp_tclass = IPV6_FLOW_TCLASS(ip6h->ip6_flow); ipp->ipp_addr = ip6h->ip6_dst; length = IPV6_HDR_LEN; whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ endptr = mp->b_wptr; nexthdr = ip6h->ip6_nxt; while (whereptr < endptr) { /* Is there enough left for len + nexthdr? */ if (whereptr + MIN_EHDR_LEN > endptr) goto done; switch (nexthdr) { case IPPROTO_HOPOPTS: { /* We check for any CIPSO */ uchar_t *secopt; boolean_t hbh_needed; uchar_t *after_secopt; tmphopopts = (ip6_hbh_t *)whereptr; ehdrlen = 8 * (tmphopopts->ip6h_len + 1); if ((uchar_t *)tmphopopts + ehdrlen > endptr) goto done; nexthdr = tmphopopts->ip6h_nxt; if (!label_separate) { secopt = NULL; after_secopt = whereptr; } else { /* * We have dropped packets with bad options in * ip6_input. No need to check return value * here. 
*/ (void) tsol_find_secopt_v6(whereptr, ehdrlen, &secopt, &after_secopt, &hbh_needed); } if (secopt != NULL && after_secopt - whereptr > 0) { ipp->ipp_fields |= IPPF_LABEL_V6; ipp->ipp_label_v6 = secopt; ipp->ipp_label_len_v6 = after_secopt - whereptr; } else { ipp->ipp_label_len_v6 = 0; after_secopt = whereptr; hbh_needed = B_TRUE; } /* return only 1st hbh */ if (hbh_needed && !(ipp->ipp_fields & IPPF_HOPOPTS)) { ipp->ipp_fields |= IPPF_HOPOPTS; ipp->ipp_hopopts = (ip6_hbh_t *)after_secopt; ipp->ipp_hopoptslen = ehdrlen - ipp->ipp_label_len_v6; } break; } case IPPROTO_DSTOPTS: tmpdstopts = (ip6_dest_t *)whereptr; ehdrlen = 8 * (tmpdstopts->ip6d_len + 1); if ((uchar_t *)tmpdstopts + ehdrlen > endptr) goto done; nexthdr = tmpdstopts->ip6d_nxt; /* * ipp_dstopts is set to the destination header after a * routing header. * Assume it is a post-rthdr destination header * and adjust when we find an rthdr. */ if (!(ipp->ipp_fields & IPPF_DSTOPTS)) { ipp->ipp_fields |= IPPF_DSTOPTS; ipp->ipp_dstopts = tmpdstopts; ipp->ipp_dstoptslen = ehdrlen; } break; case IPPROTO_ROUTING: tmprthdr = (ip6_rthdr_t *)whereptr; ehdrlen = 8 * (tmprthdr->ip6r_len + 1); if ((uchar_t *)tmprthdr + ehdrlen > endptr) goto done; nexthdr = tmprthdr->ip6r_nxt; /* return only 1st rthdr */ if (!(ipp->ipp_fields & IPPF_RTHDR)) { ipp->ipp_fields |= IPPF_RTHDR; ipp->ipp_rthdr = tmprthdr; ipp->ipp_rthdrlen = ehdrlen; } /* * Make any destination header we've seen be a * pre-rthdr destination header. 
*/ if (ipp->ipp_fields & IPPF_DSTOPTS) { ipp->ipp_fields &= ~IPPF_DSTOPTS; ipp->ipp_fields |= IPPF_RTHDRDSTOPTS; ipp->ipp_rthdrdstopts = ipp->ipp_dstopts; ipp->ipp_dstopts = NULL; ipp->ipp_rthdrdstoptslen = ipp->ipp_dstoptslen; ipp->ipp_dstoptslen = 0; } break; case IPPROTO_FRAGMENT: tmpfraghdr = (ip6_frag_t *)whereptr; ehdrlen = sizeof (ip6_frag_t); if ((uchar_t *)tmpfraghdr + ehdrlen > endptr) goto done; nexthdr = tmpfraghdr->ip6f_nxt; if (!(ipp->ipp_fields & IPPF_FRAGHDR)) { ipp->ipp_fields |= IPPF_FRAGHDR; ipp->ipp_fraghdr = tmpfraghdr; ipp->ipp_fraghdrlen = ehdrlen; } break; case IPPROTO_NONE: default: goto done; } length += ehdrlen; whereptr += ehdrlen; } done: if (nexthdrp != NULL) *nexthdrp = nexthdr; return (length); } /* * Try to determine where and what are the IPv6 header length and * pointer to nexthdr value for the upper layer protocol (or an * unknown next hdr). * * Parameters returns a pointer to the nexthdr value; * Must handle malformed packets of various sorts. * Function returns failure for malformed cases. */ boolean_t ip_hdr_length_nexthdr_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length_ptr, uint8_t **nexthdrpp) { uint16_t length; uint_t ehdrlen; uint8_t *nexthdrp; uint8_t *whereptr; uint8_t *endptr; ip6_dest_t *desthdr; ip6_rthdr_t *rthdr; ip6_frag_t *fraghdr; ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); length = IPV6_HDR_LEN; whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ endptr = mp->b_wptr; nexthdrp = &ip6h->ip6_nxt; while (whereptr < endptr) { /* Is there enough left for len + nexthdr? 
*/ if (whereptr + MIN_EHDR_LEN > endptr) break; switch (*nexthdrp) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: /* Assumes the headers are identical for hbh and dst */ desthdr = (ip6_dest_t *)whereptr; ehdrlen = 8 * (desthdr->ip6d_len + 1); if ((uchar_t *)desthdr + ehdrlen > endptr) return (B_FALSE); nexthdrp = &desthdr->ip6d_nxt; break; case IPPROTO_ROUTING: rthdr = (ip6_rthdr_t *)whereptr; ehdrlen = 8 * (rthdr->ip6r_len + 1); if ((uchar_t *)rthdr + ehdrlen > endptr) return (B_FALSE); nexthdrp = &rthdr->ip6r_nxt; break; case IPPROTO_FRAGMENT: fraghdr = (ip6_frag_t *)whereptr; ehdrlen = sizeof (ip6_frag_t); if ((uchar_t *)&fraghdr[1] > endptr) return (B_FALSE); nexthdrp = &fraghdr->ip6f_nxt; break; case IPPROTO_NONE: /* No next header means we're finished */ default: *hdr_length_ptr = length; *nexthdrpp = nexthdrp; return (B_TRUE); } length += ehdrlen; whereptr += ehdrlen; *hdr_length_ptr = length; *nexthdrpp = nexthdrp; } switch (*nexthdrp) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_FRAGMENT: /* * If any know extension headers are still to be processed, * the packet's malformed (or at least all the IP header(s) are * not in the same mblk - and that should never happen. */ return (B_FALSE); default: /* * If we get here, we know that all of the IP headers were in * the same mblk, even if the ULP header is in the next mblk. */ *hdr_length_ptr = length; *nexthdrpp = nexthdrp; return (B_TRUE); } } /* * Return the length of the IPv6 related headers (including extension headers) * Returns a length even if the packet is malformed. */ int ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h) { uint16_t hdr_len; uint8_t *nexthdrp; (void) ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_len, &nexthdrp); return (hdr_len); } /* * Parse and process any hop-by-hop or destination options. * * Assumes that q is an ill read queue so that ICMP errors for link-local * destinations are sent out the correct interface. 
* * Returns -1 if there was an error and mp has been consumed. * Returns 0 if no special action is needed. * Returns 1 if the packet contained a router alert option for this node * which is verified to be "interesting/known" for our implementation. * * XXX Note: In future as more hbh or dest options are defined, * it may be better to have different routines for hbh and dest * options as opt_type fields other than IP6OPT_PAD1 and IP6OPT_PADN * may have same value in different namespaces. Or is it same namespace ?? * Current code checks for each opt_type (other than pads) if it is in * the expected nexthdr (hbh or dest) */ int ip_process_options_v6(mblk_t *mp, ip6_t *ip6h, uint8_t *optptr, uint_t optlen, uint8_t hdr_type, ip_recv_attr_t *ira) { uint8_t opt_type; uint_t optused; int ret = 0; const char *errtype; ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; while (optlen != 0) { opt_type = *optptr; if (opt_type == IP6OPT_PAD1) { optused = 1; } else { if (optlen < 2) goto bad_opt; errtype = "malformed"; if (opt_type == ip6opt_ls) { optused = 2 + optptr[1]; if (optused > optlen) goto bad_opt; } else switch (opt_type) { case IP6OPT_PADN: /* * Note:We don't verify that (N-2) pad octets * are zero as required by spec. Adhere to * "be liberal in what you accept..." part of * implementation philosophy (RFC791,RFC1122) */ optused = 2 + optptr[1]; if (optused > optlen) goto bad_opt; break; case IP6OPT_JUMBO: if (hdr_type != IPPROTO_HOPOPTS) goto opt_error; goto opt_error; /* XXX Not implemented! 
*/ case IP6OPT_ROUTER_ALERT: { struct ip6_opt_router *or; if (hdr_type != IPPROTO_HOPOPTS) goto opt_error; optused = 2 + optptr[1]; if (optused > optlen) goto bad_opt; or = (struct ip6_opt_router *)optptr; /* Check total length and alignment */ if (optused != sizeof (*or) || ((uintptr_t)or->ip6or_value & 0x1) != 0) goto opt_error; /* Check value */ switch (*((uint16_t *)or->ip6or_value)) { case IP6_ALERT_MLD: case IP6_ALERT_RSVP: ret = 1; } break; } case IP6OPT_HOME_ADDRESS: { /* * Minimal support for the home address option * (which is required by all IPv6 nodes). * Implement by just swapping the home address * and source address. * XXX Note: this has IPsec implications since * AH needs to take this into account. * Also, when IPsec is used we need to ensure * that this is only processed once * in the received packet (to avoid swapping * back and forth). * NOTE:This option processing is considered * to be unsafe and prone to a denial of * service attack. * The current processing is not safe even with * IPsec secured IP packets. Since the home * address option processing requirement still * is in the IETF draft and in the process of * being redefined for its usage, it has been * decided to turn off the option by default. * If this section of code needs to be executed, * ndd variable ip6_ignore_home_address_opt * should be set to 0 at the user's own risk. */ struct ip6_opt_home_address *oh; in6_addr_t tmp; if (ipst->ips_ipv6_ignore_home_address_opt) goto opt_error; if (hdr_type != IPPROTO_DSTOPTS) goto opt_error; optused = 2 + optptr[1]; if (optused > optlen) goto bad_opt; /* * We did this dest. opt the first time * around (i.e. before AH processing). * If we've done AH... stop now. 
*/ if ((ira->ira_flags & IRAF_IPSEC_SECURE) && ira->ira_ipsec_ah_sa != NULL) break; oh = (struct ip6_opt_home_address *)optptr; /* Check total length and alignment */ if (optused < sizeof (*oh) || ((uintptr_t)oh->ip6oh_addr & 0x7) != 0) goto opt_error; /* Swap ip6_src and the home address */ tmp = ip6h->ip6_src; /* XXX Note: only 8 byte alignment option */ ip6h->ip6_src = *(in6_addr_t *)oh->ip6oh_addr; *(in6_addr_t *)oh->ip6oh_addr = tmp; break; } case IP6OPT_TUNNEL_LIMIT: if (hdr_type != IPPROTO_DSTOPTS) { goto opt_error; } optused = 2 + optptr[1]; if (optused > optlen) { goto bad_opt; } if (optused != 3) { goto opt_error; } break; default: errtype = "unknown"; /* FALLTHROUGH */ opt_error: /* Determine which zone should send error */ switch (IP6OPT_TYPE(opt_type)) { case IP6OPT_TYPE_SKIP: optused = 2 + optptr[1]; if (optused > optlen) goto bad_opt; ip1dbg(("ip_process_options_v6: %s " "opt 0x%x skipped\n", errtype, opt_type)); break; case IP6OPT_TYPE_DISCARD: ip1dbg(("ip_process_options_v6: %s " "opt 0x%x; packet dropped\n", errtype, opt_type)); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); ip_drop_input("ipIfStatsInHdrErrors", mp, ill); freemsg(mp); return (-1); case IP6OPT_TYPE_ICMP: BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); ip_drop_input("ipIfStatsInHdrErrors", mp, ill); icmp_param_problem_v6(mp, ICMP6_PARAMPROB_OPTION, (uint32_t)(optptr - (uint8_t *)ip6h), B_FALSE, ira); return (-1); case IP6OPT_TYPE_FORCEICMP: BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); ip_drop_input("ipIfStatsInHdrErrors", mp, ill); icmp_param_problem_v6(mp, ICMP6_PARAMPROB_OPTION, (uint32_t)(optptr - (uint8_t *)ip6h), B_TRUE, ira); return (-1); default: ASSERT(0); } } } optlen -= optused; optptr += optused; } return (ret); bad_opt: /* Determine which zone should send error */ ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill); icmp_param_problem_v6(mp, ICMP6_PARAMPROB_OPTION, (uint32_t)(optptr - (uint8_t *)ip6h), B_FALSE, ira); return (-1); } /* * Process a routing header that is 
not yet empty. * Because of RFC 5095, we now reject all route headers. */ void ip_process_rthdr(mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth, ip_recv_attr_t *ira) { ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; ASSERT(rth->ip6r_segleft != 0); if (!ipst->ips_ipv6_forward_src_routed) { /* XXX Check for source routed out same interface? */ BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); ip_drop_input("ipIfStatsInAddrErrors", mp, ill); freemsg(mp); return; } ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill); icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER, (uint32_t)((uchar_t *)&rth->ip6r_type - (uchar_t *)ip6h), B_FALSE, ira); } /* * Read side put procedure for IPv6 module. */ void ip_rput_v6(queue_t *q, mblk_t *mp) { ill_t *ill; ill = (ill_t *)q->q_ptr; if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) { union DL_primitives *dl; dl = (union DL_primitives *)mp->b_rptr; /* * Things are opening or closing - only accept DLPI * ack messages. If the stream is closing and ip_wsrv * has completed, ip_close is out of the qwait, but has * not yet completed qprocsoff. Don't proceed any further * because the ill has been cleaned up and things hanging * off the ill have been freed. */ if ((mp->b_datap->db_type != M_PCPROTO) || (dl->dl_primitive == DL_UNITDATA_IND)) { inet_freemsg(mp); return; } } if (DB_TYPE(mp) == M_DATA) { struct mac_header_info_s mhi; ip_mdata_to_mhi(ill, mp, &mhi); ip_input_v6(ill, NULL, mp, &mhi); } else { ip_rput_notdata(ill, mp); } } /* * Walk through the IPv6 packet in mp and see if there's an AH header * in it. See if the AH header needs to get done before other headers in * the packet. (Worker function for ipsec_early_ah_v6().) 
*/ #define IPSEC_HDR_DONT_PROCESS 0 #define IPSEC_HDR_PROCESS 1 #define IPSEC_MEMORY_ERROR 2 /* or malformed packet */ static int ipsec_needs_processing_v6(mblk_t *mp, uint8_t *nexthdr) { uint_t length; uint_t ehdrlen; uint8_t *whereptr; uint8_t *endptr; uint8_t *nexthdrp; ip6_dest_t *desthdr; ip6_rthdr_t *rthdr; ip6_t *ip6h; /* * For now just pullup everything. In general, the less pullups, * the better, but there's so much squirrelling through anyway, * it's just easier this way. */ if (!pullupmsg(mp, -1)) { return (IPSEC_MEMORY_ERROR); } ip6h = (ip6_t *)mp->b_rptr; length = IPV6_HDR_LEN; whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ endptr = mp->b_wptr; /* * We can't just use the argument nexthdr in the place * of nexthdrp becaue we don't dereference nexthdrp * till we confirm whether it is a valid address. */ nexthdrp = &ip6h->ip6_nxt; while (whereptr < endptr) { /* Is there enough left for len + nexthdr? */ if (whereptr + MIN_EHDR_LEN > endptr) return (IPSEC_MEMORY_ERROR); switch (*nexthdrp) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: /* Assumes the headers are identical for hbh and dst */ desthdr = (ip6_dest_t *)whereptr; ehdrlen = 8 * (desthdr->ip6d_len + 1); if ((uchar_t *)desthdr + ehdrlen > endptr) return (IPSEC_MEMORY_ERROR); /* * Return DONT_PROCESS because the destination * options header may be for each hop in a * routing-header, and we only want AH if we're * finished with routing headers. */ if (*nexthdrp == IPPROTO_DSTOPTS) return (IPSEC_HDR_DONT_PROCESS); nexthdrp = &desthdr->ip6d_nxt; break; case IPPROTO_ROUTING: rthdr = (ip6_rthdr_t *)whereptr; /* * If there's more hops left on the routing header, * return now with DON'T PROCESS. 
*/ if (rthdr->ip6r_segleft > 0) return (IPSEC_HDR_DONT_PROCESS); ehdrlen = 8 * (rthdr->ip6r_len + 1); if ((uchar_t *)rthdr + ehdrlen > endptr) return (IPSEC_MEMORY_ERROR); nexthdrp = &rthdr->ip6r_nxt; break; case IPPROTO_FRAGMENT: /* Wait for reassembly */ return (IPSEC_HDR_DONT_PROCESS); case IPPROTO_AH: *nexthdr = IPPROTO_AH; return (IPSEC_HDR_PROCESS); case IPPROTO_NONE: /* No next header means we're finished */ default: return (IPSEC_HDR_DONT_PROCESS); } length += ehdrlen; whereptr += ehdrlen; } /* * Malformed/truncated packet. */ return (IPSEC_MEMORY_ERROR); } /* * Path for AH if options are present. * Returns NULL if the mblk was consumed. * * Sometimes AH needs to be done before other IPv6 headers for security * reasons. This function (and its ipsec_needs_processing_v6() above) * indicates if that is so, and fans out to the appropriate IPsec protocol * for the datagram passed in. */ mblk_t * ipsec_early_ah_v6(mblk_t *mp, ip_recv_attr_t *ira) { uint8_t nexthdr; ah_t *ah; ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; switch (ipsec_needs_processing_v6(mp, &nexthdr)) { case IPSEC_MEMORY_ERROR: BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); ip_drop_input("ipIfStatsInDiscards", mp, ill); freemsg(mp); return (NULL); case IPSEC_HDR_DONT_PROCESS: return (mp); } /* Default means send it to AH! */ ASSERT(nexthdr == IPPROTO_AH); if (!ipsec_loaded(ipss)) { ip_proto_not_sup(mp, ira); return (NULL); } mp = ipsec_inbound_ah_sa(mp, ira, &ah); if (mp == NULL) return (NULL); ASSERT(ah != NULL); ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); ASSERT(ira->ira_ipsec_ah_sa != NULL); ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL); mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, ira); if (mp == NULL) { /* * Either it failed or is pending. In the former case * ipIfStatsInDiscards was increased. 
*/ return (NULL); } /* we're done with IPsec processing, send it up */ ip_input_post_ipsec(mp, ira); return (NULL); } /* * Reassemble fragment. * When it returns a completed message the first mblk will only contain * the headers prior to the fragment header, with the nexthdr value updated * to be the header after the fragment header. */ mblk_t * ip_input_fragment_v6(mblk_t *mp, ip6_t *ip6h, ip6_frag_t *fraghdr, uint_t remlen, ip_recv_attr_t *ira) { uint32_t ident = ntohl(fraghdr->ip6f_ident); uint16_t offset; boolean_t more_frags; uint8_t nexthdr = fraghdr->ip6f_nxt; in6_addr_t *v6dst_ptr; in6_addr_t *v6src_ptr; uint_t end; uint_t hdr_length; size_t count; ipf_t *ipf; ipf_t **ipfp; ipfb_t *ipfb; mblk_t *mp1; uint8_t ecn_info = 0; size_t msg_len; mblk_t *tail_mp; mblk_t *t_mp; boolean_t pruned = B_FALSE; uint32_t sum_val; uint16_t sum_flags; ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; uint_t prev_nexthdr_offset; uint8_t prev_nexthdr; uint8_t *ptr; uint32_t packet_size; /* * We utilize hardware computed checksum info only for UDP since * IP fragmentation is a normal occurence for the protocol. In * addition, checksum offload support for IP fragments carrying * UDP payload is commonly implemented across network adapters. 
*/ ASSERT(ira->ira_rill != NULL); if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ira->ira_rill) && (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { mblk_t *mp1 = mp->b_cont; int32_t len; /* Record checksum information from the packet */ sum_val = (uint32_t)DB_CKSUM16(mp); sum_flags = DB_CKSUMFLAGS(mp); /* fragmented payload offset from beginning of mblk */ offset = (uint16_t)((uchar_t *)&fraghdr[1] - mp->b_rptr); if ((sum_flags & HCK_PARTIALCKSUM) && (mp1 == NULL || mp1->b_cont == NULL) && offset >= DB_CKSUMSTART(mp) && ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) { uint32_t adj; /* * Partial checksum has been calculated by hardware * and attached to the packet; in addition, any * prepended extraneous data is even byte aligned. * If any such data exists, we adjust the checksum; * this would also handle any postpended data. */ IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp), mp, mp1, len, adj); /* One's complement subtract extraneous checksum */ if (adj >= sum_val) sum_val = ~(adj - sum_val) & 0xFFFF; else sum_val -= adj; } } else { sum_val = 0; sum_flags = 0; } /* Clear hardware checksumming flag */ DB_CKSUMFLAGS(mp) = 0; /* * Determine the offset (from the begining of the IP header) * of the nexthdr value which has IPPROTO_FRAGMENT. We use * this when removing the fragment header from the packet. * This packet consists of the IPv6 header, a potential * hop-by-hop options header, a potential pre-routing-header * destination options header, and a potential routing header. 
*/ prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h; prev_nexthdr = ip6h->ip6_nxt; ptr = (uint8_t *)&ip6h[1]; if (prev_nexthdr == IPPROTO_HOPOPTS) { ip6_hbh_t *hbh_hdr; uint_t hdr_len; hbh_hdr = (ip6_hbh_t *)ptr; hdr_len = 8 * (hbh_hdr->ip6h_len + 1); prev_nexthdr = hbh_hdr->ip6h_nxt; prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt - (uint8_t *)ip6h; ptr += hdr_len; } if (prev_nexthdr == IPPROTO_DSTOPTS) { ip6_dest_t *dest_hdr; uint_t hdr_len; dest_hdr = (ip6_dest_t *)ptr; hdr_len = 8 * (dest_hdr->ip6d_len + 1); prev_nexthdr = dest_hdr->ip6d_nxt; prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt - (uint8_t *)ip6h; ptr += hdr_len; } if (prev_nexthdr == IPPROTO_ROUTING) { ip6_rthdr_t *rthdr; uint_t hdr_len; rthdr = (ip6_rthdr_t *)ptr; prev_nexthdr = rthdr->ip6r_nxt; prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt - (uint8_t *)ip6h; hdr_len = 8 * (rthdr->ip6r_len + 1); ptr += hdr_len; } if (prev_nexthdr != IPPROTO_FRAGMENT) { /* Can't handle other headers before the fragment header */ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); ip_drop_input("ipIfStatsInHdrErrors", mp, ill); freemsg(mp); return (NULL); } /* * Note: Fragment offset in header is in 8-octet units. * Clearing least significant 3 bits not only extracts * it but also gets it in units of octets. */ offset = ntohs(fraghdr->ip6f_offlg) & ~7; more_frags = (fraghdr->ip6f_offlg & IP6F_MORE_FRAG); /* * Is the more frags flag on and the payload length not a multiple * of eight? */ if (more_frags && (ntohs(ip6h->ip6_plen) & 7)) { ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill); icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER, (uint32_t)((char *)&ip6h->ip6_plen - (char *)ip6h), B_FALSE, ira); return (NULL); } v6src_ptr = &ip6h->ip6_src; v6dst_ptr = &ip6h->ip6_dst; end = remlen; hdr_length = (uint_t)((char *)&fraghdr[1] - (char *)ip6h); end += offset; /* * Would fragment cause reassembled packet to have a payload length * greater than IP_MAXPACKET - the max payload size? 
*/ if (end > IP_MAXPACKET) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); ip_drop_input("Reassembled packet too large", mp, ill); icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER, (uint32_t)((char *)&fraghdr->ip6f_offlg - (char *)ip6h), B_FALSE, ira); return (NULL); } /* * This packet just has one fragment. Reassembly not * needed. */ if (!more_frags && offset == 0) { goto reass_done; } /* * Drop the fragmented as early as possible, if * we don't have resource(s) to re-assemble. */ if (ipst->ips_ip_reass_queue_bytes == 0) { freemsg(mp); return (NULL); } /* Record the ECN field info. */ ecn_info = (uint8_t)(ntohl(ip6h->ip6_vcf & htonl(~0xFFCFFFFF)) >> 20); /* * If this is not the first fragment, dump the unfragmentable * portion of the packet. */ if (offset) mp->b_rptr = (uchar_t *)&fraghdr[1]; /* * Fragmentation reassembly. Each ILL has a hash table for * queueing packets undergoing reassembly for all IPIFs * associated with the ILL. The hash is based on the packet * IP ident field. The ILL frag hash table was allocated * as a timer block at the time the ILL was created. Whenever * there is anything on the reassembly queue, the timer will * be running. */ /* Handle vnic loopback of fragments */ if (mp->b_datap->db_ref > 2) msg_len = 0; else msg_len = MBLKSIZE(mp); tail_mp = mp; while (tail_mp->b_cont != NULL) { tail_mp = tail_mp->b_cont; if (tail_mp->b_datap->db_ref <= 2) msg_len += MBLKSIZE(tail_mp); } /* * If the reassembly list for this ILL will get too big * prune it. */ if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >= ipst->ips_ip_reass_queue_bytes) { DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len, uint_t, ill->ill_frag_count, uint_t, ipst->ips_ip_reass_queue_bytes); ill_frag_prune(ill, (ipst->ips_ip_reass_queue_bytes < msg_len) ? 
0 : (ipst->ips_ip_reass_queue_bytes - msg_len)); pruned = B_TRUE; } ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH_V6(*v6src_ptr, ident)]; mutex_enter(&ipfb->ipfb_lock); ipfp = &ipfb->ipfb_ipf; /* Try to find an existing fragment queue for this packet. */ for (;;) { ipf = ipfp[0]; if (ipf) { /* * It has to match on ident, source address, and * dest address. */ if (ipf->ipf_ident == ident && IN6_ARE_ADDR_EQUAL(&ipf->ipf_v6src, v6src_ptr) && IN6_ARE_ADDR_EQUAL(&ipf->ipf_v6dst, v6dst_ptr)) { /* * If we have received too many * duplicate fragments for this packet * free it. */ if (ipf->ipf_num_dups > ip_max_frag_dups) { ill_frag_free_pkts(ill, ipfb, ipf, 1); freemsg(mp); mutex_exit(&ipfb->ipfb_lock); return (NULL); } break; } ipfp = &ipf->ipf_hash_next; continue; } /* * If we pruned the list, do we want to store this new * fragment?. We apply an optimization here based on the * fact that most fragments will be received in order. * So if the offset of this incoming fragment is zero, * it is the first fragment of a new packet. We will * keep it. Otherwise drop the fragment, as we have * probably pruned the packet already (since the * packet cannot be found). */ if (pruned && offset != 0) { mutex_exit(&ipfb->ipfb_lock); freemsg(mp); return (NULL); } /* New guy. Allocate a frag message. */ mp1 = allocb(sizeof (*ipf), BPRI_MED); if (!mp1) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); ip_drop_input("ipIfStatsInDiscards", mp, ill); freemsg(mp); partial_reass_done: mutex_exit(&ipfb->ipfb_lock); return (NULL); } if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst)) { /* * Too many fragmented packets in this hash bucket. * Free the oldest. */ ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1); } mp1->b_cont = mp; /* Initialize the fragment header. */ ipf = (ipf_t *)mp1->b_rptr; ipf->ipf_mp = mp1; ipf->ipf_ptphn = ipfp; ipfp[0] = ipf; ipf->ipf_hash_next = NULL; ipf->ipf_ident = ident; ipf->ipf_v6src = *v6src_ptr; ipf->ipf_v6dst = *v6dst_ptr; /* Record reassembly start time. 
*/ ipf->ipf_timestamp = gethrestime_sec(); /* Record ipf generation and account for frag header */ ipf->ipf_gen = ill->ill_ipf_gen++; ipf->ipf_count = MBLKSIZE(mp1); ipf->ipf_protocol = nexthdr; ipf->ipf_nf_hdr_len = 0; ipf->ipf_prev_nexthdr_offset = 0; ipf->ipf_last_frag_seen = B_FALSE; ipf->ipf_ecn = ecn_info; ipf->ipf_num_dups = 0; ipfb->ipfb_frag_pkts++; ipf->ipf_checksum = 0; ipf->ipf_checksum_flags = 0; /* Store checksum value in fragment header */ if (sum_flags != 0) { sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); ipf->ipf_checksum = sum_val; ipf->ipf_checksum_flags = sum_flags; } /* * We handle reassembly two ways. In the easy case, * where all the fragments show up in order, we do * minimal bookkeeping, and just clip new pieces on * the end. If we ever see a hole, then we go off * to ip_reassemble which has to mark the pieces and * keep track of the number of holes, etc. Obviously, * the point of having both mechanisms is so we can * handle the easy case as efficiently as possible. */ if (offset == 0) { /* Easy case, in-order reassembly so far. */ /* Update the byte count */ ipf->ipf_count += msg_len; ipf->ipf_tail_mp = tail_mp; /* * Keep track of next expected offset in * ipf_end. */ ipf->ipf_end = end; ipf->ipf_nf_hdr_len = hdr_length; ipf->ipf_prev_nexthdr_offset = prev_nexthdr_offset; } else { /* Hard case, hole at the beginning. */ ipf->ipf_tail_mp = NULL; /* * ipf_end == 0 means that we have given up * on easy reassembly. */ ipf->ipf_end = 0; /* Forget checksum offload from now on */ ipf->ipf_checksum_flags = 0; /* * ipf_hole_cnt is set by ip_reassemble. * ipf_count is updated by ip_reassemble. * No need to check for return value here * as we don't expect reassembly to complete or * fail for the first fragment itself. 
*/ (void) ip_reassemble(mp, ipf, offset, more_frags, ill, msg_len); } /* Update per ipfb and ill byte counts */ ipfb->ipfb_count += ipf->ipf_count; ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ atomic_add_32(&ill->ill_frag_count, ipf->ipf_count); /* If the frag timer wasn't already going, start it. */ mutex_enter(&ill->ill_lock); ill_frag_timer_start(ill); mutex_exit(&ill->ill_lock); goto partial_reass_done; } /* * If the packet's flag has changed (it could be coming up * from an interface different than the previous, therefore * possibly different checksum capability), then forget about * any stored checksum states. Otherwise add the value to * the existing one stored in the fragment header. */ if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) { sum_val += ipf->ipf_checksum; sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); ipf->ipf_checksum = sum_val; } else if (ipf->ipf_checksum_flags != 0) { /* Forget checksum offload from now on */ ipf->ipf_checksum_flags = 0; } /* * We have a new piece of a datagram which is already being * reassembled. Update the ECN info if all IP fragments * are ECN capable. If there is one which is not, clear * all the info. If there is at least one which has CE * code point, IP needs to report that up to transport. */ if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) { if (ecn_info == IPH_ECN_CE) ipf->ipf_ecn = IPH_ECN_CE; } else { ipf->ipf_ecn = IPH_ECN_NECT; } if (offset && ipf->ipf_end == offset) { /* The new fragment fits at the end */ ipf->ipf_tail_mp->b_cont = mp; /* Update the byte count */ ipf->ipf_count += msg_len; /* Update per ipfb and ill byte counts */ ipfb->ipfb_count += msg_len; ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ atomic_add_32(&ill->ill_frag_count, msg_len); if (more_frags) { /* More to come. */ ipf->ipf_end = end; ipf->ipf_tail_mp = tail_mp; goto partial_reass_done; } } else { /* * Go do the hard cases. * Call ip_reassemble(). 
*/ int ret; if (offset == 0) { if (ipf->ipf_prev_nexthdr_offset == 0) { ipf->ipf_nf_hdr_len = hdr_length; ipf->ipf_prev_nexthdr_offset = prev_nexthdr_offset; } } /* Save current byte count */ count = ipf->ipf_count; ret = ip_reassemble(mp, ipf, offset, more_frags, ill, msg_len); /* Count of bytes added and subtracted (freeb()ed) */ count = ipf->ipf_count - count; if (count) { /* Update per ipfb and ill byte counts */ ipfb->ipfb_count += count; ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ atomic_add_32(&ill->ill_frag_count, count); } if (ret == IP_REASS_PARTIAL) { goto partial_reass_done; } else if (ret == IP_REASS_FAILED) { /* Reassembly failed. Free up all resources */ ill_frag_free_pkts(ill, ipfb, ipf, 1); for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) { IP_REASS_SET_START(t_mp, 0); IP_REASS_SET_END(t_mp, 0); } freemsg(mp); goto partial_reass_done; } /* We will reach here iff 'ret' is IP_REASS_COMPLETE */ } /* * We have completed reassembly. Unhook the frag header from * the reassembly list. * * Grab the unfragmentable header length next header value out * of the first fragment */ ASSERT(ipf->ipf_nf_hdr_len != 0); hdr_length = ipf->ipf_nf_hdr_len; /* * Before we free the frag header, record the ECN info * to report back to the transport. */ ecn_info = ipf->ipf_ecn; /* * Store the nextheader field in the header preceding the fragment * header */ nexthdr = ipf->ipf_protocol; prev_nexthdr_offset = ipf->ipf_prev_nexthdr_offset; ipfp = ipf->ipf_ptphn; /* We need to supply these to caller */ if ((sum_flags = ipf->ipf_checksum_flags) != 0) sum_val = ipf->ipf_checksum; else sum_val = 0; mp1 = ipf->ipf_mp; count = ipf->ipf_count; ipf = ipf->ipf_hash_next; if (ipf) ipf->ipf_ptphn = ipfp; ipfp[0] = ipf; atomic_add_32(&ill->ill_frag_count, -count); ASSERT(ipfb->ipfb_count >= count); ipfb->ipfb_count -= count; ipfb->ipfb_frag_pkts--; mutex_exit(&ipfb->ipfb_lock); /* Ditch the frag header. 
*/ mp = mp1->b_cont; freeb(mp1); /* * Make sure the packet is good by doing some sanity * check. If bad we can silentely drop the packet. */ reass_done: if (hdr_length < sizeof (ip6_frag_t)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); ip_drop_input("ipIfStatsInHdrErrors", mp, ill); ip1dbg(("ip_input_fragment_v6: bad packet\n")); freemsg(mp); return (NULL); } /* * Remove the fragment header from the initial header by * splitting the mblk into the non-fragmentable header and * everthing after the fragment extension header. This has the * side effect of putting all the headers that need destination * processing into the b_cont block-- on return this fact is * used in order to avoid having to look at the extensions * already processed. * * Note that this code assumes that the unfragmentable portion * of the header is in the first mblk and increments * the read pointer past it. If this assumption is broken * this code fails badly. */ if (mp->b_rptr + hdr_length != mp->b_wptr) { mblk_t *nmp; if (!(nmp = dupb(mp))) { ip1dbg(("ip_input_fragment_v6: dupb failed\n")); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); ip_drop_input("ipIfStatsInDiscards", mp, ill); freemsg(mp); return (NULL); } nmp->b_cont = mp->b_cont; mp->b_cont = nmp; nmp->b_rptr += hdr_length; } mp->b_wptr = mp->b_rptr + hdr_length - sizeof (ip6_frag_t); ip6h = (ip6_t *)mp->b_rptr; ((char *)ip6h)[prev_nexthdr_offset] = nexthdr; /* Restore original IP length in header. */ packet_size = msgdsize(mp); ip6h->ip6_plen = htons((uint16_t)(packet_size - IPV6_HDR_LEN)); /* Record the ECN info. 
*/ ip6h->ip6_vcf &= htonl(0xFFCFFFFF); ip6h->ip6_vcf |= htonl(ecn_info << 20); /* Update the receive attributes */ ira->ira_pktlen = packet_size; ira->ira_ip_hdr_length = hdr_length - sizeof (ip6_frag_t); ira->ira_protocol = nexthdr; /* Reassembly is successful; set checksum information in packet */ DB_CKSUM16(mp) = (uint16_t)sum_val; DB_CKSUMFLAGS(mp) = sum_flags; DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length; return (mp); } /* * Given an mblk and a ptr, find the destination address in an IPv6 routing * header. */ static in6_addr_t pluck_out_dst(const mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv) { ip6_rthdr0_t *rt0; int segleft, numaddr; in6_addr_t *ap, rv = oldrv; rt0 = (ip6_rthdr0_t *)whereptr; if (rt0->ip6r0_type != 0 && rt0->ip6r0_type != 2) { DTRACE_PROBE2(pluck_out_dst_unknown_type, mblk_t *, mp, uint8_t *, whereptr); return (rv); } segleft = rt0->ip6r0_segleft; numaddr = rt0->ip6r0_len / 2; if ((rt0->ip6r0_len & 0x1) || (mp != NULL && whereptr + (rt0->ip6r0_len + 1) * 8 > mp->b_wptr) || (segleft > rt0->ip6r0_len / 2)) { /* * Corrupt packet. Either the routing header length is odd * (can't happen) or mismatched compared to the packet, or the * number of addresses is. Return what we can. This will * only be a problem on forwarded packets that get squeezed * through an outbound tunnel enforcing IPsec Tunnel Mode. */ DTRACE_PROBE2(pluck_out_dst_badpkt, mblk_t *, mp, uint8_t *, whereptr); return (rv); } if (segleft != 0) { ap = (in6_addr_t *)((char *)rt0 + sizeof (*rt0)); rv = ap[numaddr - 1]; } return (rv); } /* * Walk through the options to see if there is a routing header. * If present get the destination which is the last address of * the option. * mp needs to be provided in cases when the extension headers might span * b_cont; mp is never modified by this function. 
*/
/*
 * ip6h:	 IPv6 base header of the packet.
 * mp:		 mblk holding ip6h, or NULL when the headers are known to be
 *		 contiguous and no mblk bounds checking is wanted.
 * is_fragment:	 optional; set to B_TRUE when the header chain walked here
 *		 ends at a fragment header, B_FALSE otherwise.
 *
 * Returns ip6_dst, or the last address of a routing header when one with
 * segments left is found among the leading hop-by-hop / destination
 * options / routing headers.
 */
in6_addr_t
ip_get_dst_v6(ip6_t *ip6h, const mblk_t *mp, boolean_t *is_fragment)
{
	const mblk_t *current_mp = mp;
	uint8_t nexthdr;
	uint8_t *whereptr;
	int ehdrlen;
	in6_addr_t rv;

	/*
	 * Loop invariant: `whereptr' is the start of the header parsed most
	 * recently, `ehdrlen' is its length, and `nexthdr' is the type of
	 * the header that FOLLOWS it.
	 */
	whereptr = (uint8_t *)ip6h;
	ehdrlen = sizeof (ip6_t);

	/* We assume at least the IPv6 base header is within one mblk. */
	ASSERT(mp == NULL ||
	    (mp->b_rptr <= whereptr && mp->b_wptr >= whereptr + ehdrlen));

	rv = ip6h->ip6_dst;
	nexthdr = ip6h->ip6_nxt;
	if (is_fragment != NULL)
		*is_fragment = B_FALSE;

	/*
	 * We also assume (thanks to ipsec_tun_outbound()'s pullup) that
	 * no extension headers will be split across mblks.
	 */

	while (nexthdr == IPPROTO_HOPOPTS || nexthdr == IPPROTO_DSTOPTS ||
	    nexthdr == IPPROTO_ROUTING) {
		/* Type of the header we are about to step onto. */
		uint8_t curhdr = nexthdr;

		/*
		 * Skip over the previous header, moving to the next mblk(s)
		 * if it ended at or beyond the end of the current one.
		 */
		while (current_mp != NULL &&
		    whereptr + ehdrlen >= current_mp->b_wptr) {
			ehdrlen -= (current_mp->b_wptr - whereptr);
			current_mp = current_mp->b_cont;
			if (current_mp == NULL) {
				/* Bad packet.  Return what we can. */
				DTRACE_PROBE3(ip_get_dst_v6_badpkt, mblk_t *,
				    mp, mblk_t *, current_mp, ip6_t *, ip6h);
				goto done;
			}
			whereptr = current_mp->b_rptr;
		}
		whereptr += ehdrlen;

		/*
		 * All IPv6 extension headers have the next-header in byte
		 * 0, and the (length - 8) in 8-byte-words.
		 */
		nexthdr = *whereptr;
		ASSERT(current_mp == NULL || whereptr + 1 < current_mp->b_wptr);
		ehdrlen = (*(whereptr + 1) + 1) * 8;

		/*
		 * Only now does `whereptr' point at the routing header
		 * itself.  Plucking before the advance would interpret the
		 * preceding header (initially the IPv6 base header) as a
		 * routing header.  pluck_out_dst() reads the type and
		 * segments-left bytes before its own bounds check, so make
		 * sure the fixed part of the header is present first.
		 */
		if (curhdr == IPPROTO_ROUTING &&
		    (current_mp == NULL ||
		    whereptr + sizeof (ip6_rthdr0_t) <= current_mp->b_wptr))
			rv = pluck_out_dst(current_mp, whereptr, rv);
	}

done:
	if (nexthdr == IPPROTO_FRAGMENT && is_fragment != NULL)
		*is_fragment = B_TRUE;
	return (rv);
}

/*
 * ip_source_routed_v6:
 * This function is called by redirect code (called from ip_input_v6) to
 * know whether this packet is source routed through this node i.e
 * whether this node (router) is part of the journey. This
 * function is called under two cases :
 *
 * case 1 : Routing header was processed by this node and
 *	    ip_process_rthdr replaced ip6_dst with the next hop
 *	    and we are forwarding the packet to the next hop.
* * case 2 : Routing header was not processed by this node and we * are just forwarding the packet. * * For case (1) we don't want to send redirects. For case(2) we * want to send redirects. */ static boolean_t ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst) { uint8_t nexthdr; in6_addr_t *addrptr; ip6_rthdr0_t *rthdr; uint8_t numaddr; ip6_hbh_t *hbhhdr; uint_t ehdrlen; uint8_t *byteptr; ip2dbg(("ip_source_routed_v6\n")); nexthdr = ip6h->ip6_nxt; ehdrlen = IPV6_HDR_LEN; /* if a routing hdr is preceeded by HOPOPT or DSTOPT */ while (nexthdr == IPPROTO_HOPOPTS || nexthdr == IPPROTO_DSTOPTS) { byteptr = (uint8_t *)ip6h + ehdrlen; /* * Check if we have already processed * packets or we are just a forwarding * router which only pulled up msgs up * to IPV6HDR and one HBH ext header */ if (byteptr + MIN_EHDR_LEN > mp->b_wptr) { ip2dbg(("ip_source_routed_v6: Extension" " headers not processed\n")); return (B_FALSE); } hbhhdr = (ip6_hbh_t *)byteptr; nexthdr = hbhhdr->ip6h_nxt; ehdrlen = ehdrlen + 8 * (hbhhdr->ip6h_len + 1); } switch (nexthdr) { case IPPROTO_ROUTING: byteptr = (uint8_t *)ip6h + ehdrlen; /* * If for some reason, we haven't pulled up * the routing hdr data mblk, then we must * not have processed it at all. So for sure * we are not part of the source routed journey. */ if (byteptr + MIN_EHDR_LEN > mp->b_wptr) { ip2dbg(("ip_source_routed_v6: Routing" " header not processed\n")); return (B_FALSE); } rthdr = (ip6_rthdr0_t *)byteptr; /* * Either we are an intermediate router or the * last hop before destination and we have * already processed the routing header. * If segment_left is greater than or equal to zero, * then we must be the (numaddr - segleft) entry * of the routing header. Although ip6r0_segleft * is a unit8_t variable, we still check for zero * or greater value, if in case the data type * is changed someday in future. 
*/ if (rthdr->ip6r0_segleft > 0 || rthdr->ip6r0_segleft == 0) { numaddr = rthdr->ip6r0_len / 2; addrptr = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr)); addrptr += (numaddr - (rthdr->ip6r0_segleft + 1)); if (addrptr != NULL) { if (ip_type_v6(addrptr, ipst) == IRE_LOCAL) return (B_TRUE); ip1dbg(("ip_source_routed_v6: Not local\n")); } } /* FALLTHRU */ default: ip2dbg(("ip_source_routed_v6: Not source routed here\n")); return (B_FALSE); } } /* * IPv6 fragmentation. Essentially the same as IPv4 fragmentation. * We have not optimized this in terms of number of mblks * allocated. For instance, for each fragment sent we always allocate a * mblk to hold the IPv6 header and fragment header. * * Assumes that all the extension headers are contained in the first mblk * and that the fragment header has has already been added by calling * ip_fraghdr_add_v6. */ int ip_fragment_v6(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len, uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie) { ip6_t *ip6h = (ip6_t *)mp->b_rptr; ip6_t *fip6h; mblk_t *hmp; mblk_t *hmp0; mblk_t *dmp; ip6_frag_t *fraghdr; size_t unfragmentable_len; size_t mlen; size_t max_chunk; uint16_t off_flags; uint16_t offset = 0; ill_t *ill = nce->nce_ill; uint8_t nexthdr; uint8_t *ptr; ip_stack_t *ipst = ill->ill_ipst; uint_t priority = mp->b_band; int error = 0; BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds); if (max_frag == 0) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); ip_drop_output("FragFails: zero max_frag", mp, ill); freemsg(mp); return (EINVAL); } /* * Caller should have added fraghdr_t to pkt_len, and also * updated ip6_plen. */ ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == pkt_len); ASSERT(msgdsize(mp) == pkt_len); /* * Determine the length of the unfragmentable portion of this * datagram. 
This consists of the IPv6 header, a potential * hop-by-hop options header, a potential pre-routing-header * destination options header, and a potential routing header. */ nexthdr = ip6h->ip6_nxt; ptr = (uint8_t *)&ip6h[1]; if (nexthdr == IPPROTO_HOPOPTS) { ip6_hbh_t *hbh_hdr; uint_t hdr_len; hbh_hdr = (ip6_hbh_t *)ptr; hdr_len = 8 * (hbh_hdr->ip6h_len + 1); nexthdr = hbh_hdr->ip6h_nxt; ptr += hdr_len; } if (nexthdr == IPPROTO_DSTOPTS) { ip6_dest_t *dest_hdr; uint_t hdr_len; dest_hdr = (ip6_dest_t *)ptr; if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) { hdr_len = 8 * (dest_hdr->ip6d_len + 1); nexthdr = dest_hdr->ip6d_nxt; ptr += hdr_len; } } if (nexthdr == IPPROTO_ROUTING) { ip6_rthdr_t *rthdr; uint_t hdr_len; rthdr = (ip6_rthdr_t *)ptr; nexthdr = rthdr->ip6r_nxt; hdr_len = 8 * (rthdr->ip6r_len + 1); ptr += hdr_len; } if (nexthdr != IPPROTO_FRAGMENT) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); ip_drop_output("FragFails: bad nexthdr", mp, ill); freemsg(mp); return (EINVAL); } unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h); unfragmentable_len += sizeof (ip6_frag_t); max_chunk = (max_frag - unfragmentable_len) & ~7; /* * Allocate an mblk with enough room for the link-layer * header and the unfragmentable part of the datagram, which includes * the fragment header. This (or a copy) will be used as the * first mblk for each fragment we send. */ hmp = allocb_tmpl(unfragmentable_len + ipst->ips_ip_wroff_extra, mp); if (hmp == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); ip_drop_output("FragFails: no hmp", mp, ill); freemsg(mp); return (ENOBUFS); } hmp->b_rptr += ipst->ips_ip_wroff_extra; hmp->b_wptr = hmp->b_rptr + unfragmentable_len; fip6h = (ip6_t *)hmp->b_rptr; bcopy(ip6h, fip6h, unfragmentable_len); /* * pkt_len is set to the total length of the fragmentable data in this * datagram. For each fragment sent, we will decrement pkt_len * by the amount of fragmentable data sent in that fragment * until len reaches zero. 
*/ pkt_len -= unfragmentable_len; /* * Move read ptr past unfragmentable portion, we don't want this part * of the data in our fragments. */ mp->b_rptr += unfragmentable_len; if (mp->b_rptr == mp->b_wptr) { mblk_t *mp1 = mp->b_cont; freeb(mp); mp = mp1; } while (pkt_len != 0) { mlen = MIN(pkt_len, max_chunk); pkt_len -= mlen; if (pkt_len != 0) { /* Not last */ hmp0 = copyb(hmp); if (hmp0 == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); ip_drop_output("FragFails: copyb failed", mp, ill); freeb(hmp); freemsg(mp); ip1dbg(("ip_fragment_v6: copyb failed\n")); return (ENOBUFS); } off_flags = IP6F_MORE_FRAG; } else { /* Last fragment */ hmp0 = hmp; hmp = NULL; off_flags = 0; } fip6h = (ip6_t *)(hmp0->b_rptr); fraghdr = (ip6_frag_t *)(hmp0->b_rptr + unfragmentable_len - sizeof (ip6_frag_t)); fip6h->ip6_plen = htons((uint16_t)(mlen + unfragmentable_len - IPV6_HDR_LEN)); /* * Note: Optimization alert. * In IPv6 (and IPv4) protocol header, Fragment Offset * ("offset") is 13 bits wide and in 8-octet units. * In IPv6 protocol header (unlike IPv4) in a 16 bit field, * it occupies the most significant 13 bits. * (least significant 13 bits in IPv4). * We do not do any shifts here. Not shifting is same effect * as taking offset value in octet units, dividing by 8 and * then shifting 3 bits left to line it up in place in proper * place protocol header. 
*/ fraghdr->ip6f_offlg = htons(offset) | off_flags; if (!(dmp = ip_carve_mp(&mp, mlen))) { /* mp has already been freed by ip_carve_mp() */ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); ip_drop_output("FragFails: could not carve mp", hmp0, ill); if (hmp != NULL) freeb(hmp); freeb(hmp0); ip1dbg(("ip_carve_mp: failed\n")); return (ENOBUFS); } hmp0->b_cont = dmp; /* Get the priority marking, if any */ hmp0->b_band = priority; BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates); error = postfragfn(hmp0, nce, ixaflags, mlen + unfragmentable_len, xmit_hint, szone, nolzid, ixa_cookie); if (error != 0 && error != EWOULDBLOCK && hmp != NULL) { /* No point in sending the other fragments */ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); ip_drop_output("FragFails: postfragfn failed", hmp, ill); freeb(hmp); freemsg(mp); return (error); } /* No need to redo state machine in loop */ ixaflags &= ~IXAF_REACH_CONF; offset += mlen; } BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs); return (error); } /* * Add a fragment header to an IPv6 packet. * Assumes that all the extension headers are contained in the first mblk. * * The fragment header is inserted after an hop-by-hop options header * and after [an optional destinations header followed by] a routing header. */ mblk_t * ip_fraghdr_add_v6(mblk_t *mp, uint32_t ident, ip_xmit_attr_t *ixa) { ip6_t *ip6h = (ip6_t *)mp->b_rptr; ip6_t *fip6h; mblk_t *hmp; ip6_frag_t *fraghdr; size_t unfragmentable_len; uint8_t nexthdr; uint_t prev_nexthdr_offset; uint8_t *ptr; uint_t priority = mp->b_band; ip_stack_t *ipst = ixa->ixa_ipst; /* * Determine the length of the unfragmentable portion of this * datagram. This consists of the IPv6 header, a potential * hop-by-hop options header, a potential pre-routing-header * destination options header, and a potential routing header. 
	 */
	nexthdr = ip6h->ip6_nxt;
	/*
	 * prev_nexthdr_offset tracks, as a byte offset from ip6h, the
	 * next-header field of the last unfragmentable header seen so far.
	 * That byte is rewritten to IPPROTO_FRAGMENT in the copy below.
	 */
	prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h;
	ptr = (uint8_t *)&ip6h[1];

	if (nexthdr == IPPROTO_HOPOPTS) {
		ip6_hbh_t	*hbh_hdr;
		uint_t		hdr_len;

		hbh_hdr = (ip6_hbh_t *)ptr;
		hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
		nexthdr = hbh_hdr->ip6h_nxt;
		prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt
		    - (uint8_t *)ip6h;
		ptr += hdr_len;
	}
	if (nexthdr == IPPROTO_DSTOPTS) {
		ip6_dest_t	*dest_hdr;
		uint_t		hdr_len;

		dest_hdr = (ip6_dest_t *)ptr;
		/*
		 * A destination options header is only treated as
		 * unfragmentable when a routing header follows it.
		 */
		if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) {
			hdr_len = 8 * (dest_hdr->ip6d_len + 1);
			nexthdr = dest_hdr->ip6d_nxt;
			prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt
			    - (uint8_t *)ip6h;
			ptr += hdr_len;
		}
	}
	if (nexthdr == IPPROTO_ROUTING) {
		ip6_rthdr_t	*rthdr;
		uint_t		hdr_len;

		rthdr = (ip6_rthdr_t *)ptr;
		nexthdr = rthdr->ip6r_nxt;
		prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt
		    - (uint8_t *)ip6h;
		hdr_len = 8 * (rthdr->ip6r_len + 1);
		ptr += hdr_len;
	}
	unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h);

	/*
	 * Allocate an mblk with enough room for the link-layer
	 * header, the unfragmentable part of the datagram, and the
	 * fragment header.
	 */
	hmp = allocb_tmpl(unfragmentable_len + sizeof (ip6_frag_t) +
	    ipst->ips_ip_wroff_extra, mp);
	if (hmp == NULL) {
		ill_t *ill = ixa->ixa_nce->nce_ill;

		/* On allocation failure the original packet is consumed. */
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
		ip_drop_output("ipIfStatsOutDiscards: allocb failure", mp, ill);
		freemsg(mp);
		return (NULL);
	}
	/* Leave ips_ip_wroff_extra bytes of headroom in front of the data. */
	hmp->b_rptr += ipst->ips_ip_wroff_extra;
	hmp->b_wptr = hmp->b_rptr + unfragmentable_len + sizeof (ip6_frag_t);

	fip6h = (ip6_t *)hmp->b_rptr;
	fraghdr = (ip6_frag_t *)(hmp->b_rptr + unfragmentable_len);

	/*
	 * Copy the unfragmentable headers, grow the payload length by the
	 * size of the fragment header, and make the last copied header
	 * point at the fragment header.
	 */
	bcopy(ip6h, fip6h, unfragmentable_len);
	fip6h->ip6_plen = htons(ntohs(fip6h->ip6_plen) + sizeof (ip6_frag_t));
	hmp->b_rptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT;

	/*
	 * The fragment header takes over the displaced next-header value.
	 * Offset and flags start out as zero here.
	 */
	fraghdr->ip6f_nxt = nexthdr;
	fraghdr->ip6f_reserved = 0;
	fraghdr->ip6f_offlg = 0;
	fraghdr->ip6f_ident = htonl(ident);

	/* Get the priority marking, if any */
	hmp->b_band = priority;

	/*
	 * Move read ptr past unfragmentable portion, we don't want this part
	 * of the data in our fragments.
	 */
	mp->b_rptr += unfragmentable_len;
	hmp->b_cont = mp;
	return (hmp);
}

/*
 * Determine if the ill and multicast aspects of that packets
 * "matches" the conn.
 */
boolean_t
conn_wantpacket_v6(conn_t *connp, ip_recv_attr_t *ira, ip6_t *ip6h)
{
	ill_t		*ill = ira->ira_rill;
	zoneid_t	zoneid = ira->ira_zoneid;
	uint_t		in_ifindex;
	in6_addr_t	*v6dst_ptr = &ip6h->ip6_dst;
	in6_addr_t	*v6src_ptr = &ip6h->ip6_src;

	/*
	 * conn_incoming_ifindex is set by IPV6_BOUND_IF and as link-local
	 * scopeid. This is used to limit
	 * unicast and multicast reception to conn_incoming_ifindex.
	 * conn_wantpacket_v6 is called both for unicast and
	 * multicast packets.
*/ in_ifindex = connp->conn_incoming_ifindex; /* mpathd can bind to the under IPMP interface, which we allow */ if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) { if (!IS_UNDER_IPMP(ill)) return (B_FALSE); if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill)) return (B_FALSE); } if (!IPCL_ZONE_MATCH(connp, zoneid)) return (B_FALSE); if (!(ira->ira_flags & IRAF_MULTICAST)) return (B_TRUE); if (connp->conn_multi_router) return (B_TRUE); if (ira->ira_protocol == IPPROTO_RSVP) return (B_TRUE); return (conn_hasmembers_ill_withsrc_v6(connp, v6dst_ptr, v6src_ptr, ira->ira_ill)); } /* * pr_addr_dbg function provides the needed buffer space to call * inet_ntop() function's 3rd argument. This function should be * used by any kernel routine which wants to save INET6_ADDRSTRLEN * stack buffer space in it's own stack frame. This function uses * a buffer from it's own stack and prints the information. * Example: pr_addr_dbg("func: no route for %s\n ", AF_INET, addr) * * Note: This function can call inet_ntop() once. */ void pr_addr_dbg(char *fmt1, int af, const void *addr) { char buf[INET6_ADDRSTRLEN]; if (fmt1 == NULL) { ip0dbg(("pr_addr_dbg: Wrong arguments\n")); return; } /* * This does not compare debug level and just prints * out. Thus it is the responsibility of the caller * to check the appropriate debug-level before calling * this function. */ if (ip_debug > 0) { printf(fmt1, inet_ntop(af, addr, buf, sizeof (buf))); } } /* * Return the length in bytes of the IPv6 headers (base header * extension headers) that will be needed based on the * ip_pkt_t structure passed by the caller. * * The returned length does not include the length of the upper level * protocol (ULP) header. */ int ip_total_hdrs_len_v6(const ip_pkt_t *ipp) { int len; len = IPV6_HDR_LEN; /* * If there's a security label here, then we ignore any hop-by-hop * options the user may try to set. 
	 */
	if (ipp->ipp_fields & IPPF_LABEL_V6) {
		uint_t hopoptslen;
		/*
		 * Note that ipp_label_len_v6 is just the option - not
		 * the hopopts extension header. It also needs to be padded
		 * to a multiple of 8 bytes.
		 */
		ASSERT(ipp->ipp_label_len_v6 != 0);
		hopoptslen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t);
		/* Round up to the next multiple of 8. */
		hopoptslen = (hopoptslen + 7)/8 * 8;
		len += hopoptslen;
	} else if (ipp->ipp_fields & IPPF_HOPOPTS) {
		ASSERT(ipp->ipp_hopoptslen != 0);
		len += ipp->ipp_hopoptslen;
	}

	/*
	 * En-route destination options
	 * Only do them if there's a routing header as well
	 */
	if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) ==
	    (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) {
		ASSERT(ipp->ipp_rthdrdstoptslen != 0);
		len += ipp->ipp_rthdrdstoptslen;
	}
	/* Routing header */
	if (ipp->ipp_fields & IPPF_RTHDR) {
		ASSERT(ipp->ipp_rthdrlen != 0);
		len += ipp->ipp_rthdrlen;
	}
	/* Destination options that are not tied to a routing header */
	if (ipp->ipp_fields & IPPF_DSTOPTS) {
		ASSERT(ipp->ipp_dstoptslen != 0);
		len += ipp->ipp_dstoptslen;
	}
	return (len);
}

/*
 * All-purpose routine to build a header chain of an IPv6 header
 * followed by any required extension headers and a proto header.
 *
 * The caller has to set the source and destination address as well as
 * ip6_plen. The caller has to massage any routing header and compensate
 * for the ULP pseudo-header checksum due to the source route.
 *
 * The extension headers will all be fully filled in.
*/
/*
 * buf/buf_len:	output buffer and the number of bytes expected to be
 *		written; the two are only cross-checked by the ASSERT at
 *		the end, so the caller must size buf correctly (the
 *		conditions mirror those in ip_total_hdrs_len_v6()).
 * ipp:		describes which extension headers to emit and their data.
 * protocol:	value stored in the next-header field of the last header.
 * flowinfo:	supplies the bits of ip6_vcf outside
 *		IPV6_VERS_AND_FLOW_MASK.
 */
void
ip_build_hdrs_v6(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp,
    uint8_t protocol, uint32_t flowinfo)
{
	/*
	 * nxthdr_ptr always points at the next-header byte of the header
	 * emitted most recently, so each new header can patch its own type
	 * into its predecessor.  cp is the write cursor.
	 */
	uint8_t *nxthdr_ptr;
	uint8_t *cp;
	ip6_t	*ip6h = (ip6_t *)buf;

	/* Initialize IPv6 header */
	ip6h->ip6_vcf =
	    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
	    (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);

	if (ipp->ipp_fields & IPPF_TCLASS) {
		/* Overrides the class part of flowinfo */
		ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
		    ipp->ipp_tclass);
	}

	if (ipp->ipp_fields & IPPF_HOPLIMIT)
		ip6h->ip6_hops = ipp->ipp_hoplimit;
	else
		ip6h->ip6_hops = ipp->ipp_unicast_hops;

	/* A v4-mapped ipp_addr is never copied into ip6_src. */
	if ((ipp->ipp_fields & IPPF_ADDR) &&
	    !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
		ip6h->ip6_src = ipp->ipp_addr;

	nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt;
	cp = (uint8_t *)&ip6h[1];
	/*
	 * Here's where we have to start stringing together
	 * any extension headers in the right order:
	 * Hop-by-hop, destination, routing, and final destination opts.
	 */
	/*
	 * If there's a security label here, then we ignore any hop-by-hop
	 * options the user may try to set.
	 */
	if (ipp->ipp_fields & IPPF_LABEL_V6) {
		/*
		 * Hop-by-hop options with the label.
		 * Note that ipp_label_v6 is just the option - not
		 * the hopopts extension header. It also needs to be padded
		 * to a multiple of 8 bytes.
		 */
		ip6_hbh_t	*hbh = (ip6_hbh_t *)cp;
		uint_t		hopoptslen;
		uint_t		padlen;

		/*
		 * padlen first holds the unpadded length, then (after
		 * rounding hopoptslen up to a multiple of 8) the number of
		 * pad bytes required.
		 */
		padlen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t);
		hopoptslen = (padlen + 7)/8 * 8;
		padlen = hopoptslen - padlen;

		*nxthdr_ptr = IPPROTO_HOPOPTS;
		nxthdr_ptr = &hbh->ip6h_nxt;
		hbh->ip6h_len = hopoptslen/8 - 1;
		cp += sizeof (ip6_hbh_t);
		bcopy(ipp->ipp_label_v6, cp, ipp->ipp_label_len_v6);
		cp += ipp->ipp_label_len_v6;

		ASSERT(padlen <= 7);
		switch (padlen) {
		case 0:
			break;
		case 1:
			/* A single pad byte uses the one-byte Pad1 option */
			cp[0] = IP6OPT_PAD1;
			break;
		default:
			/* PadN: type, data length, then zeroed data */
			cp[0] = IP6OPT_PADN;
			cp[1] = padlen - 2;
			bzero(&cp[2], padlen - 2);
			break;
		}
		cp += padlen;
	} else if (ipp->ipp_fields & IPPF_HOPOPTS) {
		/* Hop-by-hop options */
		ip6_hbh_t *hbh = (ip6_hbh_t *)cp;

		*nxthdr_ptr = IPPROTO_HOPOPTS;
		nxthdr_ptr = &hbh->ip6h_nxt;

		/*
		 * The caller-supplied buffer is copied whole, including its
		 * own next-header byte; that byte is overwritten later
		 * through nxthdr_ptr.  The same holds for the copies below.
		 */
		bcopy(ipp->ipp_hopopts, cp, ipp->ipp_hopoptslen);
		cp += ipp->ipp_hopoptslen;
	}
	/*
	 * En-route destination options
	 * Only do them if there's a routing header as well
	 */
	if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) ==
	    (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) {
		ip6_dest_t *dst = (ip6_dest_t *)cp;

		*nxthdr_ptr = IPPROTO_DSTOPTS;
		nxthdr_ptr = &dst->ip6d_nxt;

		bcopy(ipp->ipp_rthdrdstopts, cp, ipp->ipp_rthdrdstoptslen);
		cp += ipp->ipp_rthdrdstoptslen;
	}
	/*
	 * Routing header next
	 */
	if (ipp->ipp_fields & IPPF_RTHDR) {
		ip6_rthdr_t *rt = (ip6_rthdr_t *)cp;

		*nxthdr_ptr = IPPROTO_ROUTING;
		nxthdr_ptr = &rt->ip6r_nxt;

		bcopy(ipp->ipp_rthdr, cp, ipp->ipp_rthdrlen);
		cp += ipp->ipp_rthdrlen;
	}
	/*
	 * Do ultimate destination options
	 */
	if (ipp->ipp_fields & IPPF_DSTOPTS) {
		ip6_dest_t *dest = (ip6_dest_t *)cp;

		*nxthdr_ptr = IPPROTO_DSTOPTS;
		nxthdr_ptr = &dest->ip6d_nxt;

		bcopy(ipp->ipp_dstopts, cp, ipp->ipp_dstoptslen);
		cp += ipp->ipp_dstoptslen;
	}
	/*
	 * Now set the last header pointer to the proto passed in
	 */
	*nxthdr_ptr = protocol;
	/* The caller's buf_len must match exactly what was written. */
	ASSERT((int)(cp - buf) == buf_len);
}

/*
 * Return a pointer to the routing header extension header
 * in the IPv6 header(s) chain passed in.
* If none found, return NULL
 * Assumes that all extension headers are in same mblk as the v6 header
 *
 * ip6h:   start of the IPv6 base header.
 * endptr: first byte past the headers that may be examined.
 */
ip6_rthdr_t *
ip_find_rthdr_v6(ip6_t *ip6h, uint8_t *endptr)
{
	uint8_t	*cur = (uint8_t *)&ip6h[1];
	uint8_t	hdrtype = ip6h->ip6_nxt;
	uint_t	skip;

	/* Fast path: the routing header directly follows the base header. */
	if (hdrtype == IPPROTO_ROUTING)
		return ((ip6_rthdr_t *)cur);

	/*
	 * The routing header will precede all extension headers
	 * other than the hop-by-hop and destination options
	 * extension headers, so if we see anything other than those,
	 * we're done and didn't find it.
	 * We could see a destination options header alone but no
	 * routing header, in which case we'll return NULL as soon as
	 * we see anything after that.
	 * A fragment header is stepped over as well.
	 */
	for (; cur < endptr; cur += skip) {
		/* Is there enough left for len + nexthdr? */
		if (cur + MIN_EHDR_LEN > endptr)
			return (NULL);

		if (hdrtype == IPPROTO_ROUTING)
			return ((ip6_rthdr_t *)cur);

		if (hdrtype == IPPROTO_HOPOPTS || hdrtype == IPPROTO_DSTOPTS) {
			/*
			 * Hop-by-hop and destination option headers are
			 * identical, so the destination options layout
			 * serves as the template for both.
			 */
			ip6_dest_t *opthdr = (ip6_dest_t *)cur;

			skip = 8 * (opthdr->ip6d_len + 1);
			hdrtype = opthdr->ip6d_nxt;
		} else if (hdrtype == IPPROTO_FRAGMENT) {
			ip6_frag_t *frag = (ip6_frag_t *)cur;

			skip = sizeof (ip6_frag_t);
			hdrtype = frag->ip6f_nxt;
		} else {
			return (NULL);
		}
	}
	return (NULL);
}

/*
 * Called for source-routed packets originating on this node.
 * Manipulates the original routing header by moving every entry up
 * one slot, placing the first entry in the v6 header's v6_dst field,
 * and placing the ultimate destination in the routing header's last
 * slot.
* * Returns the checksum diference between the ultimate destination * (last hop in the routing header when the packet is sent) and * the first hop (ip6_dst when the packet is sent) */ /* ARGSUSED2 */ uint32_t ip_massage_options_v6(ip6_t *ip6h, ip6_rthdr_t *rth, netstack_t *ns) { uint_t numaddr; uint_t i; in6_addr_t *addrptr; in6_addr_t tmp; ip6_rthdr0_t *rthdr = (ip6_rthdr0_t *)rth; uint32_t cksm; uint32_t addrsum = 0; uint16_t *ptr; /* * Perform any processing needed for source routing. * We know that all extension headers will be in the same mblk * as the IPv6 header. */ /* * If no segments left in header, or the header length field is zero, * don't move hop addresses around; * Checksum difference is zero. */ if ((rthdr->ip6r0_segleft == 0) || (rthdr->ip6r0_len == 0)) return (0); ptr = (uint16_t *)&ip6h->ip6_dst; cksm = 0; for (i = 0; i < (sizeof (in6_addr_t) / sizeof (uint16_t)); i++) { cksm += ptr[i]; } cksm = (cksm & 0xFFFF) + (cksm >> 16); /* * Here's where the fun begins - we have to * move all addresses up one spot, take the * first hop and make it our first ip6_dst, * and place the ultimate destination in the * newly-opened last slot. */ addrptr = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr)); numaddr = rthdr->ip6r0_len / 2; tmp = *addrptr; for (i = 0; i < (numaddr - 1); addrptr++, i++) { *addrptr = addrptr[1]; } *addrptr = ip6h->ip6_dst; ip6h->ip6_dst = tmp; /* * From the checksummed ultimate destination subtract the checksummed * current ip6_dst (the first hop address). Return that number. * (In the v4 case, the second part of this is done in each routine * that calls ip_massage_options(). We do it all in this one place * for v6). 
*/ ptr = (uint16_t *)&ip6h->ip6_dst; for (i = 0; i < (sizeof (in6_addr_t) / sizeof (uint16_t)); i++) { addrsum += ptr[i]; } cksm -= ((addrsum >> 16) + (addrsum & 0xFFFF)); if ((int)cksm < 0) cksm--; cksm = (cksm & 0xFFFF) + (cksm >> 16); return (cksm); } void *ip6_kstat_init(netstackid_t stackid, ip6_stat_t *ip6_statisticsp) { kstat_t *ksp; ip6_stat_t template = { { "ip6_udp_fannorm", KSTAT_DATA_UINT64 }, { "ip6_udp_fanmb", KSTAT_DATA_UINT64 }, { "ip6_recv_pullup", KSTAT_DATA_UINT64 }, { "ip6_db_ref", KSTAT_DATA_UINT64 }, { "ip6_notaligned", KSTAT_DATA_UINT64 }, { "ip6_multimblk", KSTAT_DATA_UINT64 }, { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 }, { "ip6_out_sw_cksum", KSTAT_DATA_UINT64 }, { "ip6_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, { "ip6_in_sw_cksum", KSTAT_DATA_UINT64 }, { "ip6_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip6_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip6_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, { "ip6_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip6_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip6_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, }; ksp = kstat_create_netstack("ip", 0, "ip6stat", "net", KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL, stackid); if (ksp == NULL) return (NULL); bcopy(&template, ip6_statisticsp, sizeof (template)); ksp->ks_data = (void *)ip6_statisticsp; ksp->ks_private = (void *)(uintptr_t)stackid; kstat_install(ksp); return (ksp); } void ip6_kstat_fini(netstackid_t stackid, kstat_t *ksp) { if (ksp != NULL) { ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); kstat_delete_netstack(ksp, stackid); } } /* * The following two functions set and get the value for the * IPV6_SRC_PREFERENCES socket option. */ int ip6_set_src_preferences(ip_xmit_attr_t *ixa, uint32_t prefs) { /* * We only support preferences that are covered by * IPV6_PREFER_SRC_MASK. 
	 */
	if (prefs & ~IPV6_PREFER_SRC_MASK)
		return (EINVAL);

	/*
	 * Look for conflicting preferences or default preferences.  If
	 * both bits of a related pair are clear, the application wants the
	 * system's default value for that pair.  Both bits in a pair can't
	 * be set.
	 */
	/* MIP pair */
	if ((prefs & IPV6_PREFER_SRC_MIPMASK) == 0) {
		prefs |= IPV6_PREFER_SRC_MIPDEFAULT;
	} else if ((prefs & IPV6_PREFER_SRC_MIPMASK) ==
	    IPV6_PREFER_SRC_MIPMASK) {
		return (EINVAL);
	}
	/* TMP pair */
	if ((prefs & IPV6_PREFER_SRC_TMPMASK) == 0) {
		prefs |= IPV6_PREFER_SRC_TMPDEFAULT;
	} else if ((prefs & IPV6_PREFER_SRC_TMPMASK) ==
	    IPV6_PREFER_SRC_TMPMASK) {
		return (EINVAL);
	}
	/* CGA pair */
	if ((prefs & IPV6_PREFER_SRC_CGAMASK) == 0) {
		prefs |= IPV6_PREFER_SRC_CGADEFAULT;
	} else if ((prefs & IPV6_PREFER_SRC_CGAMASK) ==
	    IPV6_PREFER_SRC_CGAMASK) {
		return (EINVAL);
	}

	/* Store the preferences with the defaults filled in. */
	ixa->ixa_src_preferences = prefs;
	return (0);
}

/*
 * Copy the stored source preferences into *val and return the size of
 * the stored value.
 */
size_t
ip6_get_src_preferences(ip_xmit_attr_t *ixa, uint32_t *val)
{
	*val = ixa->ixa_src_preferences;
	return (sizeof (ixa->ixa_src_preferences));
}

/*
 * Get the size of the IP options (including the IP headers size)
 * without including the AH header's size. If till_ah is B_FALSE,
 * and if AH header is present, dest options beyond AH header will
 * also be included in the returned size.
*/
/*
 * mp:	    message whose first mblk starts with the IPv6 header and holds
 *	    the extension headers up to (and including) any AH header.
 * till_ah: B_TRUE to stop counting at the AH header.
 *
 * Returns the accumulated header size in bytes.  If a destination options
 * header is expected after AH but the message has no further mblk to hold
 * it, the size counted so far is returned.
 */
int
ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah)
{
	ip6_t *ip6h;
	uint8_t nexthdr;
	uint8_t *whereptr;
	ip6_hbh_t *hbhhdr;
	ip6_dest_t *dsthdr;
	ip6_rthdr_t *rthdr;
	int ehdrlen;
	int size;
	ah_t *ah;

	ip6h = (ip6_t *)mp->b_rptr;
	size = IPV6_HDR_LEN;
	nexthdr = ip6h->ip6_nxt;
	whereptr = (uint8_t *)&ip6h[1];
	for (;;) {
		/* Assume IP has already stripped it */
		ASSERT(nexthdr != IPPROTO_FRAGMENT);
		switch (nexthdr) {
		case IPPROTO_HOPOPTS:
			hbhhdr = (ip6_hbh_t *)whereptr;
			nexthdr = hbhhdr->ip6h_nxt;
			ehdrlen = 8 * (hbhhdr->ip6h_len + 1);
			break;
		case IPPROTO_DSTOPTS:
			dsthdr = (ip6_dest_t *)whereptr;
			nexthdr = dsthdr->ip6d_nxt;
			ehdrlen = 8 * (dsthdr->ip6d_len + 1);
			break;
		case IPPROTO_ROUTING:
			rthdr = (ip6_rthdr_t *)whereptr;
			nexthdr = rthdr->ip6r_nxt;
			ehdrlen = 8 * (rthdr->ip6r_len + 1);
			break;
		default :
			if (till_ah) {
				ASSERT(nexthdr == IPPROTO_AH);
				return (size);
			}
			/*
			 * If we don't have a AH header to traverse,
			 * return now. This happens normally for
			 * outbound datagrams where we have not inserted
			 * the AH header.
			 */
			if (nexthdr != IPPROTO_AH) {
				return (size);
			}

			/*
			 * We don't include the AH header's size
			 * to be symmetrical with other cases where
			 * we either don't have a AH header (outbound)
			 * or peek into the AH header yet (inbound and
			 * not pulled up yet).
			 */
			ah = (ah_t *)whereptr;
			nexthdr = ah->ah_nexthdr;
			ehdrlen = (ah->ah_length << 2) + 8;

			if (nexthdr == IPPROTO_DSTOPTS) {
				if (whereptr + ehdrlen >= mp->b_wptr) {
					/*
					 * The destination options header
					 * is not part of the first mblk.
					 * Without a continuation mblk
					 * there is nothing to look at, so
					 * stop here rather than
					 * dereference a NULL b_cont.
					 */
					if (mp->b_cont == NULL)
						return (size);
					whereptr = mp->b_cont->b_rptr;
				} else {
					whereptr += ehdrlen;
				}

				dsthdr = (ip6_dest_t *)whereptr;
				ehdrlen = 8 * (dsthdr->ip6d_len + 1);
				size += ehdrlen;
			}
			return (size);
		}
		whereptr += ehdrlen;
		size += ehdrlen;
	}
}

/*
 * Utility routine that checks if `v6srcp' is a valid address on underlying
 * interface `ill'.  If `ipifp' is non-NULL, it's set to a held ipif
 * associated with `v6srcp' on success.
NOTE: if this is not called from
 * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the
 * group during or after this lookup.
 *
 * Returns B_TRUE when ipif_lookup_addr_exact_v6() finds a match and
 * B_FALSE otherwise.  When the caller passes a NULL `ipifp' the reference
 * obtained by the lookup is released before returning.
 */
boolean_t
ipif_lookup_testaddr_v6(ill_t *ill, const in6_addr_t *v6srcp, ipif_t **ipifp)
{
	ipif_t	*match = ipif_lookup_addr_exact_v6(v6srcp, ill, ill->ill_ipst);

	if (match == NULL) {
		if (ip_debug > 2) {
			pr_addr_dbg("ipif_lookup_testaddr_v6: cannot find ipif for "
			    "src %s\n", AF_INET6, v6srcp);
		}
		return (B_FALSE);
	}

	/* Hand the held ipif to the caller, or drop the hold if unwanted. */
	if (ipifp == NULL)
		ipif_refrele(match);
	else
		*ipifp = match;
	return (B_TRUE);
}