/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ #include <sys/types.h> #include <sys/stream.h> #include <sys/strsubr.h> #include <sys/dlpi.h> #include <sys/strsun.h> #include <sys/zone.h> #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/cmn_err.h> #include <sys/debug.h> #include <sys/atomic.h> #include <sys/systm.h> #include <sys/param.h> #include <sys/kmem.h> #include <sys/sdt.h> #include <sys/socket.h> #include <sys/mac.h> #include <net/if.h> #include <net/if_arp.h> #include <net/route.h> #include <sys/sockio.h> #include <netinet/in.h> #include <net/if_dl.h> #include <inet/common.h> #include <inet/mi.h> #include <inet/mib2.h> #include <inet/nd.h> #include <inet/arp.h> #include <inet/snmpcom.h> #include <inet/kstatcom.h> #include <netinet/igmp_var.h> #include <netinet/ip6.h> #include <netinet/icmp6.h> #include <netinet/sctp.h> #include <inet/ip.h> #include <inet/ip_impl.h> #include <inet/ip6.h> #include <inet/ip6_asp.h> #include <inet/tcp.h> #include <inet/ip_multi.h> #include <inet/ip_if.h> #include <inet/ip_ire.h> #include <inet/ip_ftable.h> #include <inet/ip_rts.h> #include <inet/optcom.h> #include <inet/ip_ndp.h> #include <inet/ip_listutils.h> #include <netinet/igmp.h> #include <netinet/ip_mroute.h> #include <inet/ipp_common.h> #include <net/pfkeyv2.h> #include <inet/sadb.h> #include <inet/ipsec_impl.h> #include <inet/ipdrop.h> #include <inet/ip_netinfo.h> #include <sys/pattr.h> #include <inet/ipclassifier.h> #include <inet/sctp_ip.h> #include <inet/sctp/sctp_impl.h> #include <inet/udp_impl.h> #include <sys/sunddi.h> #include <sys/tsol/label.h> #include <sys/tsol/tnet.h> #ifdef DEBUG extern boolean_t skip_sctp_cksum; #endif int ip_output_simple_v6(mblk_t *mp, ip_xmit_attr_t *ixa) { ip6_t *ip6h; in6_addr_t firsthop; /* In IP header */ in6_addr_t dst; /* End of source route, or ip6_dst if none */ ire_t *ire; in6_addr_t setsrc; int error; ill_t *ill = NULL; dce_t *dce = NULL; nce_t *nce; iaflags_t ixaflags = ixa->ixa_flags; ip_stack_t *ipst = ixa->ixa_ipst; uint8_t *nexthdrp; boolean_t repeat = B_FALSE; boolean_t multirt = B_FALSE; uint_t ifindex; int64_t now; ip6h = (ip6_t *)mp->b_rptr; ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); ASSERT(ixa->ixa_nce == NULL); ixa->ixa_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; ASSERT(ixa->ixa_pktlen == msgdsize(mp)); if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ixa->ixa_ip_hdr_length, &nexthdrp)) { /* Malformed packet */ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); ip_drop_output("ipIfStatsOutDiscards", mp, NULL); freemsg(mp); return (EINVAL); } ixa->ixa_protocol = *nexthdrp; /* * Assumes that source routed packets have already been massaged by * the ULP (ip_massage_options_v6) and as a result ip6_dst is the next * hop in the source route. The final destination is used for IPsec * policy and DCE lookup. */ firsthop = ip6h->ip6_dst; dst = ip_get_dst_v6(ip6h, mp, NULL); repeat_ire: error = 0; setsrc = ipv6_all_zeros; ire = ip_select_route_v6(&firsthop, ip6h->ip6_src, ixa, NULL, &setsrc, &error, &multirt); ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ if (error != 0) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); ip_drop_output("ipIfStatsOutDiscards", mp, NULL); freemsg(mp); goto done; } if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) { /* ire_ill might be NULL hence need to skip some code */ if (ixaflags & IXAF_SET_SOURCE) ip6h->ip6_src = ipv6_loopback; ixa->ixa_fragsize = IP_MAXPACKET; ire->ire_ob_pkt_count++; BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); /* No dce yet; use default one */ error = (ire->ire_sendfn)(ire, mp, ip6h, ixa, &ipst->ips_dce_default->dce_ident); goto done; } /* Note that ip6_dst is only used for IRE_MULTICAST */ nce = ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst); if (nce == NULL) { /* Allocation failure? */ ip_drop_output("ire_to_nce", mp, ill); freemsg(mp); error = ENOBUFS; goto done; } if (nce->nce_is_condemned) { nce_t *nce1; nce1 = ire_handle_condemned_nce(nce, ire, NULL, ip6h, B_TRUE); nce_refrele(nce); if (nce1 == NULL) { if (!repeat) { /* Try finding a better IRE */ repeat = B_TRUE; ire_refrele(ire); goto repeat_ire; } /* Tried twice - drop packet */ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); ip_drop_output("No nce", mp, ill); freemsg(mp); error = ENOBUFS; goto done; } nce = nce1; } /* * For multicast with multirt we have a flag passed back from * ire_lookup_multi_ill_v6 since we don't have an IRE for each * possible multicast address. * We also need a flag for multicast since we can't check * whether RTF_MULTIRT is set in ixa_ire for multicast. */ if (multirt) { ixa->ixa_postfragfn = ip_postfrag_multirt_v6; ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; } else { ixa->ixa_postfragfn = ire->ire_postfragfn; ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; } ASSERT(ixa->ixa_nce == NULL); ixa->ixa_nce = nce; /* * Check for a dce_t with a path mtu. */ ifindex = 0; if (IN6_IS_ADDR_LINKSCOPE(&dst)) ifindex = nce->nce_common->ncec_ill->ill_phyint->phyint_ifindex; dce = dce_lookup_v6(&dst, ifindex, ipst, NULL); ASSERT(dce != NULL); if (!(ixaflags & IXAF_PMTU_DISCOVERY)) { ixa->ixa_fragsize = IPV6_MIN_MTU; } else if (dce->dce_flags & DCEF_PMTU) { /* * To avoid a periodic timer to increase the path MTU we * look at dce_last_change_time each time we send a packet. */ now = ddi_get_lbolt64(); if (TICK_TO_SEC(now) - dce->dce_last_change_time > ipst->ips_ip_pathmtu_interval) { /* * Older than 20 minutes. Drop the path MTU information. */ mutex_enter(&dce->dce_lock); dce->dce_flags &= ~DCEF_PMTU; dce->dce_last_change_time = TICK_TO_SEC(now); mutex_exit(&dce->dce_lock); dce_increment_generation(dce); ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); } else { uint_t fragsize; fragsize = ip_get_base_mtu(nce->nce_ill, ire); if (fragsize > dce->dce_pmtu) fragsize = dce->dce_pmtu; ixa->ixa_fragsize = fragsize; } } else { ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); } /* * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp * interface for source address selection. */ ill = ire_nexthop_ill(ire); if (ixaflags & IXAF_SET_SOURCE) { in6_addr_t src; /* * We use the final destination to get * correct selection for source routed packets */ /* If unreachable we have no ill but need some source */ if (ill == NULL) { src = ipv6_loopback; error = 0; } else { error = ip_select_source_v6(ill, &setsrc, &dst, ixa->ixa_zoneid, ipst, B_FALSE, ixa->ixa_src_preferences, &src, NULL, NULL); } if (error != 0) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); ip_drop_output("ipIfStatsOutDiscards - no source", mp, ill); freemsg(mp); goto done; } ip6h->ip6_src = src; } else if (ixaflags & IXAF_VERIFY_SOURCE) { /* Check if the IP source is assigned to the host. */ if (!ip_verify_src(mp, ixa, NULL)) { /* Don't send a packet with a source that isn't ours */ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); ip_drop_output("ipIfStatsOutDiscards - invalid source", mp, ill); freemsg(mp); error = EADDRNOTAVAIL; goto done; } } /* * Check against global IPsec policy to set the AH/ESP attributes. * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate. */ if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { ASSERT(ixa->ixa_ipsec_policy == NULL); mp = ip_output_attach_policy(mp, NULL, ip6h, NULL, ixa); if (mp == NULL) { /* MIB and ip_drop_packet already done */ return (EHOSTUNREACH); /* IPsec policy failure */ } } if (ill != NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); } else { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); } /* * We update the statistics on the most specific IRE i.e., the first * one we found. * We don't have an IRE when we fragment, hence ire_ob_pkt_count * can only count the use prior to fragmentation. However the MIB * counters on the ill will be incremented in post fragmentation. */ ire->ire_ob_pkt_count++; /* * Based on ire_type and ire_flags call one of: * ire_send_local_v6 - for IRE_LOCAL and IRE_LOOPBACK * ire_send_multirt_v6 - if RTF_MULTIRT * ire_send_noroute_v6 - if RTF_REJECT or RTF_BLACHOLE * ire_send_multicast_v6 - for IRE_MULTICAST * ire_send_wire_v6 - for the rest. */ error = (ire->ire_sendfn)(ire, mp, ip6h, ixa, &dce->dce_ident); done: ire_refrele(ire); if (dce != NULL) dce_refrele(dce); if (ill != NULL) ill_refrele(ill); if (ixa->ixa_nce != NULL) nce_refrele(ixa->ixa_nce); ixa->ixa_nce = NULL; return (error); } /* * ire_sendfn() functions. * These functions use the following xmit_attr: * - ixa_fragsize - read to determine whether or not to fragment * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec * - ixa_ipsec_* are used inside IPsec * - IXAF_LOOPBACK_COPY - for multicast */ /* * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK * * The checks for restrict_interzone_loopback are done in ire_route_recursive. */ /* ARGSUSED4 */ int ire_send_local_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_xmit_attr_t *ixa, uint32_t *identp) { ip6_t *ip6h = (ip6_t *)iph_arg; ip_stack_t *ipst = ixa->ixa_ipst; ill_t *ill = ire->ire_ill; ip_recv_attr_t iras; /* NOTE: No bzero for performance */ uint_t pktlen = ixa->ixa_pktlen; /* * No fragmentation, no nce, and no application of IPsec. * * * Note different order between IP provider and FW_HOOKS than in * send_wire case. */ /* * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the * send probe, but not the receive probe. */ DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h, int, 1); DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL, ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp); if (HOOKS6_INTERESTED_LOOPBACK_OUT(ipst)) { int error; FW_HOOKS(ipst->ips_ip6_loopback_out_event, ipst->ips_ipv6firewall_loopback_out, NULL, ill, ip6h, mp, mp, 0, ipst, error); DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp); if (mp == NULL) return (error); /* * Even if the destination was changed by the filter we use the * forwarding decision that was made based on the address * in ip_output/ip_set_destination. */ /* Length could be different */ ip6h = (ip6_t *)mp->b_rptr; pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; } /* * If a callback is enabled then we need to know the * source and destination zoneids for the packet. We already * have those handy. */ if (ipst->ips_ip6_observe.he_interested) { zoneid_t szone, dzone; zoneid_t stackzoneid; stackzoneid = netstackid_to_zoneid( ipst->ips_netstack->netstack_stackid); if (stackzoneid == GLOBAL_ZONEID) { /* Shared-IP zone */ dzone = ire->ire_zoneid; szone = ixa->ixa_zoneid; } else { szone = dzone = stackzoneid; } ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); } /* Handle lo0 stats */ ipst->ips_loopback_packets++; /* * Update output mib stats. Note that we can't move into the icmp * sender (icmp_output etc) since they don't know the ill and the * stats are per ill. */ if (ixa->ixa_protocol == IPPROTO_ICMPV6) { icmp6_t *icmp6; icmp6 = (icmp6_t *)((uchar_t *)ip6h + ixa->ixa_ip_hdr_length); icmp_update_out_mib_v6(ill, icmp6); } DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill, ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp); if (HOOKS6_INTERESTED_LOOPBACK_IN(ipst)) { int error; FW_HOOKS(ipst->ips_ip6_loopback_in_event, ipst->ips_ipv6firewall_loopback_in, ill, NULL, ip6h, mp, mp, 0, ipst, error); DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp); if (mp == NULL) return (error); /* * Even if the destination was changed by the filter we use the * forwarding decision that was made based on the address * in ip_output/ip_set_destination. */ /* Length could be different */ ip6h = (ip6_t *)mp->b_rptr; pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; } DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h, int, 1); /* Map ixa to ira including IPsec policies */ ipsec_out_to_in(ixa, ill, &iras); iras.ira_pktlen = pktlen; ire->ire_ib_pkt_count++; BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen); /* Destined to ire_zoneid - use that for fanout */ iras.ira_zoneid = ire->ire_zoneid; if (is_system_labeled()) { iras.ira_flags |= IRAF_SYSTEM_LABELED; /* * This updates ira_cred, ira_tsl and ira_free_flags based * on the label. We don't expect this to ever fail for * loopback packets, so we silently drop the packet should it * fail. */ if (!tsol_get_pkt_label(mp, IPV6_VERSION, &iras)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); ip_drop_input("tsol_get_pkt_label", mp, ill); freemsg(mp); return (0); } ASSERT(iras.ira_tsl != NULL); /* tsol_get_pkt_label sometimes does pullupmsg */ ip6h = (ip6_t *)mp->b_rptr; } ip_fanout_v6(mp, ip6h, &iras); /* We moved any IPsec refs from ixa to iras */ ira_cleanup(&iras, B_FALSE); return (0); } static void multirt_check_v6(ire_t *ire, ip6_t *ip6h, ip_xmit_attr_t *ixa) { ip_stack_t *ipst = ixa->ixa_ipst; /* Limit the TTL on multirt packets. Do this even if IPV6_HOPLIMIT */ if (ire->ire_type & IRE_MULTICAST) { if (ip6h->ip6_hops > 1) { ip2dbg(("ire_send_multirt_v6: forcing multicast " "multirt TTL to 1 (was %d)\n", ip6h->ip6_hops)); ip6h->ip6_hops = 1; } ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; } else if ((ipst->ips_ip_multirt_ttl > 0) && (ip6h->ip6_hops > ipst->ips_ip_multirt_ttl)) { ip6h->ip6_hops = ipst->ips_ip_multirt_ttl; /* * Need to ensure we don't increase the ttl should we go through * ire_send_multicast. */ ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; } /* For IPv6 this also needs to insert a fragment header */ ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR; } /* * ire_sendfn for IRE_MULTICAST * * Note that we do path MTU discovery by default for IPv6 multicast. But * since unconnected UDP and RAW sockets don't set IXAF_PMTU_DISCOVERY * only connected sockets get this by default. */ int ire_send_multicast_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_xmit_attr_t *ixa, uint32_t *identp) { ip6_t *ip6h = (ip6_t *)iph_arg; ip_stack_t *ipst = ixa->ixa_ipst; ill_t *ill = ire->ire_ill; iaflags_t ixaflags = ixa->ixa_flags; /* * The IRE_MULTICAST is the same whether or not multirt is in use. * Hence we need special-case code. */ if (ixaflags & IXAF_MULTIRT_MULTICAST) multirt_check_v6(ire, ip6h, ixa); /* * Check if anything in ip_input_v6 wants a copy of the transmitted * packet (after IPsec and fragmentation) * * 1. Multicast routers always need a copy unless SO_DONTROUTE is set * RSVP and the rsvp daemon is an example of a * protocol and user level process that * handles it's own routing. Hence, it uses the * SO_DONTROUTE option to accomplish this. * 2. If the sender has set IP_MULTICAST_LOOP, then we just * check whether there are any receivers for the group on the ill * (ignoring the zoneid). * 3. If IP_MULTICAST_LOOP is not set, then we check if there are * any members in other shared-IP zones. * If such members exist, then we indicate that the sending zone * shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP * behavior. * * When we loopback we skip hardware checksum to make sure loopback * copy is checksumed. * * Note that ire_ill is the upper in the case of IPMP. */ ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM); if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 && !(ixaflags & IXAF_DONTROUTE)) { ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; } else if (ixaflags & IXAF_MULTICAST_LOOP) { /* * If this zone or any other zone has members then loopback * a copy. */ if (ill_hasmembers_v6(ill, &ip6h->ip6_dst)) ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; } else if (ipst->ips_netstack->netstack_numzones > 1) { /* * This zone should not have a copy. But there are some other * zones which might have members. */ if (ill_hasmembers_otherzones_v6(ill, &ip6h->ip6_dst, ixa->ixa_zoneid)) { ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET; ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid; ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; } } /* * Unless IPV6_HOPLIMIT or ire_send_multirt_v6 already set a ttl, * force the ttl to the IP_MULTICAST_TTL value */ if (!(ixaflags & IXAF_NO_TTL_CHANGE)) { ip6h->ip6_hops = ixa->ixa_multicast_ttl; } return (ire_send_wire_v6(ire, mp, ip6h, ixa, identp)); } /* * ire_sendfn for IREs with RTF_MULTIRT */ int ire_send_multirt_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_xmit_attr_t *ixa, uint32_t *identp) { ip6_t *ip6h = (ip6_t *)iph_arg; multirt_check_v6(ire, ip6h, ixa); if (ire->ire_type & IRE_MULTICAST) return (ire_send_multicast_v6(ire, mp, ip6h, ixa, identp)); else return (ire_send_wire_v6(ire, mp, ip6h, ixa, identp)); } /* * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE */ /* ARGSUSED4 */ int ire_send_noroute_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_xmit_attr_t *ixa, uint32_t *identp) { ip6_t *ip6h = (ip6_t *)iph_arg; ip_stack_t *ipst = ixa->ixa_ipst; ill_t *ill; ip_recv_attr_t iras; boolean_t dummy; BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); if (ire->ire_type & IRE_NOROUTE) { /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */ ip_rts_change_v6(RTM_MISS, &ip6h->ip6_dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst); } if (ire->ire_flags & RTF_BLACKHOLE) { ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL); freemsg(mp); /* No error even for local senders - silent blackhole */ return (0); } ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL); /* * We need an ill_t for the ip_recv_attr_t even though this packet * was never received and icmp_unreachable doesn't currently use * ira_ill. */ ill = ill_lookup_on_name("lo0", B_FALSE, !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst); if (ill == NULL) { freemsg(mp); return (EHOSTUNREACH); } bzero(&iras, sizeof (iras)); /* Map ixa to ira including IPsec policies */ ipsec_out_to_in(ixa, ill, &iras); icmp_unreachable_v6(mp, ICMP6_DST_UNREACH_NOROUTE, B_FALSE, &iras); /* We moved any IPsec refs from ixa to iras */ ira_cleanup(&iras, B_FALSE); ill_refrele(ill); return (EHOSTUNREACH); } /* * Calculate a checksum ignoring any hardware capabilities * * Returns B_FALSE if the packet was too short for the checksum. Caller * should free and do stats. */ static boolean_t ip_output_sw_cksum_v6(mblk_t *mp, ip6_t *ip6h, ip_xmit_attr_t *ixa) { ip_stack_t *ipst = ixa->ixa_ipst; uint_t pktlen = ixa->ixa_pktlen; uint16_t *cksump; uint32_t cksum; uint8_t protocol = ixa->ixa_protocol; uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; #define iphs ((uint16_t *)ip6h) /* Just in case it contained garbage */ DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; /* * Calculate ULP checksum */ if (protocol == IPPROTO_TCP) { cksump = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_length); cksum = IP_TCP_CSUM_COMP; } else if (protocol == IPPROTO_UDP) { cksump = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_length); cksum = IP_UDP_CSUM_COMP; } else if (protocol == IPPROTO_SCTP) { sctp_hdr_t *sctph; ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); /* * Zero out the checksum field to ensure proper * checksum calculation. */ sctph->sh_chksum = 0; #ifdef DEBUG if (!skip_sctp_cksum) #endif sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); return (B_TRUE); } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { /* * icmp has placed length and routing * header adjustment in the checksum field. */ cksump = (uint16_t *)(((uint8_t *)ip6h) + ip_hdr_length + ixa->ixa_raw_cksum_offset); cksum = htons(protocol); } else if (protocol == IPPROTO_ICMPV6) { cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length); cksum = IP_ICMPV6_CSUM_COMP; /* Pseudo-header cksum */ } else { return (B_TRUE); } /* ULP puts the checksum field is in the first mblk */ ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); /* * We accumulate the pseudo header checksum in cksum. * This is pretty hairy code, so watch close. One * thing to keep in mind is that UDP and TCP have * stored their respective datagram lengths in their * checksum fields. This lines things up real nice. */ cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + iphs[8] + iphs[9] + iphs[10] + iphs[11] + iphs[12] + iphs[13] + iphs[14] + iphs[15] + iphs[16] + iphs[17] + iphs[18] + iphs[19]; cksum = IP_CSUM(mp, ip_hdr_length, cksum); /* * For UDP/IPv6 a zero UDP checksum is not allowed. * Change to 0xffff */ if (protocol == IPPROTO_UDP && cksum == 0) *cksump = ~cksum; else *cksump = cksum; IP6_STAT(ipst, ip6_out_sw_cksum); IP6_STAT_UPDATE(ipst, ip6_out_sw_cksum_bytes, pktlen); /* No IP header checksum for IPv6 */ return (B_TRUE); #undef iphs } /* There are drivers that can't do partial checksum for ICMPv6 */ int nxge_cksum_workaround = 1; /* * Calculate the ULP checksum - try to use hardware. * In the case of MULTIRT or multicast the * IXAF_NO_HW_CKSUM is set in which case we use software. * * Returns B_FALSE if the packet was too short for the checksum. Caller * should free and do stats. */ static boolean_t ip_output_cksum_v6(iaflags_t ixaflags, mblk_t *mp, ip6_t *ip6h, ip_xmit_attr_t *ixa, ill_t *ill) { uint_t pktlen = ixa->ixa_pktlen; uint16_t *cksump; uint16_t hck_flags; uint32_t cksum; uint8_t protocol = ixa->ixa_protocol; uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; #define iphs ((uint16_t *)ip6h) if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || !dohwcksum) { return (ip_output_sw_cksum_v6(mp, ip6h, ixa)); } /* * Calculate ULP checksum. Note that we don't use cksump and cksum * if the ill has FULL support. */ if (protocol == IPPROTO_TCP) { cksump = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_length); cksum = IP_TCP_CSUM_COMP; /* Pseudo-header cksum */ } else if (protocol == IPPROTO_UDP) { cksump = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_length); cksum = IP_UDP_CSUM_COMP; /* Pseudo-header cksum */ } else if (protocol == IPPROTO_SCTP) { sctp_hdr_t *sctph; ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); /* * Zero out the checksum field to ensure proper * checksum calculation. */ sctph->sh_chksum = 0; #ifdef DEBUG if (!skip_sctp_cksum) #endif sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); goto ip_hdr_cksum; } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { /* * icmp has placed length and routing * header adjustment in the checksum field. */ cksump = (uint16_t *)(((uint8_t *)ip6h) + ip_hdr_length + ixa->ixa_raw_cksum_offset); cksum = htons(protocol); } else if (protocol == IPPROTO_ICMPV6) { /* * Currently we assume no HW support for ICMP checksum calc. * * When HW support is advertised for ICMP, we'll want the * following to be set: * cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length); * cksum = IP_ICMPV6_CSUM_COMP; Pseudo-header cksum */ return (ip_output_sw_cksum_v6(mp, ip6h, ixa)); } else { ip_hdr_cksum: /* No IP header checksum for IPv6 */ return (B_TRUE); } /* ULP puts the checksum field is in the first mblk */ ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); /* * Underlying interface supports hardware checksum offload for * the payload; leave the payload checksum for the hardware to * calculate. N.B: We only need to set up checksum info on the * first mblk. */ hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags; DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; if (hck_flags & HCKSUM_INET_FULL_V6) { /* * Hardware calculates pseudo-header, header and the * payload checksums, so clear the checksum field in * the protocol header. */ *cksump = 0; DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; return (B_TRUE); } if (((hck_flags) & HCKSUM_INET_PARTIAL) && (protocol != IPPROTO_ICMPV6 || !nxge_cksum_workaround)) { /* * Partial checksum offload has been enabled. Fill * the checksum field in the protocol header with the * pseudo-header checksum value. * * We accumulate the pseudo header checksum in cksum. * This is pretty hairy code, so watch close. One * thing to keep in mind is that UDP and TCP have * stored their respective datagram lengths in their * checksum fields. This lines things up real nice. */ cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + iphs[8] + iphs[9] + iphs[10] + iphs[11] + iphs[12] + iphs[13] + iphs[14] + iphs[15] + iphs[16] + iphs[17] + iphs[18] + iphs[19]; cksum += *(cksump); cksum = (cksum & 0xFFFF) + (cksum >> 16); *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); /* * Offsets are relative to beginning of IP header. */ DB_CKSUMSTART(mp) = ip_hdr_length; DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ip6h; DB_CKSUMEND(mp) = pktlen; DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; return (B_TRUE); } /* Hardware capabilities include neither full nor partial IPv6 */ return (ip_output_sw_cksum_v6(mp, ip6h, ixa)); #undef iphs } /* * ire_sendfn for offlink and onlink destinations. * Also called from the multicast, and multirt send functions. * * Assumes that the caller has a hold on the ire. * * This function doesn't care if the IRE just became condemned since that * can happen at any time. */ /* ARGSUSED */ int ire_send_wire_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_xmit_attr_t *ixa, uint32_t *identp) { ip_stack_t *ipst = ixa->ixa_ipst; ip6_t *ip6h = (ip6_t *)iph_arg; iaflags_t ixaflags = ixa->ixa_flags; ill_t *ill; uint32_t pktlen = ixa->ixa_pktlen; ASSERT(ixa->ixa_nce != NULL); ill = ixa->ixa_nce->nce_ill; /* * Update output mib stats. Note that we can't move into the icmp * sender (icmp_output etc) since they don't know the ill and the * stats are per ill. * * With IPMP we record the stats on the upper ill. */ if (ixa->ixa_protocol == IPPROTO_ICMPV6) { icmp6_t *icmp6; icmp6 = (icmp6_t *)((uchar_t *)ip6h + ixa->ixa_ip_hdr_length); icmp_update_out_mib_v6(ixa->ixa_nce->nce_common->ncec_ill, icmp6); } if (ixaflags & IXAF_DONTROUTE) ip6h->ip6_hops = 1; /* * This might set b_band, thus the IPsec and fragmentation * code in IP ensures that b_band is updated in the first mblk. */ if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { /* ip_process translates an IS_UNDER_IPMP */ mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill); if (mp == NULL) { /* ip_drop_packet and MIB done */ return (0); /* Might just be delayed */ } } /* * To handle IPsec/iptun's labeling needs we need to tag packets * while we still have ixa_tsl */ if (is_system_labeled() && ixa->ixa_tsl != NULL && (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 || ill->ill_mactype == DL_IPV6)) { cred_t *newcr; newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl, KM_NOSLEEP); if (newcr == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); ip_drop_output("ipIfStatsOutDiscards - newcr", mp, ill); freemsg(mp); return (ENOBUFS); } mblk_setcred(mp, newcr, NOPID); crfree(newcr); /* mblk_setcred did its own crhold */ } /* * IXAF_IPV6_ADD_FRAGHDR is set for CGTP so that we will add a * fragment header without fragmenting. CGTP on the receiver will * filter duplicates on the ident field. */ if (pktlen > ixa->ixa_fragsize || (ixaflags & (IXAF_IPSEC_SECURE|IXAF_IPV6_ADD_FRAGHDR))) { uint32_t ident = 0; if (ixaflags & IXAF_IPSEC_SECURE) pktlen += ipsec_out_extra_length(ixa); if (pktlen > IP_MAXPACKET) return (EMSGSIZE); if (ixaflags & IXAF_SET_ULP_CKSUM) { /* * Compute ULP checksum using software */ if (!ip_output_sw_cksum_v6(mp, ip6h, ixa)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); ip_drop_output("ipIfStatsOutDiscards", mp, ill); freemsg(mp); return (EINVAL); } /* Avoid checksum again below if we only add fraghdr */ ixaflags &= ~IXAF_SET_ULP_CKSUM; } /* * If we need a fragment header, pick the ident and insert * the header before IPsec to we have a place to store * the ident value. */ if ((ixaflags & IXAF_IPV6_ADD_FRAGHDR) || pktlen > ixa->ixa_fragsize) { /* * If this packet would generate a icmp_frag_needed * message, we need to handle it before we do the IPsec * processing. Otherwise, we need to strip the IPsec * headers before we send up the message to the ULPs * which becomes messy and difficult. */ if ((pktlen > ixa->ixa_fragsize) && (ixaflags & IXAF_DONTFRAG)) { /* Generate ICMP and return error */ ip_recv_attr_t iras; DTRACE_PROBE4(ip6__fragsize__fail, uint_t, pktlen, uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, uint_t, ixa->ixa_pmtu); bzero(&iras, sizeof (iras)); /* Map ixa to ira including IPsec policies */ ipsec_out_to_in(ixa, ill, &iras); ip_drop_output("ICMP6_PKT_TOO_BIG", mp, ill); icmp_pkt2big_v6(mp, ixa->ixa_fragsize, B_TRUE, &iras); /* We moved any IPsec refs from ixa to iras */ ira_cleanup(&iras, B_FALSE); return (EMSGSIZE); } DTRACE_PROBE4(ip6__fragsize__ok, uint_t, pktlen, uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, uint_t, ixa->ixa_pmtu); /* * Assign an ident value for this packet. There could * be other threads targeting the same destination, so * we have to arrange for a atomic increment. * Normally ixa_extra_ident is 0, but in the case of * LSO it will be the number of TCP segments that the * driver/hardware will extraly construct. * * Note that cl_inet_ipident has only been used for * IPv4. We don't use it here. */ ident = atomic_add_32_nv(identp, ixa->ixa_extra_ident + 1); ixa->ixa_ident = ident; /* In case we do IPsec */ } if (ixaflags & IXAF_IPSEC_SECURE) { /* * Pass in sufficient information so that * IPsec can determine whether to fragment, and * which function to call after fragmentation. */ return (ipsec_out_process(mp, ixa)); } mp = ip_fraghdr_add_v6(mp, ident, ixa); if (mp == NULL) { /* MIB and ip_drop_output already done */ return (ENOMEM); } ASSERT(pktlen == ixa->ixa_pktlen); pktlen += sizeof (ip6_frag_t); if (pktlen > ixa->ixa_fragsize) { return (ip_fragment_v6(mp, ixa->ixa_nce, ixaflags, pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint, ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid, ixa->ixa_postfragfn, &ixa->ixa_cookie)); } } if (ixaflags & IXAF_SET_ULP_CKSUM) { /* Compute ULP checksum and IP header checksum */ /* An IS_UNDER_IPMP ill is ok here */ if (!ip_output_cksum_v6(ixaflags, mp, ip6h, ixa, ill)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); ip_drop_output("ipIfStatsOutDiscards", mp, ill); freemsg(mp); return (EINVAL); } } return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags, pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie)); } /* * Post fragmentation function for RTF_MULTIRT routes. * Since IRE_MULTICASTs might have RTF_MULTIRT, this function * checks IXAF_LOOPBACK_COPY. * * If no packet is sent due to failures then we return an errno, but if at * least one succeeded we return zero. */ int ip_postfrag_multirt_v6(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, uintptr_t *ixacookie) { irb_t *irb; ip6_t *ip6h = (ip6_t *)mp->b_rptr; ire_t *ire; ire_t *ire1; mblk_t *mp1; nce_t *nce1; ill_t *ill = nce->nce_ill; ill_t *ill1; ip_stack_t *ipst = ill->ill_ipst; int error = 0; int num_sent = 0; int err; uint_t ire_type; in6_addr_t nexthop; ASSERT(!(ixaflags & IXAF_IS_IPV4)); /* Check for IXAF_LOOPBACK_COPY */ if (ixaflags & IXAF_LOOPBACK_COPY) { mblk_t *mp1; mp1 = copymsg(mp); if (mp1 == NULL) { /* Failed to deliver the loopback copy. */ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); ip_drop_output("ipIfStatsOutDiscards", mp, ill); error = ENOBUFS; } else { ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, nolzid); } } /* * Loop over RTF_MULTIRT for ip6_dst in the same bucket. Send * a copy to each one. * Use the nce (nexthop) and ip6_dst to find the ire. * * MULTIRT is not designed to work with shared-IP zones thus we don't * need to pass a zoneid or a label to the IRE lookup. */ if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, &ip6h->ip6_dst)) { /* Broadcast and multicast case */ ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0, 0, NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); } else { /* Unicast case */ ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, &nce->nce_addr, 0, NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL); } if (ire == NULL || (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || !(ire->ire_flags & RTF_MULTIRT)) { /* Drop */ ip_drop_output("ip_postfrag_multirt didn't find route", mp, nce->nce_ill); if (ire != NULL) ire_refrele(ire); return (ENETUNREACH); } irb = ire->ire_bucket; irb_refhold(irb); for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { if (IRE_IS_CONDEMNED(ire1) || !(ire1->ire_flags & RTF_MULTIRT)) continue; /* Note: When IPv6 uses radix tree we don't need this check */ if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6)) continue; /* Do the ire argument one after the loop */ if (ire1 == ire) continue; ill1 = ire_nexthop_ill(ire1); if (ill1 == NULL) { /* * This ire might not have been picked by * ire_route_recursive, in which case ire_dep might * not have been setup yet. * We kick ire_route_recursive to try to resolve * starting at ire1. */ ire_t *ire2; uint_t match_flags = MATCH_IRE_DSTONLY; if (ire1->ire_ill != NULL) match_flags |= MATCH_IRE_ILL; ire2 = ire_route_recursive_impl_v6(ire1, &ire1->ire_addr_v6, ire1->ire_type, ire1->ire_ill, ire1->ire_zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL); if (ire2 != NULL) ire_refrele(ire2); ill1 = ire_nexthop_ill(ire1); } if (ill1 == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); ip_drop_output("ipIfStatsOutDiscards - no ill", mp, ill); error = ENETUNREACH; continue; } /* Pick the addr and type to use for ndp_nce_init */ if (nce->nce_common->ncec_flags & NCE_F_MCAST) { ire_type = IRE_MULTICAST; nexthop = ip6h->ip6_dst; } else { ire_type = ire1->ire_type; /* Doesn't matter */ nexthop = ire1->ire_gateway_addr_v6; } /* If IPMP meta or under, then we just drop */ if (ill1->ill_grp != NULL) { BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); ip_drop_output("ipIfStatsOutDiscards - IPMP", mp, ill1); ill_refrele(ill1); error = ENETUNREACH; continue; } nce1 = ndp_nce_init(ill1, &nexthop, ire_type); if (nce1 == NULL) { BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); ip_drop_output("ipIfStatsOutDiscards - no nce", mp, ill1); ill_refrele(ill1); error = ENOBUFS; continue; } mp1 = copymsg(mp); if (mp1 == NULL) { BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); ip_drop_output("ipIfStatsOutDiscards", mp, ill1); nce_refrele(nce1); ill_refrele(ill1); error = ENOBUFS; continue; } /* Preserve HW checksum for this copy */ DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); DB_LSOMSS(mp1) = DB_LSOMSS(mp); ire1->ire_ob_pkt_count++; err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone, 0, ixacookie); if (err == 0) num_sent++; else error = err; nce_refrele(nce1); ill_refrele(ill1); } irb_refrele(irb); ire_refrele(ire); /* Finally, the main one */ err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, ixacookie); if (err == 0) num_sent++; else error = err; if (num_sent > 0) return (0); else return (error); }