xref: /illumos-gate/usr/src/uts/common/inet/ip/ip6.c (revision fec047081731fd77caf46ec0471c501b2cb33894)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 1990 Mentat Inc.
24  * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
25  * Copyright 2021 Joyent, Inc.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/dlpi.h>
31 #include <sys/stropts.h>
32 #include <sys/sysmacros.h>
33 #include <sys/strsun.h>
34 #include <sys/strlog.h>
35 #include <sys/strsubr.h>
36 #define	_SUN_TPI_VERSION	2
37 #include <sys/tihdr.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/cmn_err.h>
41 #include <sys/debug.h>
42 #include <sys/sdt.h>
43 #include <sys/kobj.h>
44 #include <sys/zone.h>
45 #include <sys/neti.h>
46 #include <sys/hook.h>
47 
48 #include <sys/kmem.h>
49 #include <sys/systm.h>
50 #include <sys/param.h>
51 #include <sys/socket.h>
52 #include <sys/vtrace.h>
53 #include <sys/isa_defs.h>
54 #include <sys/atomic.h>
55 #include <sys/policy.h>
56 #include <sys/mac.h>
57 #include <net/if.h>
58 #include <net/if_types.h>
59 #include <net/route.h>
60 #include <net/if_dl.h>
61 #include <sys/sockio.h>
62 #include <netinet/in.h>
63 #include <netinet/ip6.h>
64 #include <netinet/icmp6.h>
65 #include <netinet/sctp.h>
66 
67 #include <inet/common.h>
68 #include <inet/mi.h>
69 #include <inet/optcom.h>
70 #include <inet/mib2.h>
71 #include <inet/nd.h>
72 #include <inet/arp.h>
73 
74 #include <inet/ip.h>
75 #include <inet/ip_impl.h>
76 #include <inet/ip6.h>
77 #include <inet/ip6_asp.h>
78 #include <inet/tcp.h>
79 #include <inet/tcp_impl.h>
80 #include <inet/udp_impl.h>
81 #include <inet/ipp_common.h>
82 
83 #include <inet/ip_multi.h>
84 #include <inet/ip_if.h>
85 #include <inet/ip_ire.h>
86 #include <inet/ip_rts.h>
87 #include <inet/ip_ndp.h>
88 #include <net/pfkeyv2.h>
89 #include <inet/sadb.h>
90 #include <inet/ipsec_impl.h>
91 #include <inet/iptun/iptun_impl.h>
92 #include <inet/sctp_ip.h>
93 #include <sys/pattr.h>
94 #include <inet/ipclassifier.h>
95 #include <inet/ipsecah.h>
96 #include <inet/rawip_impl.h>
97 #include <inet/rts_impl.h>
98 #include <sys/squeue_impl.h>
99 #include <sys/squeue.h>
100 
101 #include <sys/tsol/label.h>
102 #include <sys/tsol/tnet.h>
103 
104 /* Temporary; for CR 6451644 work-around */
105 #include <sys/ethernet.h>
106 
107 /*
108  * Naming conventions:
109  *      These rules should be judiciously applied
110  *	if there is a need to identify something as IPv6 versus IPv4
 *	IPv6 functions will end with _v6 in the ip module.
 *	IPv6 functions will end with _ipv6 in the transport modules.
113  *	IPv6 macros:
114  *		Some macros end with _V6; e.g. ILL_FRAG_HASH_V6
115  *		Some macros start with V6_; e.g. V6_OR_V4_INADDR_ANY
116  *		And then there are ..V4_PART_OF_V6.
117  *		The intent is that macros in the ip module end with _V6.
118  *	IPv6 global variables will start with ipv6_
119  *	IPv6 structures will start with ipv6
120  *	IPv6 defined constants should start with IPV6_
121  *		(but then there are NDP_DEFAULT_VERS_PRI_AND_FLOW, etc)
122  */
123 
124 /*
125  * ip6opt_ls is used to enable IPv6 (via /etc/system on TX systems).
126  * We need to do this because we didn't obtain the IP6OPT_LS (0x0a)
127  * from IANA. This mechanism will remain in effect until an official
128  * number is obtained.
129  */
130 uchar_t ip6opt_ls;
131 
132 const in6_addr_t ipv6_all_ones =
133 	{ 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU };
134 const in6_addr_t ipv6_all_zeros = { 0, 0, 0, 0 };
135 
136 #ifdef	_BIG_ENDIAN
137 const in6_addr_t ipv6_unspecified_group = { 0xff000000U, 0, 0, 0 };
138 #else	/* _BIG_ENDIAN */
139 const in6_addr_t ipv6_unspecified_group = { 0x000000ffU, 0, 0, 0 };
140 #endif	/* _BIG_ENDIAN */
141 
142 #ifdef	_BIG_ENDIAN
143 const in6_addr_t ipv6_loopback = { 0, 0, 0, 0x00000001U };
144 #else  /* _BIG_ENDIAN */
145 const in6_addr_t ipv6_loopback = { 0, 0, 0, 0x01000000U };
146 #endif /* _BIG_ENDIAN */
147 
148 #ifdef _BIG_ENDIAN
149 const in6_addr_t ipv6_all_hosts_mcast = { 0xff020000U, 0, 0, 0x00000001U };
150 #else  /* _BIG_ENDIAN */
151 const in6_addr_t ipv6_all_hosts_mcast = { 0x000002ffU, 0, 0, 0x01000000U };
152 #endif /* _BIG_ENDIAN */
153 
154 #ifdef _BIG_ENDIAN
155 const in6_addr_t ipv6_all_rtrs_mcast = { 0xff020000U, 0, 0, 0x00000002U };
156 #else  /* _BIG_ENDIAN */
157 const in6_addr_t ipv6_all_rtrs_mcast = { 0x000002ffU, 0, 0, 0x02000000U };
158 #endif /* _BIG_ENDIAN */
159 
160 #ifdef _BIG_ENDIAN
161 const in6_addr_t ipv6_all_v2rtrs_mcast = { 0xff020000U, 0, 0, 0x00000016U };
162 #else  /* _BIG_ENDIAN */
163 const in6_addr_t ipv6_all_v2rtrs_mcast = { 0x000002ffU, 0, 0, 0x16000000U };
164 #endif /* _BIG_ENDIAN */
165 
166 #ifdef _BIG_ENDIAN
167 const in6_addr_t ipv6_solicited_node_mcast =
168 			{ 0xff020000U, 0, 0x00000001U, 0xff000000U };
169 #else  /* _BIG_ENDIAN */
170 const in6_addr_t ipv6_solicited_node_mcast =
171 			{ 0x000002ffU, 0, 0x01000000U, 0x000000ffU };
172 #endif /* _BIG_ENDIAN */
173 
/*
 * Prototypes for file-local (static) helpers, declared up front so that
 * they can be used before their definitions.
 */
static boolean_t icmp_inbound_verify_v6(mblk_t *, icmp6_t *, ip_recv_attr_t *);
static void	icmp_inbound_too_big_v6(icmp6_t *, ip_recv_attr_t *);
static void	icmp_pkt_v6(mblk_t *, void *, size_t, const in6_addr_t *,
    ip_recv_attr_t *);
static void	icmp_redirect_v6(mblk_t *, ip6_t *, nd_redirect_t *,
    ip_recv_attr_t *);
static void	icmp_send_redirect_v6(mblk_t *, in6_addr_t *,
    in6_addr_t *, ip_recv_attr_t *);
static void	icmp_send_reply_v6(mblk_t *, ip6_t *, icmp6_t *,
    ip_recv_attr_t *);
static boolean_t	ip_source_routed_v6(ip6_t *, mblk_t *, ip_stack_t *);
185 
/*
 * icmp_inbound_v6 deals with ICMPv6 messages that are handled by IP.
 * If the ICMPv6 message is consumed by IP, i.e., it should not be delivered
 * to any IPPROTO_ICMPV6 raw sockets, then it returns NULL.
 * Likewise, if the ICMPv6 error is malformed (too short, etc), then it
 * returns NULL. The caller uses this to determine whether or not to send
 * to raw sockets.
 *
 * All error messages are passed to the matching transport stream.
 *
 * See comment for icmp_inbound_v4() on how IPsec is handled.
 *
 * Arguments:
 *	mp	The received packet, with b_rptr at the outer IPv6 header.
 *		On every path mp is either freed here, handed to a helper,
 *		or returned to the caller.
 *	ira	Receive attributes. ira_ill, ira_rill, ira_flags, ira_pktlen
 *		and ira_ip_hdr_length are consulted here.
 *
 * Returns the mblk the caller should deliver to raw sockets:
 *	- mp itself, when IP has no further use for the message but a raw
 *	  ICMPv6 client exists;
 *	- a copy of mp, when both IP and a raw ICMPv6 client want it;
 *	- whatever mld_input() returns for the MLD message types;
 *	- NULL in all other cases.
 */
mblk_t *
icmp_inbound_v6(mblk_t *mp, ip_recv_attr_t *ira)
{
	icmp6_t		*icmp6;
	ip6_t		*ip6h;		/* Outer header */
	int		ip_hdr_length;	/* Outer header length */
	boolean_t	interested;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	mblk_t		*mp_ret = NULL;	/* Copy for raw sockets, if any */

	ip6h = (ip6_t *)mp->b_rptr;

	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs);

	/* Check for Martian packets: a multicast source is never valid. */
	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
		ip_drop_input("ipIfStatsInAddrErrors: mcast src", mp, ill);
		freemsg(mp);
		return (NULL);
	}

	/* Make sure ira_l2src is set for ndp_input */
	if (!(ira->ira_flags & IRAF_L2SRC_SET))
		ip_setl2src(mp, ira, ira->ira_rill);

	/*
	 * We need the IPv6 header(s) plus the minimal ICMPv6 header in the
	 * first mblk. If the whole packet is shorter than that it is
	 * truncated; otherwise pull up what is missing.
	 */
	ip_hdr_length = ira->ira_ip_hdr_length;
	if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMP6_MINLEN)) {
		if (ira->ira_pktlen < (ip_hdr_length + ICMP6_MINLEN)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
			freemsg(mp);
			return (NULL);
		}
		ip6h = ip_pullup(mp, ip_hdr_length + ICMP6_MINLEN, ira);
		if (ip6h == NULL) {
			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
			freemsg(mp);
			return (NULL);
		}
	}

	icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
	DTRACE_PROBE2(icmp__inbound__v6, ip6_t *, ip6h, icmp6_t *, icmp6);
	ip2dbg(("icmp_inbound_v6: type %d code %d\n", icmp6->icmp6_type,
	    icmp6->icmp6_code));

	/*
	 * We will set "interested" to "true" if we should pass a copy to
	 * the transport i.e., if it is an error message.
	 */
	interested = !(icmp6->icmp6_type & ICMP6_INFOMSG_MASK);

	/*
	 * First pass over the type: update the per-type MIB counters and
	 * deal with the messages that are completely handled here (echo
	 * request, neighbor solicit/advert, MLD). Everything that "break"s
	 * out of the switch continues with the raw socket / error handling
	 * below.
	 */
	switch (icmp6->icmp6_type) {
	case ICMP6_DST_UNREACH:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInDestUnreachs);
		if (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN)
			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInAdminProhibs);
		break;

	case ICMP6_TIME_EXCEEDED:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInTimeExcds);
		break;

	case ICMP6_PARAM_PROB:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInParmProblems);
		break;

	case ICMP6_PACKET_TOO_BIG:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInPktTooBigs);
		break;

	case ICMP6_ECHO_REQUEST:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchos);
		/*
		 * Echo requests sent to a multicast address are only
		 * answered when ips_ipv6_resp_echo_mcast is set; otherwise
		 * no reply is generated and the packet is handled like any
		 * other message that IP does not act upon.
		 */
		if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
		    !ipst->ips_ipv6_resp_echo_mcast)
			break;

		/*
		 * We must have exclusive use of the mblk to convert it to
		 * a response.
		 * If not, we copy it.
		 */
		if (mp->b_datap->db_ref > 1) {
			mblk_t	*mp1;

			mp1 = copymsg(mp);
			if (mp1 == NULL) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				ip_drop_input("ipIfStatsInDiscards - copymsg",
				    mp, ill);
				freemsg(mp);
				return (NULL);
			}
			freemsg(mp);
			mp = mp1;
			/* The headers now live in the copy. */
			ip6h = (ip6_t *)mp->b_rptr;
			icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
		}

		/* Turn the request into the reply in place and send it. */
		icmp6->icmp6_type = ICMP6_ECHO_REPLY;
		icmp_send_reply_v6(mp, ip6h, icmp6, ira);
		return (NULL);

	case ICMP6_ECHO_REPLY:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchoReplies);
		break;

	/* Router solicitations and advertisements are only counted here. */
	case ND_ROUTER_SOLICIT:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRouterSolicits);
		break;

	case ND_ROUTER_ADVERT:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRouterAdvertisements);
		break;

	/* Neighbor solicit/advert are handed to ndp_input(); nothing is returned. */
	case ND_NEIGHBOR_SOLICIT:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInNeighborSolicits);
		ndp_input(mp, ira);
		return (NULL);

	case ND_NEIGHBOR_ADVERT:
		BUMP_MIB(ill->ill_icmp6_mib,
		    ipv6IfIcmpInNeighborAdvertisements);
		ndp_input(mp, ira);
		return (NULL);

	case ND_REDIRECT:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRedirects);

		/*
		 * When redirects are administratively ignored we leave
		 * "interested" as computed above, so IP will not act on it.
		 */
		if (ipst->ips_ipv6_ignore_redirect)
			break;

		/* We now allow a RAW socket to receive this. */
		interested = B_TRUE;
		break;

	/*
	 * The next three icmp messages will be handled by MLD.
	 * Pass all valid MLD packets up to any process(es)
	 * listening on a raw ICMP socket.
	 */
	case MLD_LISTENER_QUERY:
	case MLD_LISTENER_REPORT:
	case MLD_LISTENER_REDUCTION:
		mp = mld_input(mp, ira);
		return (mp);
	default:
		break;
	}
	/*
	 * See if there is an ICMP client to avoid an extra copymsg/freemsg
	 * if there isn't one.
	 */
	if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_ICMPV6].connf_head != NULL) {
		/* If there is an ICMP client and we want one too, copy it. */

		if (!interested) {
			/* Caller will deliver to RAW sockets */
			return (mp);
		}
		/*
		 * If the copy fails only the raw socket delivery is lost
		 * (mp_ret stays NULL); we still process mp ourselves below.
		 */
		mp_ret = copymsg(mp);
		if (mp_ret == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
		}
	} else if (!interested) {
		/* Neither we nor raw sockets are interested. Drop packet now */
		freemsg(mp);
		return (NULL);
	}

	/*
	 * ICMP error or redirect packet. Make sure we have enough of
	 * the header and that db_ref == 1 since we might end up modifying
	 * the packet.
	 *
	 * From here on every exit returns mp_ret, so that a copy made for
	 * the raw sockets above is still delivered even if our own
	 * processing of mp fails.
	 */
	if (mp->b_cont != NULL) {
		/* A length of -1 asks for the whole message to be pulled up. */
		if (ip_pullup(mp, -1, ira) == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - ip_pullup",
			    mp, ill);
			freemsg(mp);
			return (mp_ret);
		}
	}

	if (mp->b_datap->db_ref > 1) {
		mblk_t	*mp1;

		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
			freemsg(mp);
			return (mp_ret);
		}
		freemsg(mp);
		mp = mp1;
	}

	/*
	 * In case mp has changed, recompute the header pointers and verify
	 * the message before any further processing.
	 */
	ip6h = (ip6_t *)mp->b_rptr;
	icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
	if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
		freemsg(mp);
		return (mp_ret);
	}

	/* Second pass over the type: act on the verified message. */
	switch (icmp6->icmp6_type) {
	case ND_REDIRECT:
		icmp_redirect_v6(mp, ip6h, (nd_redirect_t *)icmp6, ira);
		break;
	case ICMP6_PACKET_TOO_BIG:
		/* Update DCE and adjust MTU in icmp header if needed */
		icmp_inbound_too_big_v6(icmp6, ira);
		/* FALLTHROUGH */
	default:
		icmp_inbound_error_fanout_v6(mp, icmp6, ira);
		break;
	}

	return (mp_ret);
}
426 
427 /*
428  * Send an ICMP echo reply.
429  * The caller has already updated the payload part of the packet.
430  * We handle the ICMP checksum, IP source address selection and feed
431  * the packet into ip_output_simple.
432  */
433 static void
434 icmp_send_reply_v6(mblk_t *mp, ip6_t *ip6h, icmp6_t *icmp6,
435     ip_recv_attr_t *ira)
436 {
437 	uint_t		ip_hdr_length = ira->ira_ip_hdr_length;
438 	ill_t		*ill = ira->ira_ill;
439 	ip_stack_t	*ipst = ill->ill_ipst;
440 	ip_xmit_attr_t	ixas;
441 	in6_addr_t	origsrc;
442 
443 	/*
444 	 * Remove any extension headers (do not reverse a source route)
445 	 * and clear the flow id (keep traffic class for now).
446 	 */
447 	if (ip_hdr_length != IPV6_HDR_LEN) {
448 		int	i;
449 
450 		for (i = 0; i < IPV6_HDR_LEN; i++) {
451 			mp->b_rptr[ip_hdr_length - i - 1] =
452 			    mp->b_rptr[IPV6_HDR_LEN - i - 1];
453 		}
454 		mp->b_rptr += (ip_hdr_length - IPV6_HDR_LEN);
455 		ip6h = (ip6_t *)mp->b_rptr;
456 		ip6h->ip6_nxt = IPPROTO_ICMPV6;
457 		i = ntohs(ip6h->ip6_plen);
458 		i -= (ip_hdr_length - IPV6_HDR_LEN);
459 		ip6h->ip6_plen = htons(i);
460 		ip_hdr_length = IPV6_HDR_LEN;
461 		ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == msgdsize(mp));
462 	}
463 	ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
464 
465 	/* Reverse the source and destination addresses. */
466 	origsrc = ip6h->ip6_src;
467 	ip6h->ip6_src = ip6h->ip6_dst;
468 	ip6h->ip6_dst = origsrc;
469 
470 	/* set the hop limit */
471 	ip6h->ip6_hops = ipst->ips_ipv6_def_hops;
472 
473 	/*
474 	 * Prepare for checksum by putting icmp length in the icmp
475 	 * checksum field. The checksum is calculated in ip_output
476 	 */
477 	icmp6->icmp6_cksum = ip6h->ip6_plen;
478 
479 	bzero(&ixas, sizeof (ixas));
480 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
481 	ixas.ixa_zoneid = ira->ira_zoneid;
482 	ixas.ixa_cred = kcred;
483 	ixas.ixa_cpid = NOPID;
484 	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
485 	ixas.ixa_ifindex = 0;
486 	ixas.ixa_ipst = ipst;
487 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
488 
489 	if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
490 		/*
491 		 * This packet should go out the same way as it
492 		 * came in i.e in clear, independent of the IPsec
493 		 * policy for transmitting packets.
494 		 */
495 		ixas.ixa_flags |= IXAF_NO_IPSEC;
496 	} else {
497 		if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) {
498 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
499 			/* Note: mp already consumed and ip_drop_packet done */
500 			return;
501 		}
502 	}
503 
504 	/* Was the destination (now source) link-local? Send out same group */
505 	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
506 		ixas.ixa_flags |= IXAF_SCOPEID_SET;
507 		if (IS_UNDER_IPMP(ill))
508 			ixas.ixa_scopeid = ill_get_upper_ifindex(ill);
509 		else
510 			ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex;
511 	}
512 
513 	if (ira->ira_flags & IRAF_MULTIBROADCAST) {
514 		/*
515 		 * Not one or our addresses (IRE_LOCALs), thus we let
516 		 * ip_output_simple pick the source.
517 		 */
518 		ip6h->ip6_src = ipv6_all_zeros;
519 		ixas.ixa_flags |= IXAF_SET_SOURCE;
520 	}
521 
522 	/* Should we send using dce_pmtu? */
523 	if (ipst->ips_ipv6_icmp_return_pmtu)
524 		ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
525 
526 	(void) ip_output_simple(mp, &ixas);
527 	ixa_cleanup(&ixas);
528 
529 }
530 
531 /*
532  * Verify the ICMP messages for either for ICMP error or redirect packet.
533  * The caller should have fully pulled up the message. If it's a redirect
534  * packet, only basic checks on IP header will be done; otherwise, verify
535  * the packet by looking at the included ULP header.
536  *
537  * Called before icmp_inbound_error_fanout_v6 is called.
538  */
539 static boolean_t
540 icmp_inbound_verify_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira)
541 {
542 	ill_t		*ill = ira->ira_ill;
543 	uint16_t	hdr_length;
544 	uint8_t		*nexthdrp;
545 	uint8_t		nexthdr;
546 	ip_stack_t	*ipst = ill->ill_ipst;
547 	conn_t		*connp;
548 	ip6_t		*ip6h;	/* Inner header */
549 
550 	ip6h = (ip6_t *)&icmp6[1];
551 	if ((uchar_t *)ip6h + IPV6_HDR_LEN > mp->b_wptr)
552 		goto truncated;
553 
554 	if (icmp6->icmp6_type == ND_REDIRECT) {
555 		hdr_length = sizeof (nd_redirect_t);
556 	} else {
557 		if ((IPH_HDR_VERSION(ip6h) != IPV6_VERSION))
558 			goto discard_pkt;
559 		hdr_length = IPV6_HDR_LEN;
560 	}
561 
562 	if ((uchar_t *)ip6h + hdr_length > mp->b_wptr)
563 		goto truncated;
564 
565 	/*
566 	 * Stop here for ICMP_REDIRECT.
567 	 */
568 	if (icmp6->icmp6_type == ND_REDIRECT)
569 		return (B_TRUE);
570 
571 	/*
572 	 * ICMP errors only.
573 	 */
574 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp))
575 		goto discard_pkt;
576 	nexthdr = *nexthdrp;
577 
578 	/* Try to pass the ICMP message to clients who need it */
579 	switch (nexthdr) {
580 	case IPPROTO_UDP:
581 		/*
582 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
583 		 * transport header.
584 		 */
585 		if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
586 		    mp->b_wptr)
587 			goto truncated;
588 		break;
589 	case IPPROTO_TCP: {
590 		tcpha_t		*tcpha;
591 
592 		/*
593 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
594 		 * transport header.
595 		 */
596 		if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
597 		    mp->b_wptr)
598 			goto truncated;
599 
600 		tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length);
601 		/*
602 		 * With IPMP we need to match across group, which we do
603 		 * since we have the upper ill from ira_ill.
604 		 */
605 		connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha, TCPS_LISTEN,
606 		    ill->ill_phyint->phyint_ifindex, ipst);
607 		if (connp == NULL)
608 			goto discard_pkt;
609 
610 		if ((connp->conn_verifyicmp != NULL) &&
611 		    !connp->conn_verifyicmp(connp, tcpha, NULL, icmp6, ira)) {
612 			CONN_DEC_REF(connp);
613 			goto discard_pkt;
614 		}
615 		CONN_DEC_REF(connp);
616 		break;
617 	}
618 	case IPPROTO_SCTP:
619 		/*
620 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
621 		 * transport header.
622 		 */
623 		if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
624 		    mp->b_wptr)
625 			goto truncated;
626 		break;
627 	case IPPROTO_ESP:
628 	case IPPROTO_AH:
629 		break;
630 	case IPPROTO_ENCAP:
631 	case IPPROTO_IPV6: {
632 		/* Look for self-encapsulated packets that caused an error */
633 		ip6_t *in_ip6h;
634 
635 		in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length);
636 		if ((uint8_t *)in_ip6h + (nexthdr == IPPROTO_ENCAP ?
637 		    sizeof (ipha_t) : sizeof (ip6_t)) > mp->b_wptr)
638 			goto truncated;
639 		break;
640 	}
641 	default:
642 		break;
643 	}
644 
645 	return (B_TRUE);
646 
647 discard_pkt:
648 	/* Bogus ICMP error. */
649 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
650 	return (B_FALSE);
651 
652 truncated:
653 	/* We pulled up everthing already. Must be truncated */
654 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
655 	return (B_FALSE);
656 }
657 
658 /*
659  * Process received IPv6 ICMP Packet too big.
660  * The caller is responsible for validating the packet before passing it in
661  * and also to fanout the ICMP error to any matching transport conns. Assumes
662  * the message has been fully pulled up.
663  *
664  * Before getting here, the caller has called icmp_inbound_verify_v6()
665  * that should have verified with ULP to prevent undoing the changes we're
666  * going to make to DCE. For example, TCP might have verified that the packet
667  * which generated error is in the send window.
668  *
669  * In some cases modified this MTU in the ICMP header packet; the caller
670  * should pass to the matching ULP after this returns.
671  */
672 static void
673 icmp_inbound_too_big_v6(icmp6_t *icmp6, ip_recv_attr_t *ira)
674 {
675 	uint32_t	mtu;
676 	dce_t		*dce;
677 	ill_t		*ill = ira->ira_ill;	/* Upper ill if IPMP */
678 	ip_stack_t	*ipst = ill->ill_ipst;
679 	int		old_max_frag;
680 	in6_addr_t	final_dst;
681 	ip6_t		*ip6h;	/* Inner IP header */
682 
683 	/* Caller has already pulled up everything. */
684 	ip6h = (ip6_t *)&icmp6[1];
685 	final_dst = ip_get_dst_v6(ip6h, NULL, NULL);
686 
687 	mtu = ntohl(icmp6->icmp6_mtu);
688 	if (mtu < IPV6_MIN_MTU) {
689 		/*
690 		 * RFC 8021 suggests to ignore messages where mtu is
691 		 * less than the IPv6 minimum.
692 		 */
693 		ip1dbg(("Received mtu less than IPv6 "
694 		    "min mtu %d: %d\n", IPV6_MIN_MTU, mtu));
695 		DTRACE_PROBE1(icmp6__too__small__mtu, uint32_t, mtu);
696 		return;
697 	}
698 
699 	/*
700 	 * For link local destinations matching simply on address is not
701 	 * sufficient. Same link local addresses for different ILL's is
702 	 * possible.
703 	 */
704 	if (IN6_IS_ADDR_LINKSCOPE(&final_dst)) {
705 		dce = dce_lookup_and_add_v6(&final_dst,
706 		    ill->ill_phyint->phyint_ifindex, ipst);
707 	} else {
708 		dce = dce_lookup_and_add_v6(&final_dst, 0, ipst);
709 	}
710 	if (dce == NULL) {
711 		/* Couldn't add a unique one - ENOMEM */
712 		if (ip_debug > 2) {
713 			/* ip1dbg */
714 			pr_addr_dbg("icmp_inbound_too_big_v6:"
715 			    "no dce for dst %s\n", AF_INET6,
716 			    &final_dst);
717 		}
718 		return;
719 	}
720 
721 	mutex_enter(&dce->dce_lock);
722 	if (dce->dce_flags & DCEF_PMTU)
723 		old_max_frag = dce->dce_pmtu;
724 	else if (IN6_IS_ADDR_MULTICAST(&final_dst))
725 		old_max_frag = ill->ill_mc_mtu;
726 	else
727 		old_max_frag = ill->ill_mtu;
728 
729 	ip1dbg(("Received mtu from router: %d\n", mtu));
730 	DTRACE_PROBE1(icmp6__received__mtu, uint32_t, mtu);
731 	dce->dce_pmtu = MIN(old_max_frag, mtu);
732 	icmp6->icmp6_mtu = htonl(dce->dce_pmtu);
733 
734 	/* We now have a PMTU for sure */
735 	dce->dce_flags |= DCEF_PMTU;
736 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
737 
738 	mutex_exit(&dce->dce_lock);
739 	/*
740 	 * After dropping the lock the new value is visible to everyone.
741 	 * Then we bump the generation number so any cached values reinspect
742 	 * the dce_t.
743 	 */
744 	dce_increment_generation(dce);
745 	dce_refrele(dce);
746 }
747 
/*
 * Fanout received ICMPv6 error packets to the transports.
 *
 * The caller must have called icmp_inbound_verify_v6 and must have pulled
 * the whole message up into a single mblk (asserted below).
 *
 * Arguments:
 *	mp	The complete message: outer IPv6 header, ICMPv6 header and
 *		the embedded (inner) packet that triggered the error.
 *		Consumed on every path (passed on or freed).
 *	icmp6	The ICMPv6 header inside mp; the inner IPv6 header follows
 *		immediately after it.
 *	ira	Receive attributes. ira_protocol is set to the inner upper
 *		layer protocol, and IRAF_ICMP_ERROR is set in ira_flags for
 *		the duration of each delivery.
 *
 * The upper layer protocol of the inner packet selects the recipient:
 * UDP, TCP and SCTP go to the matching conns, ESP/AH are first handed to
 * IPsec, self-encapsulated IPv6 has the extra header stripped before we
 * recurse, IP-in-IP goes to a matching tunnel, and everything else is
 * offered to raw sockets.
 */
void
icmp_inbound_error_fanout_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira)
{
	uint16_t	*up;	/* Pointer to ports in ULP header */
	uint32_t	ports;	/* reversed ports for fanout */
	ip6_t		rip6h;	/* With reversed addresses */
	ip6_t		*ip6h;	/* Inner IP header */
	uint16_t	hdr_length; /* Inner IP header length */
	uint8_t		*nexthdrp;
	uint8_t		nexthdr;
	tcpha_t		*tcpha;
	conn_t		*connp;
	ill_t		*ill = ira->ira_ill;	/* Upper in the case of IPMP */
	ip_stack_t	*ipst = ill->ill_ipst;
	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;

	/* Caller has already pulled up everything. */
	ip6h = (ip6_t *)&icmp6[1];
	ASSERT(mp->b_cont == NULL);
	ASSERT((uchar_t *)&ip6h[1] <= mp->b_wptr);

	/* Find the length of the inner header chain and the inner ULP. */
	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp))
		goto drop_pkt;
	nexthdr = *nexthdrp;
	ira->ira_protocol = nexthdr;

	/*
	 * We need a separate IP header with the source and destination
	 * addresses reversed to do fanout/classification because the ip6h in
	 * the ICMPv6 error is in the form we sent it out.
	 * Only the addresses and the next header are filled in.
	 */
	rip6h.ip6_src = ip6h->ip6_dst;
	rip6h.ip6_dst = ip6h->ip6_src;
	rip6h.ip6_nxt = nexthdr;

	/* Try to pass the ICMP message to clients who need it */
	switch (nexthdr) {
	case IPPROTO_UDP: {
		/* Attempt to find a client stream based on port. */
		up = (uint16_t *)((uchar_t *)ip6h + hdr_length);

		/* Note that we send error to all matches. */
		ira->ira_flags |= IRAF_ICMP_ERROR;
		ip_fanout_udp_multi_v6(mp, &rip6h, up[0], up[1], ira);
		ira->ira_flags &= ~IRAF_ICMP_ERROR;
		return;
	}
	case IPPROTO_TCP: {
		/*
		 * Attempt to find a client stream based on port.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 */
		tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length);
		/*
		 * With IPMP we need to match across group, which we do
		 * since we have the upper ill from ira_ill.
		 */
		connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha,
		    TCPS_LISTEN, ill->ill_phyint->phyint_ifindex, ipst);
		if (connp == NULL) {
			goto drop_pkt;
		}

		if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) ||
		    (ira->ira_flags & IRAF_IPSEC_SECURE)) {
			mp = ipsec_check_inbound_policy(mp, connp,
			    NULL, ip6h, ira);
			if (mp == NULL) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				/* Note that mp is NULL */
				ip_drop_input("ipIfStatsInDiscards", mp, ill);
				CONN_DEC_REF(connp);
				return;
			}
		}

		ira->ira_flags |= IRAF_ICMP_ERROR;
		if (IPCL_IS_TCP(connp)) {
			/*
			 * Deliver through the conn's squeue. No CONN_DEC_REF
			 * is done on this path; presumably the reference
			 * travels with the squeue entry - TODO confirm.
			 */
			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
			    connp->conn_recvicmp, connp, ira, SQ_FILL,
			    SQTAG_TCP6_INPUT_ICMP_ERR);
		} else {
			/* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
			ill_t *rill = ira->ira_rill;

			/*
			 * The ills are hidden from the receiver for the
			 * duration of the call and restored afterwards.
			 */
			ira->ira_ill = ira->ira_rill = NULL;
			(connp->conn_recv)(connp, mp, NULL, ira);
			CONN_DEC_REF(connp);
			ira->ira_ill = ill;
			ira->ira_rill = rill;
		}
		ira->ira_flags &= ~IRAF_ICMP_ERROR;
		return;

	}
	case IPPROTO_SCTP:
		up = (uint16_t *)((uchar_t *)ip6h + hdr_length);
		/* Find a SCTP client stream for this packet. */
		/* Swap the two ports, matching the reversed addresses. */
		((uint16_t *)&ports)[0] = up[1];
		((uint16_t *)&ports)[1] = up[0];

		ira->ira_flags |= IRAF_ICMP_ERROR;
		ip_fanout_sctp(mp, NULL, &rip6h, ports, ira);
		ira->ira_flags &= ~IRAF_ICMP_ERROR;
		return;

	case IPPROTO_ESP:
	case IPPROTO_AH:
		if (!ipsec_loaded(ipss)) {
			ip_proto_not_sup(mp, ira);
			return;
		}

		if (nexthdr == IPPROTO_ESP)
			mp = ipsecesp_icmp_error(mp, ira);
		else
			mp = ipsecah_icmp_error(mp, ira);
		/* A NULL return means IPsec kept or dropped the message. */
		if (mp == NULL)
			return;

		/* Just in case ipsec didn't preserve the NULL b_cont */
		if (mp->b_cont != NULL) {
			if (!pullupmsg(mp, -1))
				goto drop_pkt;
		}

		/*
		 * If successful, the mp has been modified to not include
		 * the ESP/AH header so we can fanout to the ULP's icmp
		 * error handler.
		 */
		if (mp->b_wptr - mp->b_rptr < IPV6_HDR_LEN)
			goto drop_pkt;

		/* Re-locate the ICMPv6 header behind the outer header. */
		ip6h = (ip6_t *)mp->b_rptr;
		/* Don't call hdr_length_v6() unless you have to. */
		if (ip6h->ip6_nxt != IPPROTO_ICMPV6)
			hdr_length = ip_hdr_length_v6(mp, ip6h);
		else
			hdr_length = IPV6_HDR_LEN;

		/* Verify the modified message before any further processes. */
		icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
		if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
			freemsg(mp);
			return;
		}

		/* Recurse to fan out based on what was behind ESP/AH. */
		icmp_inbound_error_fanout_v6(mp, icmp6, ira);
		return;

	case IPPROTO_IPV6: {
		/* Look for self-encapsulated packets that caused an error */
		ip6_t *in_ip6h;

		in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length);

		/* Self-encapsulated: the 2nd and 3rd headers have equal addresses. */
		if (IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_src, &ip6h->ip6_src) &&
		    IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_dst, &ip6h->ip6_dst)) {
			/*
			 * Self-encapsulated case. As in the ipv4 case,
			 * we need to strip the 2nd IP header. Since mp
			 * is already pulled-up, we can simply bcopy
			 * the 3rd header + data over the 2nd header.
			 */
			uint16_t unused_len;

			/*
			 * Make sure we don't do recursion more than once:
			 * if the 3rd header carries IPv6 again, give up.
			 */
			if (!ip_hdr_length_nexthdr_v6(mp, in_ip6h,
			    &unused_len, &nexthdrp) ||
			    *nexthdrp == IPPROTO_IPV6) {
				goto drop_pkt;
			}

			/*
			 * Copy the 3rd header + remaining data on top
			 * of the 2nd header. Source and destination
			 * overlap here.
			 */
			bcopy(in_ip6h, ip6h, mp->b_wptr - (uchar_t *)in_ip6h);

			/*
			 * Subtract length of the 2nd header.
			 */
			mp->b_wptr -= hdr_length;

			/* Re-locate the ICMPv6 header behind the outer header. */
			ip6h = (ip6_t *)mp->b_rptr;
			/* Don't call hdr_length_v6() unless you have to. */
			if (ip6h->ip6_nxt != IPPROTO_ICMPV6)
				hdr_length = ip_hdr_length_v6(mp, ip6h);
			else
				hdr_length = IPV6_HDR_LEN;

			/*
			 * Verify the modified message before any further
			 * processes.
			 */
			icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
			if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
				freemsg(mp);
				return;
			}

			/*
			 * Now recurse, and see what I _really_ should be
			 * doing here.
			 */
			icmp_inbound_error_fanout_v6(mp, icmp6, ira);
			return;
		}
	}
	/* Not self-encapsulated: treat like any other IP-in-IP packet. */
	/* FALLTHROUGH */
	case IPPROTO_ENCAP:
		if ((connp = ipcl_iptun_classify_v6(&rip6h.ip6_src,
		    &rip6h.ip6_dst, ipst)) != NULL) {
			ira->ira_flags |= IRAF_ICMP_ERROR;
			connp->conn_recvicmp(connp, mp, NULL, ira);
			CONN_DEC_REF(connp);
			ira->ira_flags &= ~IRAF_ICMP_ERROR;
			return;
		}
		/*
		 * No IP tunnel is interested, fallthrough and see
		 * if a raw socket will want it.
		 */
		/* FALLTHROUGH */
	default:
		ira->ira_flags |= IRAF_ICMP_ERROR;
		ASSERT(ira->ira_protocol == nexthdr);
		ip_fanout_proto_v6(mp, &rip6h, ira);
		ira->ira_flags &= ~IRAF_ICMP_ERROR;
		return;
	}
	/* NOTREACHED */
drop_pkt:
	/* Malformed or unmatched error: count it and free the message. */
	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
	ip1dbg(("icmp_inbound_error_fanout_v6: drop pkt\n"));
	freemsg(mp);
}
995 
996 /*
997  * Process received IPv6 ICMP Redirect messages.
998  * Assumes the caller has verified that the headers are in the pulled up mblk.
999  * Consumes mp.
1000  */
1001 /* ARGSUSED */
1002 static void
1003 icmp_redirect_v6(mblk_t *mp, ip6_t *ip6h, nd_redirect_t *rd,
1004     ip_recv_attr_t *ira)
1005 {
1006 	ire_t		*ire, *nire;
1007 	ire_t		*prev_ire = NULL;
1008 	ire_t		*redir_ire;
1009 	in6_addr_t	*src, *dst, *gateway;
1010 	nd_opt_hdr_t	*opt;
1011 	nce_t		*nce;
1012 	int		ncec_flags = 0;
1013 	int		err = 0;
1014 	boolean_t	redirect_to_router = B_FALSE;
1015 	int		len;
1016 	int		optlen;
1017 	ill_t		*ill = ira->ira_rill;
1018 	ill_t		*rill = ira->ira_rill;
1019 	ip_stack_t	*ipst = ill->ill_ipst;
1020 
1021 	/*
1022 	 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
1023 	 * and make it be the IPMP upper so avoid being confused by a packet
1024 	 * addressed to a unicast address on a different ill.
1025 	 */
1026 	if (IS_UNDER_IPMP(rill)) {
1027 		rill = ipmp_ill_hold_ipmp_ill(rill);
1028 		if (rill == NULL) {
1029 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1030 			ip_drop_input("ipv6IfIcmpInBadRedirects - IPMP ill",
1031 			    mp, ill);
1032 			freemsg(mp);
1033 			return;
1034 		}
1035 		ASSERT(rill != ira->ira_rill);
1036 	}
1037 
1038 	len = mp->b_wptr - (uchar_t *)rd;
1039 	src = &ip6h->ip6_src;
1040 	dst = &rd->nd_rd_dst;
1041 	gateway = &rd->nd_rd_target;
1042 
1043 	/* Verify if it is a valid redirect */
1044 	if (!IN6_IS_ADDR_LINKLOCAL(src) ||
1045 	    (ip6h->ip6_hops != IPV6_MAX_HOPS) ||
1046 	    (rd->nd_rd_code != 0) ||
1047 	    (len < sizeof (nd_redirect_t)) ||
1048 	    (IN6_IS_ADDR_V4MAPPED(dst)) ||
1049 	    (IN6_IS_ADDR_MULTICAST(dst))) {
1050 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1051 		ip_drop_input("ipv6IfIcmpInBadRedirects - addr/len", mp, ill);
1052 		goto fail_redirect;
1053 	}
1054 
1055 	if (!(IN6_IS_ADDR_LINKLOCAL(gateway) ||
1056 	    IN6_ARE_ADDR_EQUAL(gateway, dst))) {
1057 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1058 		ip_drop_input("ipv6IfIcmpInBadRedirects - bad gateway",
1059 		    mp, ill);
1060 		goto fail_redirect;
1061 	}
1062 
1063 	optlen = len - sizeof (nd_redirect_t);
1064 	if (optlen != 0) {
1065 		if (!ndp_verify_optlen((nd_opt_hdr_t *)&rd[1], optlen)) {
1066 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1067 			ip_drop_input("ipv6IfIcmpInBadRedirects - options",
1068 			    mp, ill);
1069 			goto fail_redirect;
1070 		}
1071 	}
1072 
1073 	if (!IN6_ARE_ADDR_EQUAL(gateway, dst)) {
1074 		redirect_to_router = B_TRUE;
1075 		ncec_flags |= NCE_F_ISROUTER;
1076 	} else {
1077 		gateway = dst;	/* Add nce for dst */
1078 	}
1079 
1080 
1081 	/*
1082 	 * Verify that the IP source address of the redirect is
1083 	 * the same as the current first-hop router for the specified
1084 	 * ICMP destination address.
1085 	 * Also, Make sure we had a route for the dest in question and
1086 	 * that route was pointing to the old gateway (the source of the
1087 	 * redirect packet.)
1088 	 * We do longest match and then compare ire_gateway_addr_v6 below.
1089 	 */
1090 	prev_ire = ire_ftable_lookup_v6(dst, 0, 0, 0, rill,
1091 	    ALL_ZONES, NULL, MATCH_IRE_ILL, 0, ipst, NULL);
1092 
1093 	/*
1094 	 * Check that
1095 	 *	the redirect was not from ourselves
1096 	 *	old gateway is still directly reachable
1097 	 */
1098 	if (prev_ire == NULL ||
1099 	    (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) ||
1100 	    (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1101 	    !IN6_ARE_ADDR_EQUAL(src, &prev_ire->ire_gateway_addr_v6)) {
1102 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1103 		ip_drop_input("ipv6IfIcmpInBadRedirects - ire", mp, ill);
1104 		goto fail_redirect;
1105 	}
1106 
1107 	ASSERT(prev_ire->ire_ill != NULL);
1108 	if (prev_ire->ire_ill->ill_flags & ILLF_NONUD)
1109 		ncec_flags |= NCE_F_NONUD;
1110 
1111 	opt = (nd_opt_hdr_t *)&rd[1];
1112 	opt = ndp_get_option(opt, optlen, ND_OPT_TARGET_LINKADDR);
1113 	if (opt != NULL) {
1114 		err = nce_lookup_then_add_v6(rill,
1115 		    (uchar_t *)&opt[1],		/* Link layer address */
1116 		    rill->ill_phys_addr_length,
1117 		    gateway, ncec_flags, ND_STALE, &nce);
1118 		switch (err) {
1119 		case 0:
1120 			nce_refrele(nce);
1121 			break;
1122 		case EEXIST:
1123 			/*
1124 			 * Check to see if link layer address has changed and
1125 			 * process the ncec_state accordingly.
1126 			 */
1127 			nce_process(nce->nce_common,
1128 			    (uchar_t *)&opt[1], 0, B_FALSE);
1129 			nce_refrele(nce);
1130 			break;
1131 		default:
1132 			ip1dbg(("icmp_redirect_v6: NCE create failed %d\n",
1133 			    err));
1134 			goto fail_redirect;
1135 		}
1136 	}
1137 	if (redirect_to_router) {
1138 		ASSERT(IN6_IS_ADDR_LINKLOCAL(gateway));
1139 
1140 		/*
1141 		 * Create a Route Association.  This will allow us to remember
1142 		 * a router told us to use the particular gateway.
1143 		 */
1144 		ire = ire_create_v6(
1145 		    dst,
1146 		    &ipv6_all_ones,		/* mask */
1147 		    gateway,			/* gateway addr */
1148 		    IRE_HOST,
1149 		    prev_ire->ire_ill,
1150 		    ALL_ZONES,
1151 		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
1152 		    NULL,
1153 		    ipst);
1154 	} else {
1155 		ipif_t *ipif;
1156 		in6_addr_t gw;
1157 
1158 		/*
1159 		 * Just create an on link entry, i.e. interface route.
1160 		 * The gateway field is our link-local on the ill.
1161 		 */
1162 		mutex_enter(&rill->ill_lock);
1163 		for (ipif = rill->ill_ipif; ipif != NULL;
1164 		    ipif = ipif->ipif_next) {
1165 			if (!(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1166 			    IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))
1167 				break;
1168 		}
1169 		if (ipif == NULL) {
1170 			/* We have no link-local address! */
1171 			mutex_exit(&rill->ill_lock);
1172 			goto fail_redirect;
1173 		}
1174 		gw = ipif->ipif_v6lcl_addr;
1175 		mutex_exit(&rill->ill_lock);
1176 
1177 		ire = ire_create_v6(
1178 		    dst,				/* gateway == dst */
1179 		    &ipv6_all_ones,			/* mask */
1180 		    &gw,				/* gateway addr */
1181 		    rill->ill_net_type,			/* IF_[NO]RESOLVER */
1182 		    prev_ire->ire_ill,
1183 		    ALL_ZONES,
1184 		    (RTF_DYNAMIC | RTF_HOST),
1185 		    NULL,
1186 		    ipst);
1187 	}
1188 
1189 	if (ire == NULL)
1190 		goto fail_redirect;
1191 
1192 	nire = ire_add(ire);
1193 	/* Check if it was a duplicate entry */
1194 	if (nire != NULL && nire != ire) {
1195 		ASSERT(nire->ire_identical_ref > 1);
1196 		ire_delete(nire);
1197 		ire_refrele(nire);
1198 		nire = NULL;
1199 	}
1200 	ire = nire;
1201 	if (ire != NULL) {
1202 		ire_refrele(ire);		/* Held in ire_add */
1203 
1204 		/* tell routing sockets that we received a redirect */
1205 		ip_rts_change_v6(RTM_REDIRECT,
1206 		    &rd->nd_rd_dst,
1207 		    &rd->nd_rd_target,
1208 		    &ipv6_all_ones, 0, src,
1209 		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
1210 		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
1211 
1212 		/*
1213 		 * Delete any existing IRE_HOST type ires for this destination.
1214 		 * This together with the added IRE has the effect of
1215 		 * modifying an existing redirect.
1216 		 */
1217 		redir_ire = ire_ftable_lookup_v6(dst, 0, src, IRE_HOST,
1218 		    prev_ire->ire_ill, ALL_ZONES, NULL,
1219 		    (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst,
1220 		    NULL);
1221 
1222 		if (redir_ire != NULL) {
1223 			if (redir_ire->ire_flags & RTF_DYNAMIC)
1224 				ire_delete(redir_ire);
1225 			ire_refrele(redir_ire);
1226 		}
1227 	}
1228 
1229 	ire_refrele(prev_ire);
1230 	prev_ire = NULL;
1231 
1232 fail_redirect:
1233 	if (prev_ire != NULL)
1234 		ire_refrele(prev_ire);
1235 	freemsg(mp);
1236 	if (rill != ira->ira_rill)
1237 		ill_refrele(rill);
1238 }
1239 
1240 /*
1241  * Build and ship an IPv6 ICMP message using the packet data in mp,
1242  * and the ICMP header pointed to by "stuff".  (May be called as
1243  * writer.)
1244  * Note: assumes that icmp_pkt_err_ok_v6 has been called to
1245  * verify that an icmp error packet can be sent.
1246  *
1247  * If v6src_ptr is set use it as a source. Otherwise select a reasonable
1248  * source address (see above function).
1249  */
1250 static void
1251 icmp_pkt_v6(mblk_t *mp, void *stuff, size_t len,
1252     const in6_addr_t *v6src_ptr, ip_recv_attr_t *ira)
1253 {
1254 	ip6_t		*ip6h;
1255 	in6_addr_t	v6dst;
1256 	size_t		len_needed;
1257 	size_t		msg_len;
1258 	mblk_t		*mp1;
1259 	icmp6_t		*icmp6;
1260 	in6_addr_t	v6src;
1261 	ill_t		*ill = ira->ira_ill;
1262 	ip_stack_t	*ipst = ill->ill_ipst;
1263 	ip_xmit_attr_t	ixas;
1264 
1265 	ip6h = (ip6_t *)mp->b_rptr;
1266 
1267 	bzero(&ixas, sizeof (ixas));
1268 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
1269 	ixas.ixa_zoneid = ira->ira_zoneid;
1270 	ixas.ixa_ifindex = 0;
1271 	ixas.ixa_ipst = ipst;
1272 	ixas.ixa_cred = kcred;
1273 	ixas.ixa_cpid = NOPID;
1274 	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
1275 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1276 
1277 	/*
1278 	 * If the source of the original packet was link-local, then
1279 	 * make sure we send on the same ill (group) as we received it on.
1280 	 */
1281 	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
1282 		ixas.ixa_flags |= IXAF_SCOPEID_SET;
1283 		if (IS_UNDER_IPMP(ill))
1284 			ixas.ixa_scopeid = ill_get_upper_ifindex(ill);
1285 		else
1286 			ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex;
1287 	}
1288 
1289 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
1290 		/*
1291 		 * Apply IPsec based on how IPsec was applied to
1292 		 * the packet that had the error.
1293 		 *
1294 		 * If it was an outbound packet that caused the ICMP
1295 		 * error, then the caller will have setup the IRA
1296 		 * appropriately.
1297 		 */
1298 		if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) {
1299 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
1300 			/* Note: mp already consumed and ip_drop_packet done */
1301 			return;
1302 		}
1303 	} else {
1304 		/*
1305 		 * This is in clear. The icmp message we are building
1306 		 * here should go out in clear, independent of our policy.
1307 		 */
1308 		ixas.ixa_flags |= IXAF_NO_IPSEC;
1309 	}
1310 
1311 	/*
1312 	 * If the caller specified the source we use that.
1313 	 * Otherwise, if the packet was for one of our unicast addresses, make
1314 	 * sure we respond with that as the source. Otherwise
1315 	 * have ip_output_simple pick the source address.
1316 	 */
1317 	if (v6src_ptr != NULL) {
1318 		v6src = *v6src_ptr;
1319 	} else {
1320 		ire_t *ire;
1321 		uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY;
1322 
1323 		if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src) ||
1324 		    IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst))
1325 			match_flags |= MATCH_IRE_ILL;
1326 
1327 		ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0,
1328 		    (IRE_LOCAL|IRE_LOOPBACK), ill, ira->ira_zoneid, NULL,
1329 		    match_flags, 0, ipst, NULL);
1330 		if (ire != NULL) {
1331 			v6src = ip6h->ip6_dst;
1332 			ire_refrele(ire);
1333 		} else {
1334 			v6src = ipv6_all_zeros;
1335 			ixas.ixa_flags |= IXAF_SET_SOURCE;
1336 		}
1337 	}
1338 	v6dst = ip6h->ip6_src;
1339 	len_needed = ipst->ips_ipv6_icmp_return - IPV6_HDR_LEN - len;
1340 	msg_len = msgdsize(mp);
1341 	if (msg_len > len_needed) {
1342 		if (!adjmsg(mp, len_needed - msg_len)) {
1343 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
1344 			freemsg(mp);
1345 			return;
1346 		}
1347 		msg_len = len_needed;
1348 	}
1349 	mp1 = allocb(IPV6_HDR_LEN + len, BPRI_MED);
1350 	if (mp1 == NULL) {
1351 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
1352 		freemsg(mp);
1353 		return;
1354 	}
1355 	mp1->b_cont = mp;
1356 	mp = mp1;
1357 
1358 	/*
1359 	 * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this
1360 	 * node generates be accepted in peace by all on-host destinations.
1361 	 * If we do NOT assume that all on-host destinations trust
1362 	 * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
1363 	 * (Look for IXAF_TRUSTED_ICMP).
1364 	 */
1365 	ixas.ixa_flags |= IXAF_TRUSTED_ICMP;
1366 
1367 	ip6h = (ip6_t *)mp->b_rptr;
1368 	mp1->b_wptr = (uchar_t *)ip6h + (IPV6_HDR_LEN + len);
1369 
1370 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
1371 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
1372 	ip6h->ip6_hops = ipst->ips_ipv6_def_hops;
1373 	ip6h->ip6_dst = v6dst;
1374 	ip6h->ip6_src = v6src;
1375 	msg_len += IPV6_HDR_LEN + len;
1376 	if (msg_len > IP_MAXPACKET + IPV6_HDR_LEN) {
1377 		(void) adjmsg(mp, IP_MAXPACKET + IPV6_HDR_LEN - msg_len);
1378 		msg_len = IP_MAXPACKET + IPV6_HDR_LEN;
1379 	}
1380 	ip6h->ip6_plen = htons((uint16_t)(msgdsize(mp) - IPV6_HDR_LEN));
1381 	icmp6 = (icmp6_t *)&ip6h[1];
1382 	bcopy(stuff, (char *)icmp6, len);
1383 	/*
1384 	 * Prepare for checksum by putting icmp length in the icmp
1385 	 * checksum field. The checksum is calculated in ip_output_wire_v6.
1386 	 */
1387 	icmp6->icmp6_cksum = ip6h->ip6_plen;
1388 	if (icmp6->icmp6_type == ND_REDIRECT) {
1389 		ip6h->ip6_hops = IPV6_MAX_HOPS;
1390 	}
1391 
1392 	(void) ip_output_simple(mp, &ixas);
1393 	ixa_cleanup(&ixas);
1394 }
1395 
1396 /*
1397  * Update the output mib when ICMPv6 packets are sent.
1398  */
1399 void
1400 icmp_update_out_mib_v6(ill_t *ill, icmp6_t *icmp6)
1401 {
1402 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutMsgs);
1403 
1404 	switch (icmp6->icmp6_type) {
1405 	case ICMP6_DST_UNREACH:
1406 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutDestUnreachs);
1407 		if (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN)
1408 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutAdminProhibs);
1409 		break;
1410 
1411 	case ICMP6_TIME_EXCEEDED:
1412 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutTimeExcds);
1413 		break;
1414 
1415 	case ICMP6_PARAM_PROB:
1416 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutParmProblems);
1417 		break;
1418 
1419 	case ICMP6_PACKET_TOO_BIG:
1420 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutPktTooBigs);
1421 		break;
1422 
1423 	case ICMP6_ECHO_REQUEST:
1424 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutEchos);
1425 		break;
1426 
1427 	case ICMP6_ECHO_REPLY:
1428 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutEchoReplies);
1429 		break;
1430 
1431 	case ND_ROUTER_SOLICIT:
1432 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutRouterSolicits);
1433 		break;
1434 
1435 	case ND_ROUTER_ADVERT:
1436 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutRouterAdvertisements);
1437 		break;
1438 
1439 	case ND_NEIGHBOR_SOLICIT:
1440 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutNeighborSolicits);
1441 		break;
1442 
1443 	case ND_NEIGHBOR_ADVERT:
1444 		BUMP_MIB(ill->ill_icmp6_mib,
1445 		    ipv6IfIcmpOutNeighborAdvertisements);
1446 		break;
1447 
1448 	case ND_REDIRECT:
1449 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutRedirects);
1450 		break;
1451 
1452 	case MLD_LISTENER_QUERY:
1453 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutGroupMembQueries);
1454 		break;
1455 
1456 	case MLD_LISTENER_REPORT:
1457 	case MLD_V2_LISTENER_REPORT:
1458 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutGroupMembResponses);
1459 		break;
1460 
1461 	case MLD_LISTENER_REDUCTION:
1462 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutGroupMembReductions);
1463 		break;
1464 	}
1465 }
1466 
1467 /*
1468  * Check if it is ok to send an ICMPv6 error packet in
1469  * response to the IP packet in mp.
1470  * Free the message and return null if no
1471  * ICMP error packet should be sent.
1472  */
1473 static mblk_t *
1474 icmp_pkt_err_ok_v6(mblk_t *mp, boolean_t mcast_ok, ip_recv_attr_t *ira)
1475 {
1476 	ill_t		*ill = ira->ira_ill;
1477 	ip_stack_t	*ipst = ill->ill_ipst;
1478 	boolean_t	llbcast;
1479 	ip6_t		*ip6h;
1480 
1481 	if (!mp)
1482 		return (NULL);
1483 
1484 	/* We view multicast and broadcast as the same.. */
1485 	llbcast = (ira->ira_flags &
1486 	    (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) != 0;
1487 	ip6h = (ip6_t *)mp->b_rptr;
1488 
1489 	/* Check if source address uniquely identifies the host */
1490 
1491 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src) ||
1492 	    IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_src) ||
1493 	    IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
1494 		freemsg(mp);
1495 		return (NULL);
1496 	}
1497 
1498 	if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
1499 		size_t	len_needed = IPV6_HDR_LEN + ICMP6_MINLEN;
1500 		icmp6_t		*icmp6;
1501 
1502 		if (mp->b_wptr - mp->b_rptr < len_needed) {
1503 			if (!pullupmsg(mp, len_needed)) {
1504 				BUMP_MIB(ill->ill_icmp6_mib,
1505 				    ipv6IfIcmpInErrors);
1506 				freemsg(mp);
1507 				return (NULL);
1508 			}
1509 			ip6h = (ip6_t *)mp->b_rptr;
1510 		}
1511 		icmp6 = (icmp6_t *)&ip6h[1];
1512 		/* Explicitly do not generate errors in response to redirects */
1513 		if (ICMP6_IS_ERROR(icmp6->icmp6_type) ||
1514 		    icmp6->icmp6_type == ND_REDIRECT) {
1515 			freemsg(mp);
1516 			return (NULL);
1517 		}
1518 	}
1519 	/*
1520 	 * Check that the destination is not multicast and that the packet
1521 	 * was not sent on link layer broadcast or multicast.  (Exception
1522 	 * is Packet too big message as per the draft - when mcast_ok is set.)
1523 	 */
1524 	if (!mcast_ok &&
1525 	    (llbcast || IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))) {
1526 		freemsg(mp);
1527 		return (NULL);
1528 	}
1529 	/*
1530 	 * If this is a labeled system, then check to see if we're allowed to
1531 	 * send a response to this particular sender.  If not, then just drop.
1532 	 */
1533 	if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
1534 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
1535 		freemsg(mp);
1536 		return (NULL);
1537 	}
1538 
1539 	if (icmp_err_rate_limit(ipst)) {
1540 		/*
1541 		 * Only send ICMP error packets every so often.
1542 		 * This should be done on a per port/source basis,
1543 		 * but for now this will suffice.
1544 		 */
1545 		freemsg(mp);
1546 		return (NULL);
1547 	}
1548 	return (mp);
1549 }
1550 
1551 /*
1552  * Called when a packet was sent out the same link that it arrived on.
1553  * Check if it is ok to send a redirect and then send it.
1554  */
1555 void
1556 ip_send_potential_redirect_v6(mblk_t *mp, ip6_t *ip6h, ire_t *ire,
1557     ip_recv_attr_t *ira)
1558 {
1559 	ill_t		*ill = ira->ira_ill;
1560 	ip_stack_t	*ipst = ill->ill_ipst;
1561 	in6_addr_t	*v6targ;
1562 	ire_t		*src_ire_v6 = NULL;
1563 	mblk_t		*mp1;
1564 	ire_t		*nhop_ire = NULL;
1565 
1566 	/*
1567 	 * Don't send a redirect when forwarding a source
1568 	 * routed packet.
1569 	 */
1570 	if (ip_source_routed_v6(ip6h, mp, ipst))
1571 		return;
1572 
1573 	if (ire->ire_type & IRE_ONLINK) {
1574 		/* Target is directly connected */
1575 		v6targ = &ip6h->ip6_dst;
1576 	} else {
1577 		/* Determine the most specific IRE used to send the packets */
1578 		nhop_ire = ire_nexthop(ire);
1579 		if (nhop_ire == NULL)
1580 			return;
1581 
1582 		/*
1583 		 * We won't send redirects to a router
1584 		 * that doesn't have a link local
1585 		 * address, but will forward.
1586 		 */
1587 		if (!IN6_IS_ADDR_LINKLOCAL(&nhop_ire->ire_addr_v6)) {
1588 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
1589 			ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
1590 			ire_refrele(nhop_ire);
1591 			return;
1592 		}
1593 		v6targ = &nhop_ire->ire_addr_v6;
1594 	}
1595 	src_ire_v6 = ire_ftable_lookup_v6(&ip6h->ip6_src,
1596 	    NULL, NULL, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
1597 	    MATCH_IRE_ILL | MATCH_IRE_TYPE, 0, ipst, NULL);
1598 
1599 	if (src_ire_v6 == NULL) {
1600 		if (nhop_ire != NULL)
1601 			ire_refrele(nhop_ire);
1602 		return;
1603 	}
1604 
1605 	/*
1606 	 * The source is directly connected.
1607 	 */
1608 	mp1 = copymsg(mp);
1609 	if (mp1 != NULL)
1610 		icmp_send_redirect_v6(mp1, v6targ, &ip6h->ip6_dst, ira);
1611 
1612 	if (nhop_ire != NULL)
1613 		ire_refrele(nhop_ire);
1614 	ire_refrele(src_ire_v6);
1615 }
1616 
1617 /*
1618  * Generate an ICMPv6 redirect message.
1619  * Include target link layer address option if it exits.
1620  * Always include redirect header.
1621  */
1622 static void
1623 icmp_send_redirect_v6(mblk_t *mp, in6_addr_t *targetp, in6_addr_t *dest,
1624     ip_recv_attr_t *ira)
1625 {
1626 	nd_redirect_t	*rd;
1627 	nd_opt_rd_hdr_t	*rdh;
1628 	uchar_t		*buf;
1629 	ncec_t		*ncec = NULL;
1630 	nd_opt_hdr_t	*opt;
1631 	int		len;
1632 	int		ll_opt_len = 0;
1633 	int		max_redir_hdr_data_len;
1634 	int		pkt_len;
1635 	in6_addr_t	*srcp;
1636 	ill_t		*ill;
1637 	boolean_t	need_refrele;
1638 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
1639 
1640 	mp = icmp_pkt_err_ok_v6(mp, B_FALSE, ira);
1641 	if (mp == NULL)
1642 		return;
1643 
1644 	if (IS_UNDER_IPMP(ira->ira_ill)) {
1645 		ill = ipmp_ill_hold_ipmp_ill(ira->ira_ill);
1646 		if (ill == NULL) {
1647 			ill = ira->ira_ill;
1648 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1649 			ip_drop_output("no IPMP ill for sending redirect",
1650 			    mp, ill);
1651 			freemsg(mp);
1652 			return;
1653 		}
1654 		need_refrele = B_TRUE;
1655 	} else {
1656 		ill = ira->ira_ill;
1657 		need_refrele = B_FALSE;
1658 	}
1659 
1660 	ncec = ncec_lookup_illgrp_v6(ill, targetp);
1661 	if (ncec != NULL && ncec->ncec_state != ND_INCOMPLETE &&
1662 	    ncec->ncec_lladdr != NULL) {
1663 		ll_opt_len = (sizeof (nd_opt_hdr_t) +
1664 		    ill->ill_phys_addr_length + 7)/8 * 8;
1665 	}
1666 	len = sizeof (nd_redirect_t) + sizeof (nd_opt_rd_hdr_t) + ll_opt_len;
1667 	ASSERT(len % 4 == 0);
1668 	buf = kmem_alloc(len, KM_NOSLEEP);
1669 	if (buf == NULL) {
1670 		if (ncec != NULL)
1671 			ncec_refrele(ncec);
1672 		if (need_refrele)
1673 			ill_refrele(ill);
1674 		freemsg(mp);
1675 		return;
1676 	}
1677 
1678 	rd = (nd_redirect_t *)buf;
1679 	rd->nd_rd_type = (uint8_t)ND_REDIRECT;
1680 	rd->nd_rd_code = 0;
1681 	rd->nd_rd_reserved = 0;
1682 	rd->nd_rd_target = *targetp;
1683 	rd->nd_rd_dst = *dest;
1684 
1685 	opt = (nd_opt_hdr_t *)(buf + sizeof (nd_redirect_t));
1686 	if (ncec != NULL && ll_opt_len != 0) {
1687 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
1688 		opt->nd_opt_len = ll_opt_len/8;
1689 		bcopy((char *)ncec->ncec_lladdr, &opt[1],
1690 		    ill->ill_phys_addr_length);
1691 	}
1692 	if (ncec != NULL)
1693 		ncec_refrele(ncec);
1694 	rdh = (nd_opt_rd_hdr_t *)(buf + sizeof (nd_redirect_t) + ll_opt_len);
1695 	rdh->nd_opt_rh_type = (uint8_t)ND_OPT_REDIRECTED_HEADER;
1696 	/* max_redir_hdr_data_len and nd_opt_rh_len must be multiple of 8 */
1697 	max_redir_hdr_data_len =
1698 	    (ipst->ips_ipv6_icmp_return - IPV6_HDR_LEN - len)/8*8;
1699 	pkt_len = msgdsize(mp);
1700 	/* Make sure mp is 8 byte aligned */
1701 	if (pkt_len > max_redir_hdr_data_len) {
1702 		rdh->nd_opt_rh_len = (max_redir_hdr_data_len +
1703 		    sizeof (nd_opt_rd_hdr_t))/8;
1704 		(void) adjmsg(mp, max_redir_hdr_data_len - pkt_len);
1705 	} else {
1706 		rdh->nd_opt_rh_len = (pkt_len + sizeof (nd_opt_rd_hdr_t))/8;
1707 		(void) adjmsg(mp, -(pkt_len % 8));
1708 	}
1709 	rdh->nd_opt_rh_reserved1 = 0;
1710 	rdh->nd_opt_rh_reserved2 = 0;
1711 	/* ipif_v6lcl_addr contains the link-local source address */
1712 	srcp = &ill->ill_ipif->ipif_v6lcl_addr;
1713 
1714 	/* Redirects sent by router, and router is global zone */
1715 	ASSERT(ira->ira_zoneid == ALL_ZONES);
1716 	ira->ira_zoneid = GLOBAL_ZONEID;
1717 	icmp_pkt_v6(mp, buf, len, srcp, ira);
1718 	kmem_free(buf, len);
1719 	if (need_refrele)
1720 		ill_refrele(ill);
1721 }
1722 
1723 
1724 /* Generate an ICMP time exceeded message.  (May be called as writer.) */
1725 void
1726 icmp_time_exceeded_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok,
1727     ip_recv_attr_t *ira)
1728 {
1729 	icmp6_t	icmp6;
1730 
1731 	mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1732 	if (mp == NULL)
1733 		return;
1734 
1735 	bzero(&icmp6, sizeof (icmp6_t));
1736 	icmp6.icmp6_type = ICMP6_TIME_EXCEEDED;
1737 	icmp6.icmp6_code = code;
1738 	icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1739 }
1740 
1741 /*
1742  * Generate an ICMP unreachable message.
1743  * When called from ip_output side a minimal ip_recv_attr_t needs to be
1744  * constructed by the caller.
1745  */
1746 void
1747 icmp_unreachable_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok,
1748     ip_recv_attr_t *ira)
1749 {
1750 	icmp6_t	icmp6;
1751 
1752 	mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1753 	if (mp == NULL)
1754 		return;
1755 
1756 	bzero(&icmp6, sizeof (icmp6_t));
1757 	icmp6.icmp6_type = ICMP6_DST_UNREACH;
1758 	icmp6.icmp6_code = code;
1759 	icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1760 }
1761 
1762 /*
1763  * Generate an ICMP pkt too big message.
1764  * When called from ip_output side a minimal ip_recv_attr_t needs to be
1765  * constructed by the caller.
1766  */
1767 void
1768 icmp_pkt2big_v6(mblk_t *mp, uint32_t mtu, boolean_t mcast_ok,
1769     ip_recv_attr_t *ira)
1770 {
1771 	icmp6_t	icmp6;
1772 
1773 	mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1774 	if (mp == NULL)
1775 		return;
1776 
1777 	bzero(&icmp6, sizeof (icmp6_t));
1778 	icmp6.icmp6_type = ICMP6_PACKET_TOO_BIG;
1779 	icmp6.icmp6_code = 0;
1780 	icmp6.icmp6_mtu = htonl(mtu);
1781 
1782 	icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1783 }
1784 
1785 /*
1786  * Generate an ICMP parameter problem message. (May be called as writer.)
1787  * 'offset' is the offset from the beginning of the packet in error.
1788  * When called from ip_output side a minimal ip_recv_attr_t needs to be
1789  * constructed by the caller.
1790  */
1791 static void
1792 icmp_param_problem_v6(mblk_t *mp, uint8_t code, uint32_t offset,
1793     boolean_t mcast_ok, ip_recv_attr_t *ira)
1794 {
1795 	icmp6_t	icmp6;
1796 
1797 	mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1798 	if (mp == NULL)
1799 		return;
1800 
1801 	bzero((char *)&icmp6, sizeof (icmp6_t));
1802 	icmp6.icmp6_type = ICMP6_PARAM_PROB;
1803 	icmp6.icmp6_code = code;
1804 	icmp6.icmp6_pptr = htonl(offset);
1805 	icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1806 }
1807 
1808 void
1809 icmp_param_problem_nexthdr_v6(mblk_t *mp, boolean_t mcast_ok,
1810     ip_recv_attr_t *ira)
1811 {
1812 	ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
1813 	uint16_t	hdr_length;
1814 	uint8_t		*nexthdrp;
1815 	uint32_t	offset;
1816 	ill_t		*ill = ira->ira_ill;
1817 
1818 	/* Determine the offset of the bad nexthdr value */
1819 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h,	&hdr_length, &nexthdrp)) {
1820 		/* Malformed packet */
1821 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1822 		ip_drop_input("ipIfStatsInDiscards", mp, ill);
1823 		freemsg(mp);
1824 		return;
1825 	}
1826 
1827 	offset = nexthdrp - mp->b_rptr;
1828 	icmp_param_problem_v6(mp, ICMP6_PARAMPROB_NEXTHEADER, offset,
1829 	    mcast_ok, ira);
1830 }
1831 
1832 /*
1833  * Verify whether or not the IP address is a valid local address.
1834  * Could be a unicast, including one for a down interface.
1835  * If allow_mcbc then a multicast or broadcast address is also
1836  * acceptable.
1837  *
1838  * In the case of a multicast address, however, the
1839  * upper protocol is expected to reset the src address
1840  * to zero when we return IPVL_MCAST so that
1841  * no packets are emitted with multicast address as
1842  * source address.
1843  * The addresses valid for bind are:
1844  *	(1) - in6addr_any
1845  *	(2) - IP address of an UP interface
1846  *	(3) - IP address of a DOWN interface
1847  *	(4) - a multicast address. In this case
1848  *	the conn will only receive packets destined to
1849  *	the specified multicast address. Note: the
1850  *	application still has to issue an
1851  *	IPV6_JOIN_GROUP socket option.
1852  *
1853  * In all the above cases, the bound address must be valid in the current zone.
1854  * When the address is loopback or multicast, there might be many matching IREs
1855  * so bind has to look up based on the zone.
1856  */
1857 ip_laddr_t
1858 ip_laddr_verify_v6(const in6_addr_t *v6src, zoneid_t zoneid,
1859     ip_stack_t *ipst, boolean_t allow_mcbc, uint_t scopeid)
1860 {
1861 	ire_t		*src_ire;
1862 	uint_t		match_flags;
1863 	ill_t		*ill = NULL;
1864 
1865 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6src));
1866 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(v6src));
1867 
1868 	match_flags = MATCH_IRE_ZONEONLY;
1869 	if (scopeid != 0) {
1870 		ill = ill_lookup_on_ifindex(scopeid, B_TRUE, ipst);
1871 		if (ill == NULL)
1872 			return (IPVL_BAD);
1873 		match_flags |= MATCH_IRE_ILL;
1874 	}
1875 
1876 	src_ire = ire_ftable_lookup_v6(v6src, NULL, NULL, 0,
1877 	    ill, zoneid, NULL, match_flags, 0, ipst, NULL);
1878 	if (ill != NULL)
1879 		ill_refrele(ill);
1880 
1881 	/*
1882 	 * If an address other than in6addr_any is requested,
1883 	 * we verify that it is a valid address for bind
1884 	 * Note: Following code is in if-else-if form for
1885 	 * readability compared to a condition check.
1886 	 */
1887 	if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) {
1888 		/*
1889 		 * (2) Bind to address of local UP interface
1890 		 */
1891 		ire_refrele(src_ire);
1892 		return (IPVL_UNICAST_UP);
1893 	} else if (IN6_IS_ADDR_MULTICAST(v6src)) {
1894 		/* (4) bind to multicast address. */
1895 		if (src_ire != NULL)
1896 			ire_refrele(src_ire);
1897 
1898 		/*
1899 		 * Note: caller should take IPV6_MULTICAST_IF
1900 		 * into account when selecting a real source address.
1901 		 */
1902 		if (allow_mcbc)
1903 			return (IPVL_MCAST);
1904 		else
1905 			return (IPVL_BAD);
1906 	} else {
1907 		ipif_t *ipif;
1908 
1909 		/*
1910 		 * (3) Bind to address of local DOWN interface?
1911 		 * (ipif_lookup_addr() looks up all interfaces
1912 		 * but we do not get here for UP interfaces
1913 		 * - case (2) above)
1914 		 */
1915 		if (src_ire != NULL)
1916 			ire_refrele(src_ire);
1917 
1918 		ipif = ipif_lookup_addr_v6(v6src, NULL, zoneid, ipst);
1919 		if (ipif == NULL)
1920 			return (IPVL_BAD);
1921 
1922 		/* Not a useful source? */
1923 		if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) {
1924 			ipif_refrele(ipif);
1925 			return (IPVL_BAD);
1926 		}
1927 		ipif_refrele(ipif);
1928 		return (IPVL_UNICAST_DOWN);
1929 	}
1930 }
1931 
1932 /*
1933  * Verify that both the source and destination addresses are valid.  If
1934  * IPDF_VERIFY_DST is not set, then the destination address may be unreachable,
1935  * i.e. have no route to it.  Protocols like TCP want to verify destination
1936  * reachability, while tunnels do not.
1937  *
1938  * Determine the route, the interface, and (optionally) the source address
1939  * to use to reach a given destination.
1940  * Note that we allow connect to broadcast and multicast addresses when
1941  * IPDF_ALLOW_MCBC is set.
1942  * first_hop and dst_addr are normally the same, but if source routing
1943  * they will differ; in that case the first_hop is what we'll use for the
1944  * routing lookup but the dce and label checks will be done on dst_addr,
1945  *
1946  * If uinfo is set, then we fill in the best available information
1947  * we have for the destination. This is based on (in priority order) any
1948  * metrics and path MTU stored in a dce_t, route metrics, and finally the
1949  * ill_mtu/ill_mc_mtu.
1950  *
1951  * Tsol note: If we have a source route then dst_addr != firsthop. But we
1952  * always do the label check on dst_addr.
1953  *
1954  * Assumes that the caller has set ixa_scopeid for link-local communication.
1955  */
1956 int
1957 ip_set_destination_v6(in6_addr_t *src_addrp, const in6_addr_t *dst_addr,
1958     const in6_addr_t *firsthop, ip_xmit_attr_t *ixa, iulp_t *uinfo,
1959     uint32_t flags, uint_t mac_mode)
1960 {
1961 	ire_t		*ire;
1962 	int		error = 0;
1963 	in6_addr_t	setsrc;				/* RTF_SETSRC */
1964 	zoneid_t	zoneid = ixa->ixa_zoneid;	/* Honors SO_ALLZONES */
1965 	ip_stack_t	*ipst = ixa->ixa_ipst;
1966 	dce_t		*dce;
1967 	uint_t		pmtu;
1968 	uint_t		ifindex;
1969 	uint_t		generation;
1970 	nce_t		*nce;
1971 	ill_t		*ill = NULL;
1972 	boolean_t	multirt = B_FALSE;
1973 
1974 	ASSERT(!IN6_IS_ADDR_V4MAPPED(dst_addr));
1975 
1976 	ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
1977 
1978 	/*
1979 	 * We never send to zero; the ULPs map it to the loopback address.
1980 	 * We can't allow it since we use zero to mean unitialized in some
1981 	 * places.
1982 	 */
1983 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(dst_addr));
1984 
1985 	if (is_system_labeled()) {
1986 		ts_label_t *tsl = NULL;
1987 
1988 		error = tsol_check_dest(ixa->ixa_tsl, dst_addr, IPV6_VERSION,
1989 		    mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl);
1990 		if (error != 0)
1991 			return (error);
1992 		if (tsl != NULL) {
1993 			/* Update the label */
1994 			ip_xmit_attr_replace_tsl(ixa, tsl);
1995 		}
1996 	}
1997 
1998 	setsrc = ipv6_all_zeros;
1999 	/*
2000 	 * Select a route; For IPMP interfaces, we would only select
2001 	 * a "hidden" route (i.e., going through a specific under_ill)
2002 	 * if ixa_ifindex has been specified.
2003 	 */
2004 	ire = ip_select_route_v6(firsthop, *src_addrp, ixa, &generation,
2005 	    &setsrc, &error, &multirt);
2006 	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
2007 	if (error != 0)
2008 		goto bad_addr;
2009 
2010 	/*
2011 	 * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set.
2012 	 * If IPDF_VERIFY_DST is set, the destination must be reachable.
2013 	 * Otherwise the destination needn't be reachable.
2014 	 *
2015 	 * If we match on a reject or black hole, then we've got a
2016 	 * local failure.  May as well fail out the connect() attempt,
2017 	 * since it's never going to succeed.
2018 	 */
2019 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2020 		/*
2021 		 * If we're verifying destination reachability, we always want
2022 		 * to complain here.
2023 		 *
2024 		 * If we're not verifying destination reachability but the
2025 		 * destination has a route, we still want to fail on the
2026 		 * temporary address and broadcast address tests.
2027 		 *
2028 		 * In both cases do we let the code continue so some reasonable
2029 		 * information is returned to the caller. That enables the
2030 		 * caller to use (and even cache) the IRE. conn_ip_ouput will
2031 		 * use the generation mismatch path to check for the unreachable
2032 		 * case thereby avoiding any specific check in the main path.
2033 		 */
2034 		ASSERT(generation == IRE_GENERATION_VERIFY);
2035 		if (flags & IPDF_VERIFY_DST) {
2036 			/*
2037 			 * Set errno but continue to set up ixa_ire to be
2038 			 * the RTF_REJECT|RTF_BLACKHOLE IRE.
2039 			 * That allows callers to use ip_output to get an
2040 			 * ICMP error back.
2041 			 */
2042 			if (!(ire->ire_type & IRE_HOST))
2043 				error = ENETUNREACH;
2044 			else
2045 				error = EHOSTUNREACH;
2046 		}
2047 	}
2048 
2049 	if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) &&
2050 	    !(flags & IPDF_ALLOW_MCBC)) {
2051 		ire_refrele(ire);
2052 		ire = ire_reject(ipst, B_FALSE);
2053 		generation = IRE_GENERATION_VERIFY;
2054 		error = ENETUNREACH;
2055 	}
2056 
2057 	/* Cache things */
2058 	if (ixa->ixa_ire != NULL)
2059 		ire_refrele_notr(ixa->ixa_ire);
2060 #ifdef DEBUG
2061 	ire_refhold_notr(ire);
2062 	ire_refrele(ire);
2063 #endif
2064 	ixa->ixa_ire = ire;
2065 	ixa->ixa_ire_generation = generation;
2066 
2067 	/*
2068 	 * Ensure that ixa_dce is always set any time that ixa_ire is set,
2069 	 * since some callers will send a packet to conn_ip_output() even if
2070 	 * there's an error.
2071 	 */
2072 	ifindex = 0;
2073 	if (IN6_IS_ADDR_LINKSCOPE(dst_addr)) {
2074 		/* If we are creating a DCE we'd better have an ifindex */
2075 		if (ill != NULL)
2076 			ifindex = ill->ill_phyint->phyint_ifindex;
2077 		else
2078 			flags &= ~IPDF_UNIQUE_DCE;
2079 	}
2080 
2081 	if (flags & IPDF_UNIQUE_DCE) {
2082 		/* Fallback to the default dce if allocation fails */
2083 		dce = dce_lookup_and_add_v6(dst_addr, ifindex, ipst);
2084 		if (dce != NULL) {
2085 			generation = dce->dce_generation;
2086 		} else {
2087 			dce = dce_lookup_v6(dst_addr, ifindex, ipst,
2088 			    &generation);
2089 		}
2090 	} else {
2091 		dce = dce_lookup_v6(dst_addr, ifindex, ipst, &generation);
2092 	}
2093 	ASSERT(dce != NULL);
2094 	if (ixa->ixa_dce != NULL)
2095 		dce_refrele_notr(ixa->ixa_dce);
2096 #ifdef DEBUG
2097 	dce_refhold_notr(dce);
2098 	dce_refrele(dce);
2099 #endif
2100 	ixa->ixa_dce = dce;
2101 	ixa->ixa_dce_generation = generation;
2102 
2103 
2104 	/*
2105 	 * For multicast with multirt we have a flag passed back from
2106 	 * ire_lookup_multi_ill_v6 since we don't have an IRE for each
2107 	 * possible multicast address.
2108 	 * We also need a flag for multicast since we can't check
2109 	 * whether RTF_MULTIRT is set in ixa_ire for multicast.
2110 	 */
2111 	if (multirt) {
2112 		ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
2113 		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
2114 	} else {
2115 		ixa->ixa_postfragfn = ire->ire_postfragfn;
2116 		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
2117 	}
2118 	if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
2119 		/* Get an nce to cache. */
2120 		nce = ire_to_nce(ire, 0, firsthop);
2121 		if (nce == NULL) {
2122 			/* Allocation failure? */
2123 			ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
2124 		} else {
2125 			if (ixa->ixa_nce != NULL)
2126 				nce_refrele(ixa->ixa_nce);
2127 			ixa->ixa_nce = nce;
2128 		}
2129 	}
2130 
2131 	/*
2132 	 * If the source address is a loopback address, the
2133 	 * destination had best be local or multicast.
2134 	 * If we are sending to an IRE_LOCAL using a loopback source then
2135 	 * it had better be the same zoneid.
2136 	 */
2137 	if (IN6_IS_ADDR_LOOPBACK(src_addrp)) {
2138 		if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) {
2139 			ire = NULL;	/* Stored in ixa_ire */
2140 			error = EADDRNOTAVAIL;
2141 			goto bad_addr;
2142 		}
2143 		if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) {
2144 			ire = NULL;	/* Stored in ixa_ire */
2145 			error = EADDRNOTAVAIL;
2146 			goto bad_addr;
2147 		}
2148 	}
2149 
2150 	/*
2151 	 * Does the caller want us to pick a source address?
2152 	 */
2153 	if (flags & IPDF_SELECT_SRC) {
2154 		in6_addr_t	src_addr;
2155 
2156 		/*
2157 		 * We use use ire_nexthop_ill to avoid the under ipmp
2158 		 * interface for source address selection. Note that for ipmp
2159 		 * probe packets, ixa_ifindex would have been specified, and
2160 		 * the ip_select_route() invocation would have picked an ire
2161 		 * will ire_ill pointing at an under interface.
2162 		 */
2163 		ill = ire_nexthop_ill(ire);
2164 
2165 		/* If unreachable we have no ill but need some source */
2166 		if (ill == NULL) {
2167 			src_addr = ipv6_loopback;
2168 			/* Make sure we look for a better source address */
2169 			generation = SRC_GENERATION_VERIFY;
2170 		} else {
2171 			error = ip_select_source_v6(ill, &setsrc, dst_addr,
2172 			    zoneid, ipst, B_FALSE, ixa->ixa_src_preferences,
2173 			    &src_addr, &generation, NULL);
2174 			if (error != 0) {
2175 				ire = NULL;	/* Stored in ixa_ire */
2176 				goto bad_addr;
2177 			}
2178 		}
2179 
2180 		/*
2181 		 * We allow the source address to to down.
2182 		 * However, we check that we don't use the loopback address
2183 		 * as a source when sending out on the wire.
2184 		 */
2185 		if (IN6_IS_ADDR_LOOPBACK(&src_addr) &&
2186 		    !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) &&
2187 		    !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
2188 			ire = NULL;	/* Stored in ixa_ire */
2189 			error = EADDRNOTAVAIL;
2190 			goto bad_addr;
2191 		}
2192 
2193 		*src_addrp = src_addr;
2194 		ixa->ixa_src_generation = generation;
2195 	}
2196 
2197 	/*
2198 	 * Make sure we don't leave an unreachable ixa_nce in place
2199 	 * since ip_select_route is used when we unplumb i.e., remove
2200 	 * references on ixa_ire, ixa_nce, and ixa_dce.
2201 	 */
2202 	nce = ixa->ixa_nce;
2203 	if (nce != NULL && nce->nce_is_condemned) {
2204 		nce_refrele(nce);
2205 		ixa->ixa_nce = NULL;
2206 		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
2207 	}
2208 
2209 	/*
2210 	 * Note that IPv6 multicast supports PMTU discovery unlike IPv4
2211 	 * multicast. But pmtu discovery is only enabled for connected
2212 	 * sockets in general.
2213 	 */
2214 
2215 	/*
2216 	 * Set initial value for fragmentation limit.  Either conn_ip_output
2217 	 * or ULP might updates it when there are routing changes.
2218 	 * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT.
2219 	 */
2220 	pmtu = ip_get_pmtu(ixa);
2221 	ixa->ixa_fragsize = pmtu;
2222 	/* Make sure ixa_fragsize and ixa_pmtu remain identical */
2223 	if (ixa->ixa_flags & IXAF_VERIFY_PMTU)
2224 		ixa->ixa_pmtu = pmtu;
2225 
2226 	/*
2227 	 * Extract information useful for some transports.
2228 	 * First we look for DCE metrics. Then we take what we have in
2229 	 * the metrics in the route, where the offlink is used if we have
2230 	 * one.
2231 	 */
2232 	if (uinfo != NULL) {
2233 		bzero(uinfo, sizeof (*uinfo));
2234 
2235 		if (dce->dce_flags & DCEF_UINFO)
2236 			*uinfo = dce->dce_uinfo;
2237 
2238 		rts_merge_metrics(uinfo, &ire->ire_metrics);
2239 
2240 		/* Allow ire_metrics to decrease the path MTU from above */
2241 		if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu)
2242 			uinfo->iulp_mtu = pmtu;
2243 
2244 		uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0;
2245 		uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0;
2246 		uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0;
2247 	}
2248 
2249 	if (ill != NULL)
2250 		ill_refrele(ill);
2251 
2252 	return (error);
2253 
2254 bad_addr:
2255 	if (ire != NULL)
2256 		ire_refrele(ire);
2257 
2258 	if (ill != NULL)
2259 		ill_refrele(ill);
2260 
2261 	/*
2262 	 * Make sure we don't leave an unreachable ixa_nce in place
2263 	 * since ip_select_route is used when we unplumb i.e., remove
2264 	 * references on ixa_ire, ixa_nce, and ixa_dce.
2265 	 */
2266 	nce = ixa->ixa_nce;
2267 	if (nce != NULL && nce->nce_is_condemned) {
2268 		nce_refrele(nce);
2269 		ixa->ixa_nce = NULL;
2270 		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
2271 	}
2272 
2273 	return (error);
2274 }
2275 
2276 /*
2277  * Handle protocols with which IP is less intimate.  There
2278  * can be more than one stream bound to a particular
2279  * protocol.  When this is the case, normally each one gets a copy
2280  * of any incoming packets.
2281  *
2282  * Zones notes:
2283  * Packets will be distributed to conns in all zones. This is really only
2284  * useful for ICMPv6 as only applications in the global zone can create raw
2285  * sockets for other protocols.
2286  */
2287 void
2288 ip_fanout_proto_v6(mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira)
2289 {
2290 	mblk_t		*mp1;
2291 	in6_addr_t	laddr = ip6h->ip6_dst;
2292 	conn_t		*connp, *first_connp, *next_connp;
2293 	connf_t		*connfp;
2294 	ill_t		*ill = ira->ira_ill;
2295 	ip_stack_t	*ipst = ill->ill_ipst;
2296 
2297 	connfp = &ipst->ips_ipcl_proto_fanout_v6[ira->ira_protocol];
2298 	mutex_enter(&connfp->connf_lock);
2299 	connp = connfp->connf_head;
2300 	for (connp = connfp->connf_head; connp != NULL;
2301 	    connp = connp->conn_next) {
2302 		/* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */
2303 		if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) &&
2304 		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2305 		    tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp)))
2306 			break;
2307 	}
2308 
2309 	if (connp == NULL) {
2310 		/*
2311 		 * No one bound to this port.  Is
2312 		 * there a client that wants all
2313 		 * unclaimed datagrams?
2314 		 */
2315 		mutex_exit(&connfp->connf_lock);
2316 		ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB,
2317 		    ICMP6_PARAMPROB_NEXTHEADER, ira);
2318 		return;
2319 	}
2320 
2321 	ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
2322 
2323 	CONN_INC_REF(connp);
2324 	first_connp = connp;
2325 
2326 	/*
2327 	 * XXX: Fix the multiple protocol listeners case. We should not
2328 	 * be walking the conn->conn_next list here.
2329 	 */
2330 	connp = connp->conn_next;
2331 	for (;;) {
2332 		while (connp != NULL) {
2333 			/* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */
2334 			if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) &&
2335 			    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2336 			    tsol_receive_local(mp, &laddr, IPV6_VERSION,
2337 			    ira, connp)))
2338 				break;
2339 			connp = connp->conn_next;
2340 		}
2341 
2342 		if (connp == NULL) {
2343 			/* No more interested clients */
2344 			connp = first_connp;
2345 			break;
2346 		}
2347 		if (((mp1 = dupmsg(mp)) == NULL) &&
2348 		    ((mp1 = copymsg(mp)) == NULL)) {
2349 			/* Memory allocation failed */
2350 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2351 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
2352 			connp = first_connp;
2353 			break;
2354 		}
2355 
2356 		CONN_INC_REF(connp);
2357 		mutex_exit(&connfp->connf_lock);
2358 
2359 		ip_fanout_proto_conn(connp, mp1, NULL, (ip6_t *)mp1->b_rptr,
2360 		    ira);
2361 
2362 		mutex_enter(&connfp->connf_lock);
2363 		/* Follow the next pointer before releasing the conn. */
2364 		next_connp = connp->conn_next;
2365 		CONN_DEC_REF(connp);
2366 		connp = next_connp;
2367 	}
2368 
2369 	/* Last one.  Send it upstream. */
2370 	mutex_exit(&connfp->connf_lock);
2371 
2372 	ip_fanout_proto_conn(connp, mp, NULL, ip6h, ira);
2373 
2374 	CONN_DEC_REF(connp);
2375 }
2376 
2377 /*
2378  * Called when it is conceptually a ULP that would sent the packet
2379  * e.g., port unreachable and nexthdr unknown. Check that the packet
2380  * would have passed the IPsec global policy before sending the error.
2381  *
2382  * Send an ICMP error after patching up the packet appropriately.
2383  * Uses ip_drop_input and bumps the appropriate MIB.
2384  * For ICMP6_PARAMPROB_NEXTHEADER we determine the offset to use.
2385  */
2386 void
2387 ip_fanout_send_icmp_v6(mblk_t *mp, uint_t icmp_type, uint8_t icmp_code,
2388     ip_recv_attr_t *ira)
2389 {
2390 	ip6_t		*ip6h;
2391 	boolean_t	secure;
2392 	ill_t		*ill = ira->ira_ill;
2393 	ip_stack_t	*ipst = ill->ill_ipst;
2394 	netstack_t	*ns = ipst->ips_netstack;
2395 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2396 
2397 	secure = ira->ira_flags & IRAF_IPSEC_SECURE;
2398 
2399 	/*
2400 	 * We are generating an icmp error for some inbound packet.
2401 	 * Called from all ip_fanout_(udp, tcp, proto) functions.
2402 	 * Before we generate an error, check with global policy
2403 	 * to see whether this is allowed to enter the system. As
2404 	 * there is no "conn", we are checking with global policy.
2405 	 */
2406 	ip6h = (ip6_t *)mp->b_rptr;
2407 	if (secure || ipss->ipsec_inbound_v6_policy_present) {
2408 		mp = ipsec_check_global_policy(mp, NULL, NULL, ip6h, ira, ns);
2409 		if (mp == NULL)
2410 			return;
2411 	}
2412 
2413 	/* We never send errors for protocols that we do implement */
2414 	if (ira->ira_protocol == IPPROTO_ICMPV6) {
2415 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2416 		ip_drop_input("ip_fanout_send_icmp_v6", mp, ill);
2417 		freemsg(mp);
2418 		return;
2419 	}
2420 
2421 	switch (icmp_type) {
2422 	case ICMP6_DST_UNREACH:
2423 		ASSERT(icmp_code == ICMP6_DST_UNREACH_NOPORT);
2424 
2425 		BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
2426 		ip_drop_input("ipIfStatsNoPorts", mp, ill);
2427 
2428 		icmp_unreachable_v6(mp, icmp_code, B_FALSE, ira);
2429 		break;
2430 	case ICMP6_PARAM_PROB:
2431 		ASSERT(icmp_code == ICMP6_PARAMPROB_NEXTHEADER);
2432 
2433 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos);
2434 		ip_drop_input("ipIfStatsInUnknownProtos", mp, ill);
2435 
2436 		/* Let the system determine the offset for this one */
2437 		icmp_param_problem_nexthdr_v6(mp, B_FALSE, ira);
2438 		break;
2439 	default:
2440 #ifdef DEBUG
2441 		panic("ip_fanout_send_icmp_v6: wrong type");
2442 		/*NOTREACHED*/
2443 #else
2444 		freemsg(mp);
2445 		break;
2446 #endif
2447 	}
2448 }
2449 
2450 /*
2451  * Fanout for UDP packets that are multicast or ICMP errors.
2452  * (Unicast fanout is handled in ip_input_v6.)
2453  *
2454  * If SO_REUSEADDR is set all multicast packets
2455  * will be delivered to all conns bound to the same port.
2456  *
2457  * Fanout for UDP packets.
2458  * The caller puts <fport, lport> in the ports parameter.
2459  * ire_type must be IRE_BROADCAST for multicast and broadcast packets.
2460  *
2461  * If SO_REUSEADDR is set all multicast and broadcast packets
2462  * will be delivered to all conns bound to the same port.
2463  *
2464  * Zones notes:
2465  * Earlier in ip_input on a system with multiple shared-IP zones we
2466  * duplicate the multicast and broadcast packets and send them up
2467  * with each explicit zoneid that exists on that ill.
2468  * This means that here we can match the zoneid with SO_ALLZONES being special.
2469  */
2470 void
2471 ip_fanout_udp_multi_v6(mblk_t *mp, ip6_t *ip6h, uint16_t lport, uint16_t fport,
2472     ip_recv_attr_t *ira)
2473 {
2474 	in6_addr_t	laddr;
2475 	conn_t		*connp;
2476 	connf_t		*connfp;
2477 	in6_addr_t	faddr;
2478 	ill_t		*ill = ira->ira_ill;
2479 	ip_stack_t	*ipst = ill->ill_ipst;
2480 
2481 	ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR));
2482 
2483 	laddr = ip6h->ip6_dst;
2484 	faddr = ip6h->ip6_src;
2485 
2486 	/* Attempt to find a client stream based on destination port. */
2487 	connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
2488 	mutex_enter(&connfp->connf_lock);
2489 	connp = connfp->connf_head;
2490 	while (connp != NULL) {
2491 		if ((IPCL_UDP_MATCH_V6(connp, lport, laddr, fport, faddr)) &&
2492 		    conn_wantpacket_v6(connp, ira, ip6h) &&
2493 		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2494 		    tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp)))
2495 			break;
2496 		connp = connp->conn_next;
2497 	}
2498 
2499 	if (connp == NULL)
2500 		goto notfound;
2501 
2502 	CONN_INC_REF(connp);
2503 
2504 	if (connp->conn_reuseaddr) {
2505 		conn_t		*first_connp = connp;
2506 		conn_t		*next_connp;
2507 		mblk_t		*mp1;
2508 
2509 		connp = connp->conn_next;
2510 		for (;;) {
2511 			while (connp != NULL) {
2512 				if (IPCL_UDP_MATCH_V6(connp, lport, laddr,
2513 				    fport, faddr) &&
2514 				    conn_wantpacket_v6(connp, ira, ip6h) &&
2515 				    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2516 				    tsol_receive_local(mp, &laddr, IPV6_VERSION,
2517 				    ira, connp)))
2518 					break;
2519 				connp = connp->conn_next;
2520 			}
2521 			if (connp == NULL) {
2522 				/* No more interested clients */
2523 				connp = first_connp;
2524 				break;
2525 			}
2526 			if (((mp1 = dupmsg(mp)) == NULL) &&
2527 			    ((mp1 = copymsg(mp)) == NULL)) {
2528 				/* Memory allocation failed */
2529 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2530 				ip_drop_input("ipIfStatsInDiscards", mp, ill);
2531 				connp = first_connp;
2532 				break;
2533 			}
2534 
2535 			CONN_INC_REF(connp);
2536 			mutex_exit(&connfp->connf_lock);
2537 
2538 			IP6_STAT(ipst, ip6_udp_fanmb);
2539 			ip_fanout_udp_conn(connp, mp1, NULL,
2540 			    (ip6_t *)mp1->b_rptr, ira);
2541 
2542 			mutex_enter(&connfp->connf_lock);
2543 			/* Follow the next pointer before releasing the conn. */
2544 			next_connp = connp->conn_next;
2545 			IP6_STAT(ipst, ip6_udp_fanmb);
2546 			CONN_DEC_REF(connp);
2547 			connp = next_connp;
2548 		}
2549 	}
2550 
2551 	/* Last one.  Send it upstream. */
2552 	mutex_exit(&connfp->connf_lock);
2553 
2554 	IP6_STAT(ipst, ip6_udp_fanmb);
2555 	ip_fanout_udp_conn(connp, mp, NULL, ip6h, ira);
2556 	CONN_DEC_REF(connp);
2557 	return;
2558 
2559 notfound:
2560 	mutex_exit(&connfp->connf_lock);
2561 	/*
2562 	 * No one bound to this port.  Is
2563 	 * there a client that wants all
2564 	 * unclaimed datagrams?
2565 	 */
2566 	if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_UDP].connf_head != NULL) {
2567 		ASSERT(ira->ira_protocol == IPPROTO_UDP);
2568 		ip_fanout_proto_v6(mp, ip6h, ira);
2569 	} else {
2570 		ip_fanout_send_icmp_v6(mp, ICMP6_DST_UNREACH,
2571 		    ICMP6_DST_UNREACH_NOPORT, ira);
2572 	}
2573 }
2574 
2575 /*
2576  * int ip_find_hdr_v6()
2577  *
2578  * This routine is used by the upper layer protocols, iptun, and IPsec:
2579  * - Set extension header pointers to appropriate locations
2580  * - Determine IPv6 header length and return it
2581  * - Return a pointer to the last nexthdr value
2582  *
2583  * The caller must initialize ipp_fields.
2584  * The upper layer protocols normally set label_separate which makes the
2585  * routine put the TX label in ipp_label_v6. If this is not set then
2586  * the hop-by-hop options including the label are placed in ipp_hopopts.
2587  *
2588  * NOTE: If multiple extension headers of the same type are present,
2589  * ip_find_hdr_v6() will set the respective extension header pointers
2590  * to the first one that it encounters in the IPv6 header.  It also
2591  * skips fragment headers.  This routine deals with malformed packets
2592  * of various sorts in which case the returned length is up to the
2593  * malformed part.
2594  */
2595 int
2596 ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, boolean_t label_separate, ip_pkt_t *ipp,
2597     uint8_t *nexthdrp)
2598 {
2599 	uint_t	length, ehdrlen;
2600 	uint8_t nexthdr;
2601 	uint8_t *whereptr, *endptr;
2602 	ip6_dest_t *tmpdstopts;
2603 	ip6_rthdr_t *tmprthdr;
2604 	ip6_hbh_t *tmphopopts;
2605 	ip6_frag_t *tmpfraghdr;
2606 
2607 	ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR;
2608 	ipp->ipp_hoplimit = ip6h->ip6_hops;
2609 	ipp->ipp_tclass = IPV6_FLOW_TCLASS(ip6h->ip6_flow);
2610 	ipp->ipp_addr = ip6h->ip6_dst;
2611 
2612 	length = IPV6_HDR_LEN;
2613 	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
2614 	endptr = mp->b_wptr;
2615 
2616 	nexthdr = ip6h->ip6_nxt;
2617 	while (whereptr < endptr) {
2618 		/* Is there enough left for len + nexthdr? */
2619 		if (whereptr + MIN_EHDR_LEN > endptr)
2620 			goto done;
2621 
2622 		switch (nexthdr) {
2623 		case IPPROTO_HOPOPTS: {
2624 			/* We check for any CIPSO */
2625 			uchar_t *secopt;
2626 			boolean_t hbh_needed;
2627 			uchar_t *after_secopt;
2628 
2629 			tmphopopts = (ip6_hbh_t *)whereptr;
2630 			ehdrlen = 8 * (tmphopopts->ip6h_len + 1);
2631 			if ((uchar_t *)tmphopopts +  ehdrlen > endptr)
2632 				goto done;
2633 			nexthdr = tmphopopts->ip6h_nxt;
2634 
2635 			if (!label_separate) {
2636 				secopt = NULL;
2637 				after_secopt = whereptr;
2638 			} else {
2639 				/*
2640 				 * We have dropped packets with bad options in
2641 				 * ip6_input. No need to check return value
2642 				 * here.
2643 				 */
2644 				(void) tsol_find_secopt_v6(whereptr, ehdrlen,
2645 				    &secopt, &after_secopt, &hbh_needed);
2646 			}
2647 			if (secopt != NULL && after_secopt - whereptr > 0) {
2648 				ipp->ipp_fields |= IPPF_LABEL_V6;
2649 				ipp->ipp_label_v6 = secopt;
2650 				ipp->ipp_label_len_v6 = after_secopt - whereptr;
2651 			} else {
2652 				ipp->ipp_label_len_v6 = 0;
2653 				after_secopt = whereptr;
2654 				hbh_needed = B_TRUE;
2655 			}
2656 			/* return only 1st hbh */
2657 			if (hbh_needed && !(ipp->ipp_fields & IPPF_HOPOPTS)) {
2658 				ipp->ipp_fields |= IPPF_HOPOPTS;
2659 				ipp->ipp_hopopts = (ip6_hbh_t *)after_secopt;
2660 				ipp->ipp_hopoptslen = ehdrlen -
2661 				    ipp->ipp_label_len_v6;
2662 			}
2663 			break;
2664 		}
2665 		case IPPROTO_DSTOPTS:
2666 			tmpdstopts = (ip6_dest_t *)whereptr;
2667 			ehdrlen = 8 * (tmpdstopts->ip6d_len + 1);
2668 			if ((uchar_t *)tmpdstopts +  ehdrlen > endptr)
2669 				goto done;
2670 			nexthdr = tmpdstopts->ip6d_nxt;
2671 			/*
2672 			 * ipp_dstopts is set to the destination header after a
2673 			 * routing header.
2674 			 * Assume it is a post-rthdr destination header
2675 			 * and adjust when we find an rthdr.
2676 			 */
2677 			if (!(ipp->ipp_fields & IPPF_DSTOPTS)) {
2678 				ipp->ipp_fields |= IPPF_DSTOPTS;
2679 				ipp->ipp_dstopts = tmpdstopts;
2680 				ipp->ipp_dstoptslen = ehdrlen;
2681 			}
2682 			break;
2683 		case IPPROTO_ROUTING:
2684 			tmprthdr = (ip6_rthdr_t *)whereptr;
2685 			ehdrlen = 8 * (tmprthdr->ip6r_len + 1);
2686 			if ((uchar_t *)tmprthdr +  ehdrlen > endptr)
2687 				goto done;
2688 			nexthdr = tmprthdr->ip6r_nxt;
2689 			/* return only 1st rthdr */
2690 			if (!(ipp->ipp_fields & IPPF_RTHDR)) {
2691 				ipp->ipp_fields |= IPPF_RTHDR;
2692 				ipp->ipp_rthdr = tmprthdr;
2693 				ipp->ipp_rthdrlen = ehdrlen;
2694 			}
2695 			/*
2696 			 * Make any destination header we've seen be a
2697 			 * pre-rthdr destination header.
2698 			 */
2699 			if (ipp->ipp_fields & IPPF_DSTOPTS) {
2700 				ipp->ipp_fields &= ~IPPF_DSTOPTS;
2701 				ipp->ipp_fields |= IPPF_RTHDRDSTOPTS;
2702 				ipp->ipp_rthdrdstopts = ipp->ipp_dstopts;
2703 				ipp->ipp_dstopts = NULL;
2704 				ipp->ipp_rthdrdstoptslen = ipp->ipp_dstoptslen;
2705 				ipp->ipp_dstoptslen = 0;
2706 			}
2707 			break;
2708 		case IPPROTO_FRAGMENT:
2709 			tmpfraghdr = (ip6_frag_t *)whereptr;
2710 			ehdrlen = sizeof (ip6_frag_t);
2711 			if ((uchar_t *)tmpfraghdr + ehdrlen > endptr)
2712 				goto done;
2713 			nexthdr = tmpfraghdr->ip6f_nxt;
2714 			if (!(ipp->ipp_fields & IPPF_FRAGHDR)) {
2715 				ipp->ipp_fields |= IPPF_FRAGHDR;
2716 				ipp->ipp_fraghdr = tmpfraghdr;
2717 				ipp->ipp_fraghdrlen = ehdrlen;
2718 			}
2719 			break;
2720 		case IPPROTO_NONE:
2721 		default:
2722 			goto done;
2723 		}
2724 		length += ehdrlen;
2725 		whereptr += ehdrlen;
2726 	}
2727 done:
2728 	if (nexthdrp != NULL)
2729 		*nexthdrp = nexthdr;
2730 	return (length);
2731 }
2732 
2733 /*
2734  * Return the length of the IPv6 related headers (including extension headers)
2735  * If the packet is malformed, this returns the simple IPv6 header length.
2736  */
2737 uint16_t
2738 ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h)
2739 {
2740 	uint16_t hdr_len;
2741 
2742 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_len, NULL))
2743 		hdr_len = sizeof (*ip6h);
2744 	return (hdr_len);
2745 }
2746 
2747 /*
2748  * Parse and process any hop-by-hop or destination options.
2749  *
      * optptr/optlen describe the option area to walk and hdr_type says
      * which extension header it came from (IPPROTO_HOPOPTS or
      * IPPROTO_DSTOPTS); some options are only accepted in one of them.
      * ip6h is used to compute the offset reported in ICMP errors and is
      * modified in place by the home address option.
      *
2750  * Assumes that q is an ill read queue so that ICMP errors for link-local
2751  * destinations are sent out the correct interface.
      * NOTE(review): there is no queue parameter here; the ill is taken
      * from ira->ira_ill.
2752  *
2753  * Returns -1 if there was an error and mp has been consumed.
2754  * Returns 0 if no special action is needed.
2755  * Returns 1 if the packet contained a router alert option for this node
2756  * which is verified to be "interesting/known" for our implementation.
2757  *
2758  * XXX Note: In future as more hbh or dest options are defined,
2759  * it may be better to have different routines for hbh and dest
2760  * options as opt_type fields other than IP6OPT_PAD1 and IP6OPT_PADN
2761  * may have same value in different namespaces. Or is it same namespace ??
2762  * Current code checks for each opt_type (other than pads) if it is in
2763  * the expected  nexthdr (hbh or dest)
2764  */
2765 int
2766 ip_process_options_v6(mblk_t *mp, ip6_t *ip6h,
2767     uint8_t *optptr, uint_t optlen, uint8_t hdr_type, ip_recv_attr_t *ira)
2768 {
2769 	uint8_t opt_type;
2770 	uint_t optused = 0;	/* bytes consumed by the current option */
2771 	int ret = 0;
2772 	const char *errtype;	/* "malformed" or "unknown", for ip1dbg */
2773 	ill_t		*ill = ira->ira_ill;
2774 	ip_stack_t	*ipst = ill->ill_ipst;
2775 
2776 	while (optlen != 0) {
2777 		opt_type = *optptr;
2778 		if (opt_type == IP6OPT_PAD1) {
     			/* Pad1 is a single byte with no length field. */
2779 			optused = 1;
2780 		} else {
     			/* Every other option has a type and a length byte. */
2781 			if (optlen < 2)
2782 				goto bad_opt;
2783 			errtype = "malformed";
     			/*
     			 * ip6opt_ls is compared as a variable rather than a
     			 * case label; presumably the label security option
     			 * type, defined elsewhere -- TODO confirm.  It is
     			 * only length-checked and skipped here.
     			 */
2784 			if (opt_type == ip6opt_ls) {
2785 				optused = 2 + optptr[1];
2786 				if (optused > optlen)
2787 					goto bad_opt;
2788 			} else switch (opt_type) {
2789 			case IP6OPT_PADN:
2790 				/*
2791 				 * Note:We don't verify that (N-2) pad octets
2792 				 * are zero as required by spec. Adhere to
2793 				 * "be liberal in what you accept..." part of
2794 				 * implementation philosophy (RFC791,RFC1122)
2795 				 */
2796 				optused = 2 + optptr[1];
2797 				if (optused > optlen)
2798 					goto bad_opt;
2799 				break;
2800 
2801 			case IP6OPT_JUMBO:
     				/* Both paths reject: jumbograms are unsupported. */
2802 				if (hdr_type != IPPROTO_HOPOPTS)
2803 					goto opt_error;
2804 				goto opt_error; /* XXX Not implemented! */
2805 
2806 			case IP6OPT_ROUTER_ALERT: {
2807 				struct ip6_opt_router *or;
2808 
2809 				if (hdr_type != IPPROTO_HOPOPTS)
2810 					goto opt_error;
2811 				optused = 2 + optptr[1];
2812 				if (optused > optlen)
2813 					goto bad_opt;
2814 				or = (struct ip6_opt_router *)optptr;
2815 				/* Check total length and alignment */
2816 				if (optused != sizeof (*or) ||
2817 				    ((uintptr_t)or->ip6or_value & 0x1) != 0)
2818 					goto opt_error;
2819 				/* Check value */
     				/* Other alert values are accepted but ignored. */
2820 				switch (*((uint16_t *)or->ip6or_value)) {
2821 				case IP6_ALERT_MLD:
2822 				case IP6_ALERT_RSVP:
2823 					ret = 1;
2824 				}
2825 				break;
2826 			}
2827 			case IP6OPT_HOME_ADDRESS: {
2828 				/*
2829 				 * Minimal support for the home address option
2830 				 * (which is required by all IPv6 nodes).
2831 				 * Implement by just swapping the home address
2832 				 * and source address.
2833 				 * XXX Note: this has IPsec implications since
2834 				 * AH needs to take this into account.
2835 				 * Also, when IPsec is used we need to ensure
2836 				 * that this is only processed once
2837 				 * in the received packet (to avoid swapping
2838 				 * back and forth).
2839 				 * NOTE:This option processing is considered
2840 				 * to be unsafe and prone to a denial of
2841 				 * service attack.
2842 				 * The current processing is not safe even with
2843 				 * IPsec secured IP packets. Since the home
2844 				 * address option processing requirement still
2845 				 * is in the IETF draft and in the process of
2846 				 * being redefined for its usage, it has been
2847 				 * decided to turn off the option by default.
2848 				 * If this section of code needs to be executed,
2849 				 * ndd variable ip6_ignore_home_address_opt
2850 				 * should be set to 0 at the user's own risk.
2851 				 */
2852 				struct ip6_opt_home_address *oh;
2853 				in6_addr_t tmp;
2854 
2855 				if (ipst->ips_ipv6_ignore_home_address_opt)
2856 					goto opt_error;
2857 
2858 				if (hdr_type != IPPROTO_DSTOPTS)
2859 					goto opt_error;
2860 				optused = 2 + optptr[1];
2861 				if (optused > optlen)
2862 					goto bad_opt;
2863 
2864 				/*
2865 				 * We did this dest. opt the first time
2866 				 * around (i.e. before AH processing).
2867 				 * If we've done AH... stop now.
2868 				 */
2869 				if ((ira->ira_flags & IRAF_IPSEC_SECURE) &&
2870 				    ira->ira_ipsec_ah_sa != NULL)
2871 					break;
2872 
2873 				oh = (struct ip6_opt_home_address *)optptr;
2874 				/* Check total length and alignment */
2875 				if (optused < sizeof (*oh) ||
2876 				    ((uintptr_t)oh->ip6oh_addr & 0x7) != 0)
2877 					goto opt_error;
2878 				/* Swap ip6_src and the home address */
2879 				tmp = ip6h->ip6_src;
2880 				/* XXX Note: only 8 byte alignment option */
2881 				ip6h->ip6_src = *(in6_addr_t *)oh->ip6oh_addr;
2882 				*(in6_addr_t *)oh->ip6oh_addr = tmp;
2883 				break;
2884 			}
2885 
2886 			case IP6OPT_TUNNEL_LIMIT:
     				/* Only validated here: dest opts, total 3 bytes. */
2887 				if (hdr_type != IPPROTO_DSTOPTS) {
2888 					goto opt_error;
2889 				}
2890 				optused = 2 + optptr[1];
2891 				if (optused > optlen) {
2892 					goto bad_opt;
2893 				}
2894 				if (optused != 3) {
2895 					goto opt_error;
2896 				}
2897 				break;
2898 
2899 			default:
2900 				errtype = "unknown";
2901 				/* FALLTHROUGH */
     			/*
     			 * Reached for unknown options and, via goto, for
     			 * known options that failed validation.  The action
     			 * is selected by IP6OPT_TYPE(opt_type).
     			 */
2902 			opt_error:
2903 				/* Determine which zone should send error */
2904 				switch (IP6OPT_TYPE(opt_type)) {
2905 				case IP6OPT_TYPE_SKIP:
     					/* Step over the option and carry on. */
2906 					optused = 2 + optptr[1];
2907 					if (optused > optlen)
2908 						goto bad_opt;
2909 					ip1dbg(("ip_process_options_v6: %s "
2910 					    "opt 0x%x skipped\n",
2911 					    errtype, opt_type));
2912 					break;
2913 				case IP6OPT_TYPE_DISCARD:
     					/* Drop silently; no ICMP error. */
2914 					ip1dbg(("ip_process_options_v6: %s "
2915 					    "opt 0x%x; packet dropped\n",
2916 					    errtype, opt_type));
2917 					BUMP_MIB(ill->ill_ip_mib,
2918 					    ipIfStatsInHdrErrors);
2919 					ip_drop_input("ipIfStatsInHdrErrors",
2920 					    mp, ill);
2921 					freemsg(mp);
2922 					return (-1);
2923 				case IP6OPT_TYPE_ICMP:
     					/*
     					 * Parameter problem pointing at the
     					 * option's offset from ip6h.
     					 */
2924 					BUMP_MIB(ill->ill_ip_mib,
2925 					    ipIfStatsInHdrErrors);
2926 					ip_drop_input("ipIfStatsInHdrErrors",
2927 					    mp, ill);
2928 					icmp_param_problem_v6(mp,
2929 					    ICMP6_PARAMPROB_OPTION,
2930 					    (uint32_t)(optptr -
2931 					    (uint8_t *)ip6h),
2932 					    B_FALSE, ira);
2933 					return (-1);
2934 				case IP6OPT_TYPE_FORCEICMP:
     					/* As above, but passes B_TRUE. */
2935 					BUMP_MIB(ill->ill_ip_mib,
2936 					    ipIfStatsInHdrErrors);
2937 					ip_drop_input("ipIfStatsInHdrErrors",
2938 					    mp, ill);
2939 					icmp_param_problem_v6(mp,
2940 					    ICMP6_PARAMPROB_OPTION,
2941 					    (uint32_t)(optptr -
2942 					    (uint8_t *)ip6h),
2943 					    B_TRUE, ira);
2944 					return (-1);
2945 				default:
2946 					ASSERT(0);
2947 				}
2948 			}
2949 		}
     		/* Advance past the option just handled. */
2950 		optlen -= optused;
2951 		optptr += optused;
2952 	}
2953 	return (ret);
2954 
     	/*
     	 * An option's length runs past the end of the option area (or there
     	 * is no room for a length byte): send a parameter problem.
     	 */
2955 bad_opt:
2956 	/* Determine which zone should send error */
2957 	ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
2958 	icmp_param_problem_v6(mp, ICMP6_PARAMPROB_OPTION,
2959 	    (uint32_t)(optptr - (uint8_t *)ip6h),
2960 	    B_FALSE, ira);
2961 	return (-1);
2962 }
2963 
2964 /*
2965  * Process a routing header that is not yet empty.
2966  * Because of RFC 5095, we now reject all route headers.
2967  */
2968 void
2969 ip_process_rthdr(mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth,
2970     ip_recv_attr_t *ira)
2971 {
2972 	ill_t		*ill = ira->ira_ill;
2973 	ip_stack_t	*ipst = ill->ill_ipst;
2974 
2975 	ASSERT(rth->ip6r_segleft != 0);
2976 
2977 	if (!ipst->ips_ipv6_forward_src_routed) {
2978 		/* XXX Check for source routed out same interface? */
2979 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
2980 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
2981 		ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
2982 		freemsg(mp);
2983 		return;
2984 	}
2985 
2986 	ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
2987 	icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER,
2988 	    (uint32_t)((uchar_t *)&rth->ip6r_type - (uchar_t *)ip6h),
2989 	    B_FALSE, ira);
2990 }
2991 
2992 /*
2993  * Read side put procedure for IPv6 module.
2994  */
2995 int
2996 ip_rput_v6(queue_t *q, mblk_t *mp)
2997 {
2998 	ill_t		*ill;
2999 
3000 	ill = (ill_t *)q->q_ptr;
3001 	if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) {
3002 		union DL_primitives *dl;
3003 
3004 		dl = (union DL_primitives *)mp->b_rptr;
3005 		/*
3006 		 * Things are opening or closing - only accept DLPI
3007 		 * ack messages. If the stream is closing and ip_wsrv
3008 		 * has completed, ip_close is out of the qwait, but has
3009 		 * not yet completed qprocsoff. Don't proceed any further
3010 		 * because the ill has been cleaned up and things hanging
3011 		 * off the ill have been freed.
3012 		 */
3013 		if ((mp->b_datap->db_type != M_PCPROTO) ||
3014 		    (dl->dl_primitive == DL_UNITDATA_IND)) {
3015 			inet_freemsg(mp);
3016 			return (0);
3017 		}
3018 	}
3019 	if (DB_TYPE(mp) == M_DATA) {
3020 		struct mac_header_info_s mhi;
3021 
3022 		ip_mdata_to_mhi(ill, mp, &mhi);
3023 		ip_input_v6(ill, NULL, mp, &mhi);
3024 	} else {
3025 		ip_rput_notdata(ill, mp);
3026 	}
3027 	return (0);
3028 }
3029 
3030 /*
3031  * Walk through the IPv6 packet in mp and see if there's an AH header
3032  * in it.  See if the AH header needs to get done before other headers in
3033  * the packet.  (Worker function for ipsec_early_ah_v6().)
3034  */
3035 #define	IPSEC_HDR_DONT_PROCESS	0
3036 #define	IPSEC_HDR_PROCESS	1
3037 #define	IPSEC_MEMORY_ERROR	2 /* or malformed packet */
3038 static int
3039 ipsec_needs_processing_v6(mblk_t *mp, uint8_t *nexthdr)
3040 {
3041 	uint_t	length;
3042 	uint_t	ehdrlen;
3043 	uint8_t *whereptr;
3044 	uint8_t *endptr;
3045 	uint8_t *nexthdrp;
3046 	ip6_dest_t *desthdr;
3047 	ip6_rthdr_t *rthdr;
3048 	ip6_t	*ip6h;
3049 
3050 	/*
3051 	 * For now just pullup everything.  In general, the less pullups,
3052 	 * the better, but there's so much squirrelling through anyway,
3053 	 * it's just easier this way.
3054 	 */
3055 	if (!pullupmsg(mp, -1)) {
3056 		return (IPSEC_MEMORY_ERROR);
3057 	}
3058 
3059 	ip6h = (ip6_t *)mp->b_rptr;
3060 	length = IPV6_HDR_LEN;
3061 	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
3062 	endptr = mp->b_wptr;
3063 
3064 	/*
3065 	 * We can't just use the argument nexthdr in the place
3066 	 * of nexthdrp becaue we don't dereference nexthdrp
3067 	 * till we confirm whether it is a valid address.
3068 	 */
3069 	nexthdrp = &ip6h->ip6_nxt;
3070 	while (whereptr < endptr) {
3071 		/* Is there enough left for len + nexthdr? */
3072 		if (whereptr + MIN_EHDR_LEN > endptr)
3073 			return (IPSEC_MEMORY_ERROR);
3074 
3075 		switch (*nexthdrp) {
3076 		case IPPROTO_HOPOPTS:
3077 		case IPPROTO_DSTOPTS:
3078 			/* Assumes the headers are identical for hbh and dst */
3079 			desthdr = (ip6_dest_t *)whereptr;
3080 			ehdrlen = 8 * (desthdr->ip6d_len + 1);
3081 			if ((uchar_t *)desthdr +  ehdrlen > endptr)
3082 				return (IPSEC_MEMORY_ERROR);
3083 			/*
3084 			 * Return DONT_PROCESS because the destination
3085 			 * options header may be for each hop in a
3086 			 * routing-header, and we only want AH if we're
3087 			 * finished with routing headers.
3088 			 */
3089 			if (*nexthdrp == IPPROTO_DSTOPTS)
3090 				return (IPSEC_HDR_DONT_PROCESS);
3091 			nexthdrp = &desthdr->ip6d_nxt;
3092 			break;
3093 		case IPPROTO_ROUTING:
3094 			rthdr = (ip6_rthdr_t *)whereptr;
3095 
3096 			/*
3097 			 * If there's more hops left on the routing header,
3098 			 * return now with DON'T PROCESS.
3099 			 */
3100 			if (rthdr->ip6r_segleft > 0)
3101 				return (IPSEC_HDR_DONT_PROCESS);
3102 
3103 			ehdrlen =  8 * (rthdr->ip6r_len + 1);
3104 			if ((uchar_t *)rthdr +  ehdrlen > endptr)
3105 				return (IPSEC_MEMORY_ERROR);
3106 			nexthdrp = &rthdr->ip6r_nxt;
3107 			break;
3108 		case IPPROTO_FRAGMENT:
3109 			/* Wait for reassembly */
3110 			return (IPSEC_HDR_DONT_PROCESS);
3111 		case IPPROTO_AH:
3112 			*nexthdr = IPPROTO_AH;
3113 			return (IPSEC_HDR_PROCESS);
3114 		case IPPROTO_NONE:
3115 			/* No next header means we're finished */
3116 		default:
3117 			return (IPSEC_HDR_DONT_PROCESS);
3118 		}
3119 		length += ehdrlen;
3120 		whereptr += ehdrlen;
3121 	}
3122 	/*
3123 	 * Malformed/truncated packet.
3124 	 */
3125 	return (IPSEC_MEMORY_ERROR);
3126 }
3127 
3128 /*
3129  * Path for AH if options are present.
3130  * Returns NULL if the mblk was consumed.
3131  *
3132  * Sometimes AH needs to be done before other IPv6 headers for security
3133  * reasons.  This function (and its ipsec_needs_processing_v6() above)
3134  * indicates if that is so, and fans out to the appropriate IPsec protocol
3135  * for the datagram passed in.
3136  */
3137 mblk_t *
3138 ipsec_early_ah_v6(mblk_t *mp, ip_recv_attr_t *ira)
3139 {
3140 	uint8_t nexthdr;
3141 	ah_t *ah;
3142 	ill_t		*ill = ira->ira_ill;
3143 	ip_stack_t	*ipst = ill->ill_ipst;
3144 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
3145 
3146 	switch (ipsec_needs_processing_v6(mp, &nexthdr)) {
3147 	case IPSEC_MEMORY_ERROR:
3148 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3149 		ip_drop_input("ipIfStatsInDiscards", mp, ill);
3150 		freemsg(mp);
3151 		return (NULL);
3152 	case IPSEC_HDR_DONT_PROCESS:
3153 		return (mp);
3154 	}
3155 
3156 	/* Default means send it to AH! */
3157 	ASSERT(nexthdr == IPPROTO_AH);
3158 
3159 	if (!ipsec_loaded(ipss)) {
3160 		ip_proto_not_sup(mp, ira);
3161 		return (NULL);
3162 	}
3163 
3164 	mp = ipsec_inbound_ah_sa(mp, ira, &ah);
3165 	if (mp == NULL)
3166 		return (NULL);
3167 	ASSERT(ah != NULL);
3168 	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
3169 	ASSERT(ira->ira_ipsec_ah_sa != NULL);
3170 	ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL);
3171 	mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, ira);
3172 
3173 	if (mp == NULL) {
3174 		/*
3175 		 * Either it failed or is pending. In the former case
3176 		 * ipIfStatsInDiscards was increased.
3177 		 */
3178 		return (NULL);
3179 	}
3180 
3181 	/* we're done with IPsec processing, send it up */
3182 	ip_input_post_ipsec(mp, ira);
3183 	return (NULL);
3184 }
3185 
3186 /*
3187  * Reassemble fragment.
3188  * When it returns a completed message the first mblk will only contain
3189  * the headers prior to the fragment header, with the nexthdr value updated
3190  * to be the header after the fragment header.
 *
 * mp is the received fragment, ip6h its IPv6 header and fraghdr its
 * fragment header.  remlen is added to the fragment offset to compute where
 * this fragment ends (presumably the number of bytes following the fragment
 * header -- confirm against the caller).
 *
 * Returns NULL when mp was consumed, i.e. it was queued for later
 * reassembly, dropped, or handed to icmp_param_problem_v6().  Otherwise
 * returns the complete packet with the fragment header removed; in that
 * case ira_pktlen, ira_ip_hdr_length and ira_protocol in ira have been
 * updated and the checksum fields of the first mblk have been set.
3191  */
3192 mblk_t *
3193 ip_input_fragment_v6(mblk_t *mp, ip6_t *ip6h,
3194     ip6_frag_t *fraghdr, uint_t remlen, ip_recv_attr_t *ira)
3195 {
3196 	uint32_t	ident = ntohl(fraghdr->ip6f_ident);
3197 	uint16_t	offset;
3198 	boolean_t	more_frags;
3199 	uint8_t		nexthdr = fraghdr->ip6f_nxt;
3200 	in6_addr_t	*v6dst_ptr;
3201 	in6_addr_t	*v6src_ptr;
3202 	uint_t		end;
3203 	uint_t		hdr_length;
3204 	size_t		count;
3205 	ipf_t		*ipf;
3206 	ipf_t		**ipfp;
3207 	ipfb_t		*ipfb;
3208 	mblk_t		*mp1;
3209 	uint8_t		ecn_info = 0;
3210 	size_t		msg_len;
3211 	mblk_t		*tail_mp;
3212 	mblk_t		*t_mp;
3213 	boolean_t	pruned = B_FALSE;
3214 	uint32_t	sum_val;
3215 	uint16_t	sum_flags;
3216 	ill_t		*ill = ira->ira_ill;
3217 	ip_stack_t	*ipst = ill->ill_ipst;
3218 	uint_t		prev_nexthdr_offset;
3219 	uint8_t		prev_nexthdr;
3220 	uint8_t		*ptr;
3221 	uint32_t	packet_size;
3222 
3223 	/*
3224 	 * We utilize hardware computed checksum info only for UDP since
3225 	 * IP fragmentation is a normal occurrence for the protocol.  In
3226 	 * addition, checksum offload support for IP fragments carrying
3227 	 * UDP payload is commonly implemented across network adapters.
3228 	 */
3229 	ASSERT(ira->ira_rill != NULL);
3230 	if (nexthdr == IPPROTO_UDP && dohwcksum &&
3231 	    ILL_HCKSUM_CAPABLE(ira->ira_rill) &&
3232 	    (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
		/* Note: this mp1 shadows the function-scope mp1. */
3233 		mblk_t *mp1 = mp->b_cont;
3234 		int32_t len;
3235 
3236 		/* Record checksum information from the packet */
3237 		sum_val = (uint32_t)DB_CKSUM16(mp);
3238 		sum_flags = DB_CKSUMFLAGS(mp);
3239 
3240 		/* fragmented payload offset from beginning of mblk */
3241 		offset = (uint16_t)((uchar_t *)&fraghdr[1] - mp->b_rptr);
3242 
3243 		if ((sum_flags & HCK_PARTIALCKSUM) &&
3244 		    (mp1 == NULL || mp1->b_cont == NULL) &&
3245 		    offset >= DB_CKSUMSTART(mp) &&
3246 		    ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
3247 			uint32_t adj;
3248 			/*
3249 			 * Partial checksum has been calculated by hardware
3250 			 * and attached to the packet; in addition, any
3251 			 * prepended extraneous data is even byte aligned.
3252 			 * If any such data exists, we adjust the checksum;
3253 			 * this would also handle any postpended data.
3254 			 */
3255 			IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
3256 			    mp, mp1, len, adj);
3257 
3258 			/* One's complement subtract extraneous checksum */
3259 			if (adj >= sum_val)
3260 				sum_val = ~(adj - sum_val) & 0xFFFF;
3261 			else
3262 				sum_val -= adj;
3263 		}
3264 	} else {
3265 		sum_val = 0;
3266 		sum_flags = 0;
3267 	}
3268 
3269 	/* Clear hardware checksumming flag */
3270 	DB_CKSUMFLAGS(mp) = 0;
3271 
3272 	/*
3273 	 * Determine the offset (from the beginning of the IP header)
3274 	 * of the nexthdr value which has IPPROTO_FRAGMENT. We use
3275 	 * this when removing the fragment header from the packet.
3276 	 * This packet consists of the IPv6 header, a potential
3277 	 * hop-by-hop options header, a potential pre-routing-header
3278 	 * destination options header, and a potential routing header.
3279 	 */
3280 	prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h;
3281 	prev_nexthdr = ip6h->ip6_nxt;
3282 	ptr = (uint8_t *)&ip6h[1];
3283 
3284 	if (prev_nexthdr == IPPROTO_HOPOPTS) {
3285 		ip6_hbh_t	*hbh_hdr;
3286 		uint_t		hdr_len;
3287 
3288 		hbh_hdr = (ip6_hbh_t *)ptr;
3289 		hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
3290 		prev_nexthdr = hbh_hdr->ip6h_nxt;
3291 		prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt
3292 		    - (uint8_t *)ip6h;
3293 		ptr += hdr_len;
3294 	}
3295 	if (prev_nexthdr == IPPROTO_DSTOPTS) {
3296 		ip6_dest_t	*dest_hdr;
3297 		uint_t		hdr_len;
3298 
3299 		dest_hdr = (ip6_dest_t *)ptr;
3300 		hdr_len = 8 * (dest_hdr->ip6d_len + 1);
3301 		prev_nexthdr = dest_hdr->ip6d_nxt;
3302 		prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt
3303 		    - (uint8_t *)ip6h;
3304 		ptr += hdr_len;
3305 	}
3306 	if (prev_nexthdr == IPPROTO_ROUTING) {
3307 		ip6_rthdr_t	*rthdr;
3308 		uint_t		hdr_len;
3309 
3310 		rthdr = (ip6_rthdr_t *)ptr;
3311 		prev_nexthdr = rthdr->ip6r_nxt;
3312 		prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt
3313 		    - (uint8_t *)ip6h;
3314 		hdr_len = 8 * (rthdr->ip6r_len + 1);
3315 		ptr += hdr_len;
3316 	}
3317 	if (prev_nexthdr != IPPROTO_FRAGMENT) {
3318 		/* Can't handle other headers before the fragment header */
3319 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
3320 		ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
3321 		freemsg(mp);
3322 		return (NULL);
3323 	}
3324 
3325 	/*
3326 	 * Note: Fragment offset in header is in 8-octet units.
3327 	 * Clearing least significant 3 bits not only extracts
3328 	 * it but also gets it in units of octets.
3329 	 */
3330 	offset = ntohs(fraghdr->ip6f_offlg) & ~7;
	/*
	 * NOTE(review): the flag is tested against the un-swapped
	 * ip6f_offlg, so IP6F_MORE_FRAG is presumably defined in network
	 * byte order -- confirm.  more_frags holds the masked value, so it
	 * is only meaningful as zero/non-zero.
	 */
3331 	more_frags = (fraghdr->ip6f_offlg & IP6F_MORE_FRAG);
3332 
3333 	/*
3334 	 * Is the more frags flag on and the payload length not a multiple
3335 	 * of eight?
3336 	 */
3337 	if (more_frags && (ntohs(ip6h->ip6_plen) & 7)) {
3338 		ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
3339 		icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER,
3340 		    (uint32_t)((char *)&ip6h->ip6_plen -
3341 		    (char *)ip6h), B_FALSE, ira);
3342 		return (NULL);
3343 	}
3344 
3345 	v6src_ptr = &ip6h->ip6_src;
3346 	v6dst_ptr = &ip6h->ip6_dst;
3347 	end = remlen;
3348 
3349 	hdr_length = (uint_t)((char *)&fraghdr[1] - (char *)ip6h);
3350 	end += offset;
3351 
3352 	/*
3353 	 * Would fragment cause reassembled packet to have a payload length
3354 	 * greater than IP_MAXPACKET - the max payload size?
3355 	 */
3356 	if (end > IP_MAXPACKET) {
3357 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
3358 		ip_drop_input("Reassembled packet too large", mp, ill);
3359 		icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER,
3360 		    (uint32_t)((char *)&fraghdr->ip6f_offlg -
3361 		    (char *)ip6h), B_FALSE, ira);
3362 		return (NULL);
3363 	}
3364 
3365 	/*
3366 	 * This packet just has one fragment. Reassembly not
3367 	 * needed.
3368 	 */
3369 	if (!more_frags && offset == 0) {
3370 		goto reass_done;
3371 	}
3372 
3373 	/*
3374 	 * Drop the fragment as early as possible, if
3375 	 * we don't have resource(s) to re-assemble.
3376 	 */
3377 	if (ipst->ips_ip_reass_queue_bytes == 0) {
3378 		freemsg(mp);
3379 		return (NULL);
3380 	}
3381 
3382 	/* Record the ECN field info. */
3383 	ecn_info = (uint8_t)(ntohl(ip6h->ip6_vcf & htonl(~0xFFCFFFFF)) >> 20);
3384 	/*
3385 	 * If this is not the first fragment, dump the unfragmentable
3386 	 * portion of the packet.
3387 	 */
3388 	if (offset)
3389 		mp->b_rptr = (uchar_t *)&fraghdr[1];
3390 
3391 	/*
3392 	 * Fragmentation reassembly.  Each ILL has a hash table for
3393 	 * queueing packets undergoing reassembly for all IPIFs
3394 	 * associated with the ILL.  The hash is based on the packet
3395 	 * IP ident field.  The ILL frag hash table was allocated
3396 	 * as a timer block at the time the ILL was created.  Whenever
3397 	 * there is anything on the reassembly queue, the timer will
3398 	 * be running.
3399 	 */
3400 	/* Handle vnic loopback of fragments */
3401 	if (mp->b_datap->db_ref > 2)
3402 		msg_len = 0;
3403 	else
3404 		msg_len = MBLKSIZE(mp);
3405 
3406 	tail_mp = mp;
3407 	while (tail_mp->b_cont != NULL) {
3408 		tail_mp = tail_mp->b_cont;
3409 		if (tail_mp->b_datap->db_ref <= 2)
3410 			msg_len += MBLKSIZE(tail_mp);
3411 	}
3412 	/*
3413 	 * If the reassembly list for this ILL will get too big
3414 	 * prune it.
3415 	 */
3416 
3417 	if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
3418 	    ipst->ips_ip_reass_queue_bytes) {
3419 		DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len,
3420 		    uint_t, ill->ill_frag_count,
3421 		    uint_t, ipst->ips_ip_reass_queue_bytes);
3422 		ill_frag_prune(ill,
3423 		    (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 :
3424 		    (ipst->ips_ip_reass_queue_bytes - msg_len));
3425 		pruned = B_TRUE;
3426 	}
3427 
3428 	ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH_V6(*v6src_ptr, ident)];
3429 	mutex_enter(&ipfb->ipfb_lock);
3430 
3431 	ipfp = &ipfb->ipfb_ipf;
3432 	/* Try to find an existing fragment queue for this packet. */
3433 	for (;;) {
3434 		ipf = ipfp[0];
3435 		if (ipf) {
3436 			/*
3437 			 * It has to match on ident, source address, and
3438 			 * dest address.
3439 			 */
3440 			if (ipf->ipf_ident == ident &&
3441 			    IN6_ARE_ADDR_EQUAL(&ipf->ipf_v6src, v6src_ptr) &&
3442 			    IN6_ARE_ADDR_EQUAL(&ipf->ipf_v6dst, v6dst_ptr)) {
3443 
3444 				/*
3445 				 * If we have received too many
3446 				 * duplicate fragments for this packet
3447 				 * free it.
3448 				 */
3449 				if (ipf->ipf_num_dups > ip_max_frag_dups) {
3450 					ill_frag_free_pkts(ill, ipfb, ipf, 1);
3451 					freemsg(mp);
3452 					mutex_exit(&ipfb->ipfb_lock);
3453 					return (NULL);
3454 				}
3455 
3456 				break;
3457 			}
3458 			ipfp = &ipf->ipf_hash_next;
3459 			continue;
3460 		}
3461 
3462 
3463 		/*
3464 		 * If we pruned the list, do we want to store this new
3465 		 * fragment?. We apply an optimization here based on the
3466 		 * fact that most fragments will be received in order.
3467 		 * So if the offset of this incoming fragment is zero,
3468 		 * it is the first fragment of a new packet. We will
3469 		 * keep it.  Otherwise drop the fragment, as we have
3470 		 * probably pruned the packet already (since the
3471 		 * packet cannot be found).
3472 		 */
3473 
3474 		if (pruned && offset != 0) {
3475 			mutex_exit(&ipfb->ipfb_lock);
3476 			freemsg(mp);
3477 			return (NULL);
3478 		}
3479 
3480 		/* New guy.  Allocate a frag message. */
3481 		mp1 = allocb(sizeof (*ipf), BPRI_MED);
3482 		if (!mp1) {
3483 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3484 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
3485 			freemsg(mp);
			/*
			 * Common exit for the paths below that jump here
			 * with ipfb_lock still held once mp has been either
			 * queued or freed.
			 */
3486 	partial_reass_done:
3487 			mutex_exit(&ipfb->ipfb_lock);
3488 			return (NULL);
3489 		}
3490 
3491 		if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst))  {
3492 			/*
3493 			 * Too many fragmented packets in this hash bucket.
3494 			 * Free the oldest.
3495 			 */
3496 			ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
3497 		}
3498 
3499 		mp1->b_cont = mp;
3500 
3501 		/* Initialize the fragment header. */
3502 		ipf = (ipf_t *)mp1->b_rptr;
3503 		ipf->ipf_mp = mp1;
3504 		ipf->ipf_ptphn = ipfp;
3505 		ipfp[0] = ipf;
3506 		ipf->ipf_hash_next = NULL;
3507 		ipf->ipf_ident = ident;
3508 		ipf->ipf_v6src = *v6src_ptr;
3509 		ipf->ipf_v6dst = *v6dst_ptr;
3510 		/* Record reassembly start time. */
3511 		ipf->ipf_timestamp = gethrestime_sec();
3512 		/* Record ipf generation and account for frag header */
3513 		ipf->ipf_gen = ill->ill_ipf_gen++;
3514 		ipf->ipf_count = MBLKSIZE(mp1);
3515 		ipf->ipf_protocol = nexthdr;
3516 		ipf->ipf_nf_hdr_len = 0;
3517 		ipf->ipf_prev_nexthdr_offset = 0;
3518 		ipf->ipf_last_frag_seen = B_FALSE;
3519 		ipf->ipf_ecn = ecn_info;
3520 		ipf->ipf_num_dups = 0;
3521 		ipfb->ipfb_frag_pkts++;
3522 		ipf->ipf_checksum = 0;
3523 		ipf->ipf_checksum_flags = 0;
3524 
3525 		/* Store checksum value in fragment header */
3526 		if (sum_flags != 0) {
3527 			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3528 			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3529 			ipf->ipf_checksum = sum_val;
3530 			ipf->ipf_checksum_flags = sum_flags;
3531 		}
3532 
3533 		/*
3534 		 * We handle reassembly two ways.  In the easy case,
3535 		 * where all the fragments show up in order, we do
3536 		 * minimal bookkeeping, and just clip new pieces on
3537 		 * the end.  If we ever see a hole, then we go off
3538 		 * to ip_reassemble which has to mark the pieces and
3539 		 * keep track of the number of holes, etc.  Obviously,
3540 		 * the point of having both mechanisms is so we can
3541 		 * handle the easy case as efficiently as possible.
3542 		 */
3543 		if (offset == 0) {
3544 			/* Easy case, in-order reassembly so far. */
3545 			/* Update the byte count */
3546 			ipf->ipf_count += msg_len;
3547 			ipf->ipf_tail_mp = tail_mp;
3548 			/*
3549 			 * Keep track of next expected offset in
3550 			 * ipf_end.
3551 			 */
3552 			ipf->ipf_end = end;
3553 			ipf->ipf_nf_hdr_len = hdr_length;
3554 			ipf->ipf_prev_nexthdr_offset = prev_nexthdr_offset;
3555 		} else {
3556 			/* Hard case, hole at the beginning. */
3557 			ipf->ipf_tail_mp = NULL;
3558 			/*
3559 			 * ipf_end == 0 means that we have given up
3560 			 * on easy reassembly.
3561 			 */
3562 			ipf->ipf_end = 0;
3563 
3564 			/* Forget checksum offload from now on */
3565 			ipf->ipf_checksum_flags = 0;
3566 
3567 			/*
3568 			 * ipf_hole_cnt is set by ip_reassemble.
3569 			 * ipf_count is updated by ip_reassemble.
3570 			 * No need to check for return value here
3571 			 * as we don't expect reassembly to complete or
3572 			 * fail for the first fragment itself.
3573 			 */
3574 			(void) ip_reassemble(mp, ipf, offset, more_frags, ill,
3575 			    msg_len);
3576 		}
3577 		/* Update per ipfb and ill byte counts */
3578 		ipfb->ipfb_count += ipf->ipf_count;
3579 		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
3580 		atomic_add_32(&ill->ill_frag_count, ipf->ipf_count);
3581 		/* If the frag timer wasn't already going, start it. */
3582 		mutex_enter(&ill->ill_lock);
3583 		ill_frag_timer_start(ill);
3584 		mutex_exit(&ill->ill_lock);
3585 		goto partial_reass_done;
3586 	}
3587 
3588 	/*
3589 	 * If the packet's flag has changed (it could be coming up
3590 	 * from an interface different than the previous, therefore
3591 	 * possibly different checksum capability), then forget about
3592 	 * any stored checksum states.  Otherwise add the value to
3593 	 * the existing one stored in the fragment header.
3594 	 */
3595 	if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
3596 		sum_val += ipf->ipf_checksum;
3597 		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3598 		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3599 		ipf->ipf_checksum = sum_val;
3600 	} else if (ipf->ipf_checksum_flags != 0) {
3601 		/* Forget checksum offload from now on */
3602 		ipf->ipf_checksum_flags = 0;
3603 	}
3604 
3605 	/*
3606 	 * We have a new piece of a datagram which is already being
3607 	 * reassembled.  Update the ECN info if all IP fragments
3608 	 * are ECN capable.  If there is one which is not, clear
3609 	 * all the info.  If there is at least one which has CE
3610 	 * code point, IP needs to report that up to transport.
3611 	 */
3612 	if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
3613 		if (ecn_info == IPH_ECN_CE)
3614 			ipf->ipf_ecn = IPH_ECN_CE;
3615 	} else {
3616 		ipf->ipf_ecn = IPH_ECN_NECT;
3617 	}
3618 
3619 	if (offset && ipf->ipf_end == offset) {
3620 		/* The new fragment fits at the end */
3621 		ipf->ipf_tail_mp->b_cont = mp;
3622 		/* Update the byte count */
3623 		ipf->ipf_count += msg_len;
3624 		/* Update per ipfb and ill byte counts */
3625 		ipfb->ipfb_count += msg_len;
3626 		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
3627 		atomic_add_32(&ill->ill_frag_count, msg_len);
3628 		if (more_frags) {
3629 			/* More to come. */
3630 			ipf->ipf_end = end;
3631 			ipf->ipf_tail_mp = tail_mp;
3632 			goto partial_reass_done;
3633 		}
3634 	} else {
3635 		/*
3636 		 * Go do the hard cases.
3637 		 * Call ip_reassemble().
3638 		 */
3639 		int ret;
3640 
3641 		if (offset == 0) {
3642 			if (ipf->ipf_prev_nexthdr_offset == 0) {
3643 				ipf->ipf_nf_hdr_len = hdr_length;
3644 				ipf->ipf_prev_nexthdr_offset =
3645 				    prev_nexthdr_offset;
3646 			}
3647 		}
3648 		/* Save current byte count */
3649 		count = ipf->ipf_count;
3650 		ret = ip_reassemble(mp, ipf, offset, more_frags, ill, msg_len);
3651 
3652 		/* Count of bytes added and subtracted (freeb()ed) */
3653 		count = ipf->ipf_count - count;
3654 		if (count) {
3655 			/* Update per ipfb and ill byte counts */
3656 			ipfb->ipfb_count += count;
3657 			ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
3658 			atomic_add_32(&ill->ill_frag_count, count);
3659 		}
3660 		if (ret == IP_REASS_PARTIAL) {
3661 			goto partial_reass_done;
3662 		} else if (ret == IP_REASS_FAILED) {
3663 			/* Reassembly failed. Free up all resources */
3664 			ill_frag_free_pkts(ill, ipfb, ipf, 1);
3665 			for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
3666 				IP_REASS_SET_START(t_mp, 0);
3667 				IP_REASS_SET_END(t_mp, 0);
3668 			}
3669 			freemsg(mp);
3670 			goto partial_reass_done;
3671 		}
3672 
3673 		/* We will reach here iff 'ret' is IP_REASS_COMPLETE */
3674 	}
3675 	/*
3676 	 * We have completed reassembly.  Unhook the frag header from
3677 	 * the reassembly list.
3678 	 *
3679 	 * Grab the unfragmentable header length next header value out
3680 	 * of the first fragment
3681 	 */
3682 	ASSERT(ipf->ipf_nf_hdr_len != 0);
3683 	hdr_length = ipf->ipf_nf_hdr_len;
3684 
3685 	/*
3686 	 * Before we free the frag header, record the ECN info
3687 	 * to report back to the transport.
3688 	 */
3689 	ecn_info = ipf->ipf_ecn;
3690 
3691 	/*
3692 	 * Store the nextheader field in the header preceding the fragment
3693 	 * header
3694 	 */
3695 	nexthdr = ipf->ipf_protocol;
3696 	prev_nexthdr_offset = ipf->ipf_prev_nexthdr_offset;
3697 	ipfp = ipf->ipf_ptphn;
3698 
3699 	/* We need to supply these to caller */
3700 	if ((sum_flags = ipf->ipf_checksum_flags) != 0)
3701 		sum_val = ipf->ipf_checksum;
3702 	else
3703 		sum_val = 0;
3704 
3705 	mp1 = ipf->ipf_mp;
3706 	count = ipf->ipf_count;
3707 	ipf = ipf->ipf_hash_next;
3708 	if (ipf)
3709 		ipf->ipf_ptphn = ipfp;
3710 	ipfp[0] = ipf;
3711 	atomic_add_32(&ill->ill_frag_count, -count);
3712 	ASSERT(ipfb->ipfb_count >= count);
3713 	ipfb->ipfb_count -= count;
3714 	ipfb->ipfb_frag_pkts--;
3715 	mutex_exit(&ipfb->ipfb_lock);
3716 	/* Ditch the frag header. */
3717 	mp = mp1->b_cont;
3718 	freeb(mp1);
3719 
3720 	/*
3721 	 * Make sure the packet is good by doing some sanity
3722 	 * check. If bad we can silently drop the packet.
3723 	 */
3724 reass_done:
3725 	if (hdr_length < sizeof (ip6_frag_t)) {
3726 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
3727 		ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
3728 		ip1dbg(("ip_input_fragment_v6: bad packet\n"));
3729 		freemsg(mp);
3730 		return (NULL);
3731 	}
3732 
3733 	/*
3734 	 * Remove the fragment header from the initial header by
3735 	 * splitting the mblk into the non-fragmentable header and
3736 	 * everything after the fragment extension header.  This has the
3737 	 * side effect of putting all the headers that need destination
3738 	 * processing into the b_cont block-- on return this fact is
3739 	 * used in order to avoid having to look at the extensions
3740 	 * already processed.
3741 	 *
3742 	 * Note that this code assumes that the unfragmentable portion
3743 	 * of the header is in the first mblk and increments
3744 	 * the read pointer past it.  If this assumption is broken
3745 	 * this code fails badly.
3746 	 */
3747 	if (mp->b_rptr + hdr_length != mp->b_wptr) {
3748 		mblk_t *nmp;
3749 
3750 		if (!(nmp = dupb(mp))) {
3751 			ip1dbg(("ip_input_fragment_v6: dupb failed\n"));
3752 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3753 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
3754 			freemsg(mp);
3755 			return (NULL);
3756 		}
3757 		nmp->b_cont = mp->b_cont;
3758 		mp->b_cont = nmp;
3759 		nmp->b_rptr += hdr_length;
3760 	}
	/* Trim the first mblk so that it ends just before the fragment header. */
3761 	mp->b_wptr = mp->b_rptr + hdr_length - sizeof (ip6_frag_t);
3762 
3763 	ip6h = (ip6_t *)mp->b_rptr;
3764 	((char *)ip6h)[prev_nexthdr_offset] = nexthdr;
3765 
3766 	/* Restore original IP length in header. */
3767 	packet_size = msgdsize(mp);
3768 	ip6h->ip6_plen = htons((uint16_t)(packet_size - IPV6_HDR_LEN));
3769 	/* Record the ECN info. */
3770 	ip6h->ip6_vcf &= htonl(0xFFCFFFFF);
3771 	ip6h->ip6_vcf |= htonl(ecn_info << 20);
3772 
3773 	/* Update the receive attributes */
3774 	ira->ira_pktlen = packet_size;
3775 	ira->ira_ip_hdr_length = hdr_length - sizeof (ip6_frag_t);
3776 	ira->ira_protocol = nexthdr;
3777 
3778 	/* Reassembly is successful; set checksum information in packet */
3779 	DB_CKSUM16(mp) = (uint16_t)sum_val;
3780 	DB_CKSUMFLAGS(mp) = sum_flags;
3781 	DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length;
3782 
3783 	return (mp);
3784 }
3785 
3786 /*
3787  * Given an mblk and a ptr, find the destination address in an IPv6 routing
3788  * header.
3789  */
3790 static in6_addr_t
3791 pluck_out_dst(const mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv)
3792 {
3793 	ip6_rthdr0_t *rt0;
3794 	int segleft, numaddr;
3795 	in6_addr_t *ap, rv = oldrv;
3796 
3797 	rt0 = (ip6_rthdr0_t *)whereptr;
3798 	if (rt0->ip6r0_type != 0 && rt0->ip6r0_type != 2) {
3799 		DTRACE_PROBE2(pluck_out_dst_unknown_type, mblk_t *, mp,
3800 		    uint8_t *, whereptr);
3801 		return (rv);
3802 	}
3803 	segleft = rt0->ip6r0_segleft;
3804 	numaddr = rt0->ip6r0_len / 2;
3805 
3806 	if ((rt0->ip6r0_len & 0x1) ||
3807 	    (mp != NULL && whereptr + (rt0->ip6r0_len + 1) * 8 > mp->b_wptr) ||
3808 	    (segleft > rt0->ip6r0_len / 2)) {
3809 		/*
3810 		 * Corrupt packet.  Either the routing header length is odd
3811 		 * (can't happen) or mismatched compared to the packet, or the
3812 		 * number of addresses is.  Return what we can.  This will
3813 		 * only be a problem on forwarded packets that get squeezed
3814 		 * through an outbound tunnel enforcing IPsec Tunnel Mode.
3815 		 */
3816 		DTRACE_PROBE2(pluck_out_dst_badpkt, mblk_t *, mp, uint8_t *,
3817 		    whereptr);
3818 		return (rv);
3819 	}
3820 
3821 	if (segleft != 0) {
3822 		ap = (in6_addr_t *)((char *)rt0 + sizeof (*rt0));
3823 		rv = ap[numaddr - 1];
3824 	}
3825 
3826 	return (rv);
3827 }
3828 
3829 /*
3830  * Walk through the options to see if there is a routing header.
3831  * If present get the destination which is the last address of
3832  * the option.
3833  * mp needs to be provided in cases when the extension headers might span
3834  * b_cont; mp is never modified by this function.
3835  */
3836 in6_addr_t
3837 ip_get_dst_v6(ip6_t *ip6h, const mblk_t *mp, boolean_t *is_fragment)
3838 {
3839 	const mblk_t *current_mp = mp;
3840 	uint8_t nexthdr;
3841 	uint8_t *whereptr;
3842 	int ehdrlen;
3843 	in6_addr_t rv;
3844 
3845 	whereptr = (uint8_t *)ip6h;
3846 	ehdrlen = sizeof (ip6_t);
3847 
3848 	/* We assume at least the IPv6 base header is within one mblk. */
3849 	ASSERT(mp == NULL ||
3850 	    (mp->b_rptr <= whereptr && mp->b_wptr >= whereptr + ehdrlen));
3851 
3852 	rv = ip6h->ip6_dst;
3853 	nexthdr = ip6h->ip6_nxt;
3854 	if (is_fragment != NULL)
3855 		*is_fragment = B_FALSE;
3856 
3857 	/*
3858 	 * We also assume (thanks to ipsec_tun_outbound()'s pullup) that
3859 	 * no extension headers will be split across mblks.
3860 	 */
3861 
3862 	while (nexthdr == IPPROTO_HOPOPTS || nexthdr == IPPROTO_DSTOPTS ||
3863 	    nexthdr == IPPROTO_ROUTING) {
3864 		if (nexthdr == IPPROTO_ROUTING)
3865 			rv = pluck_out_dst(current_mp, whereptr, rv);
3866 
3867 		/*
3868 		 * All IPv6 extension headers have the next-header in byte
3869 		 * 0, and the (length - 8) in 8-byte-words.
3870 		 */
3871 		while (current_mp != NULL &&
3872 		    whereptr + ehdrlen >= current_mp->b_wptr) {
3873 			ehdrlen -= (current_mp->b_wptr - whereptr);
3874 			current_mp = current_mp->b_cont;
3875 			if (current_mp == NULL) {
3876 				/* Bad packet.  Return what we can. */
3877 				DTRACE_PROBE3(ip_get_dst_v6_badpkt, mblk_t *,
3878 				    mp, mblk_t *, current_mp, ip6_t *, ip6h);
3879 				goto done;
3880 			}
3881 			whereptr = current_mp->b_rptr;
3882 		}
3883 		whereptr += ehdrlen;
3884 
3885 		nexthdr = *whereptr;
3886 		ASSERT(current_mp == NULL || whereptr + 1 < current_mp->b_wptr);
3887 		ehdrlen = (*(whereptr + 1) + 1) * 8;
3888 	}
3889 
3890 done:
3891 	if (nexthdr == IPPROTO_FRAGMENT && is_fragment != NULL)
3892 		*is_fragment = B_TRUE;
3893 	return (rv);
3894 }
3895 
3896 /*
3897  * ip_source_routed_v6:
3898  * This function is called by redirect code (called from ip_input_v6) to
3899  * know whether this packet is source routed through this node i.e
3900  * whether this node (router) is part of the journey. This
3901  * function is called under two cases :
3902  *
3903  * case 1 : Routing header was processed by this node and
3904  *	    ip_process_rthdr replaced ip6_dst with the next hop
3905  *	    and we are forwarding the packet to the next hop.
3906  *
3907  * case 2 : Routing header was not processed by this node and we
3908  *	    are just forwarding the packet.
3909  *
3910  * For case (1) we don't want to send redirects. For case(2) we
3911  * want to send redirects.
3912  */
3913 static boolean_t
3914 ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst)
3915 {
3916 	uint8_t		nexthdr;
3917 	in6_addr_t	*addrptr;
3918 	ip6_rthdr0_t	*rthdr;
3919 	uint8_t		numaddr;
3920 	ip6_hbh_t	*hbhhdr;
3921 	uint_t		ehdrlen;
3922 	uint8_t		*byteptr;
3923 
3924 	ip2dbg(("ip_source_routed_v6\n"));
3925 	nexthdr = ip6h->ip6_nxt;
3926 	ehdrlen = IPV6_HDR_LEN;
3927 
3928 	/* if a routing hdr is preceeded by HOPOPT or DSTOPT */
3929 	while (nexthdr == IPPROTO_HOPOPTS ||
3930 	    nexthdr == IPPROTO_DSTOPTS) {
3931 		byteptr = (uint8_t *)ip6h + ehdrlen;
3932 		/*
3933 		 * Check if we have already processed
3934 		 * packets or we are just a forwarding
3935 		 * router which only pulled up msgs up
3936 		 * to IPV6HDR and  one HBH ext header
3937 		 */
3938 		if (byteptr + MIN_EHDR_LEN > mp->b_wptr) {
3939 			ip2dbg(("ip_source_routed_v6: Extension"
3940 			    " headers not processed\n"));
3941 			return (B_FALSE);
3942 		}
3943 		hbhhdr = (ip6_hbh_t *)byteptr;
3944 		nexthdr = hbhhdr->ip6h_nxt;
3945 		ehdrlen = ehdrlen + 8 * (hbhhdr->ip6h_len + 1);
3946 	}
3947 	switch (nexthdr) {
3948 	case IPPROTO_ROUTING:
3949 		byteptr = (uint8_t *)ip6h + ehdrlen;
3950 		/*
3951 		 * If for some reason, we haven't pulled up
3952 		 * the routing hdr data mblk, then we must
3953 		 * not have processed it at all. So for sure
3954 		 * we are not part of the source routed journey.
3955 		 */
3956 		if (byteptr + MIN_EHDR_LEN > mp->b_wptr) {
3957 			ip2dbg(("ip_source_routed_v6: Routing"
3958 			    " header not processed\n"));
3959 			return (B_FALSE);
3960 		}
3961 		rthdr = (ip6_rthdr0_t *)byteptr;
3962 		/*
3963 		 * Either we are an intermediate router or the
3964 		 * last hop before destination and we have
3965 		 * already processed the routing header.
3966 		 * If segment_left is greater than or equal to zero,
3967 		 * then we must be the (numaddr - segleft) entry
3968 		 * of the routing header. Although ip6r0_segleft
3969 		 * is a unit8_t variable, we still check for zero
3970 		 * or greater value, if in case the data type
3971 		 * is changed someday in future.
3972 		 */
3973 		if (rthdr->ip6r0_segleft > 0 ||
3974 		    rthdr->ip6r0_segleft == 0) {
3975 			numaddr = rthdr->ip6r0_len / 2;
3976 			addrptr = (in6_addr_t *)((char *)rthdr +
3977 			    sizeof (*rthdr));
3978 			addrptr += (numaddr - (rthdr->ip6r0_segleft + 1));
3979 			if (addrptr != NULL) {
3980 				if (ip_type_v6(addrptr, ipst) == IRE_LOCAL)
3981 					return (B_TRUE);
3982 				ip1dbg(("ip_source_routed_v6: Not local\n"));
3983 			}
3984 		}
3985 	/* FALLTHROUGH */
3986 	default:
3987 		ip2dbg(("ip_source_routed_v6: Not source routed here\n"));
3988 		return (B_FALSE);
3989 	}
3990 }
3991 
3992 /*
3993  * IPv6 fragmentation.  Essentially the same as IPv4 fragmentation.
3994  * We have not optimized this in terms of number of mblks
3995  * allocated. For instance, for each fragment sent we always allocate a
3996  * mblk to hold the IPv6 header and fragment header.
3997  *
3998  * Assumes that all the extension headers are contained in the first mblk
3999  * and that the fragment header has has already been added by calling
4000  * ip_fraghdr_add_v6.
4001  */
4002 int
4003 ip_fragment_v6(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len,
4004     uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
4005     pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie)
4006 {
4007 	ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
4008 	ip6_t		*fip6h;
4009 	mblk_t		*hmp;
4010 	mblk_t		*hmp0;
4011 	mblk_t		*dmp;
4012 	ip6_frag_t	*fraghdr;
4013 	size_t		unfragmentable_len;
4014 	size_t		mlen;
4015 	size_t		max_chunk;
4016 	uint16_t	off_flags;
4017 	uint16_t	offset = 0;
4018 	ill_t		*ill = nce->nce_ill;
4019 	uint8_t		nexthdr;
4020 	uint8_t		*ptr;
4021 	ip_stack_t	*ipst = ill->ill_ipst;
4022 	uint_t		priority = mp->b_band;
4023 	int		error = 0;
4024 
4025 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds);
4026 	if (max_frag == 0) {
4027 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4028 		ip_drop_output("FragFails: zero max_frag", mp, ill);
4029 		freemsg(mp);
4030 		return (EINVAL);
4031 	}
4032 
4033 	/*
4034 	 * Caller should have added fraghdr_t to pkt_len, and also
4035 	 * updated ip6_plen.
4036 	 */
4037 	ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == pkt_len);
4038 	ASSERT(msgdsize(mp) == pkt_len);
4039 
4040 	/*
4041 	 * Determine the length of the unfragmentable portion of this
4042 	 * datagram.  This consists of the IPv6 header, a potential
4043 	 * hop-by-hop options header, a potential pre-routing-header
4044 	 * destination options header, and a potential routing header.
4045 	 */
4046 	nexthdr = ip6h->ip6_nxt;
4047 	ptr = (uint8_t *)&ip6h[1];
4048 
4049 	if (nexthdr == IPPROTO_HOPOPTS) {
4050 		ip6_hbh_t	*hbh_hdr;
4051 		uint_t		hdr_len;
4052 
4053 		hbh_hdr = (ip6_hbh_t *)ptr;
4054 		hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
4055 		nexthdr = hbh_hdr->ip6h_nxt;
4056 		ptr += hdr_len;
4057 	}
4058 	if (nexthdr == IPPROTO_DSTOPTS) {
4059 		ip6_dest_t	*dest_hdr;
4060 		uint_t		hdr_len;
4061 
4062 		dest_hdr = (ip6_dest_t *)ptr;
4063 		if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) {
4064 			hdr_len = 8 * (dest_hdr->ip6d_len + 1);
4065 			nexthdr = dest_hdr->ip6d_nxt;
4066 			ptr += hdr_len;
4067 		}
4068 	}
4069 	if (nexthdr == IPPROTO_ROUTING) {
4070 		ip6_rthdr_t	*rthdr;
4071 		uint_t		hdr_len;
4072 
4073 		rthdr = (ip6_rthdr_t *)ptr;
4074 		nexthdr = rthdr->ip6r_nxt;
4075 		hdr_len = 8 * (rthdr->ip6r_len + 1);
4076 		ptr += hdr_len;
4077 	}
4078 	if (nexthdr != IPPROTO_FRAGMENT) {
4079 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4080 		ip_drop_output("FragFails: bad nexthdr", mp, ill);
4081 		freemsg(mp);
4082 		return (EINVAL);
4083 	}
4084 	unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h);
4085 	unfragmentable_len += sizeof (ip6_frag_t);
4086 
4087 	max_chunk = (max_frag - unfragmentable_len) & ~7;
4088 
4089 	/*
4090 	 * Allocate an mblk with enough room for the link-layer
4091 	 * header and the unfragmentable part of the datagram, which includes
4092 	 * the fragment header.  This (or a copy) will be used as the
4093 	 * first mblk for each fragment we send.
4094 	 */
4095 	hmp = allocb_tmpl(unfragmentable_len + ipst->ips_ip_wroff_extra, mp);
4096 	if (hmp == NULL) {
4097 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4098 		ip_drop_output("FragFails: no hmp", mp, ill);
4099 		freemsg(mp);
4100 		return (ENOBUFS);
4101 	}
4102 	hmp->b_rptr += ipst->ips_ip_wroff_extra;
4103 	hmp->b_wptr = hmp->b_rptr + unfragmentable_len;
4104 
4105 	fip6h = (ip6_t *)hmp->b_rptr;
4106 	bcopy(ip6h, fip6h, unfragmentable_len);
4107 
4108 	/*
4109 	 * pkt_len is set to the total length of the fragmentable data in this
4110 	 * datagram.  For each fragment sent, we will decrement pkt_len
4111 	 * by the amount of fragmentable data sent in that fragment
4112 	 * until len reaches zero.
4113 	 */
4114 	pkt_len -= unfragmentable_len;
4115 
4116 	/*
4117 	 * Move read ptr past unfragmentable portion, we don't want this part
4118 	 * of the data in our fragments.
4119 	 */
4120 	mp->b_rptr += unfragmentable_len;
4121 	if (mp->b_rptr == mp->b_wptr) {
4122 		mblk_t *mp1 = mp->b_cont;
4123 		freeb(mp);
4124 		mp = mp1;
4125 	}
4126 
4127 	while (pkt_len != 0) {
4128 		mlen = MIN(pkt_len, max_chunk);
4129 		pkt_len -= mlen;
4130 		if (pkt_len != 0) {
4131 			/* Not last */
4132 			hmp0 = copyb(hmp);
4133 			if (hmp0 == NULL) {
4134 				BUMP_MIB(ill->ill_ip_mib,
4135 				    ipIfStatsOutFragFails);
4136 				ip_drop_output("FragFails: copyb failed",
4137 				    mp, ill);
4138 				freeb(hmp);
4139 				freemsg(mp);
4140 				ip1dbg(("ip_fragment_v6: copyb failed\n"));
4141 				return (ENOBUFS);
4142 			}
4143 			off_flags = IP6F_MORE_FRAG;
4144 		} else {
4145 			/* Last fragment */
4146 			hmp0 = hmp;
4147 			hmp = NULL;
4148 			off_flags = 0;
4149 		}
4150 		fip6h = (ip6_t *)(hmp0->b_rptr);
4151 		fraghdr = (ip6_frag_t *)(hmp0->b_rptr + unfragmentable_len -
4152 		    sizeof (ip6_frag_t));
4153 
4154 		fip6h->ip6_plen = htons((uint16_t)(mlen +
4155 		    unfragmentable_len - IPV6_HDR_LEN));
4156 		/*
4157 		 * Note: Optimization alert.
4158 		 * In IPv6 (and IPv4) protocol header, Fragment Offset
4159 		 * ("offset") is 13 bits wide and in 8-octet units.
4160 		 * In IPv6 protocol header (unlike IPv4) in a 16 bit field,
4161 		 * it occupies the most significant 13 bits.
4162 		 * (least significant 13 bits in IPv4).
4163 		 * We do not do any shifts here. Not shifting is same effect
4164 		 * as taking offset value in octet units, dividing by 8 and
4165 		 * then shifting 3 bits left to line it up in place in proper
4166 		 * place protocol header.
4167 		 */
4168 		fraghdr->ip6f_offlg = htons(offset) | off_flags;
4169 
4170 		if (!(dmp = ip_carve_mp(&mp, mlen))) {
4171 			/* mp has already been freed by ip_carve_mp() */
4172 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4173 			ip_drop_output("FragFails: could not carve mp",
4174 			    hmp0, ill);
4175 			if (hmp != NULL)
4176 				freeb(hmp);
4177 			freeb(hmp0);
4178 			ip1dbg(("ip_carve_mp: failed\n"));
4179 			return (ENOBUFS);
4180 		}
4181 		hmp0->b_cont = dmp;
4182 		/* Get the priority marking, if any */
4183 		hmp0->b_band = priority;
4184 
4185 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
4186 
4187 		error = postfragfn(hmp0, nce, ixaflags,
4188 		    mlen + unfragmentable_len, xmit_hint, szone, nolzid,
4189 		    ixa_cookie);
4190 		if (error != 0 && error != EWOULDBLOCK && hmp != NULL) {
4191 			/* No point in sending the other fragments */
4192 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4193 			ip_drop_output("FragFails: postfragfn failed",
4194 			    hmp, ill);
4195 			freeb(hmp);
4196 			freemsg(mp);
4197 			return (error);
4198 		}
4199 		/* No need to redo state machine in loop */
4200 		ixaflags &= ~IXAF_REACH_CONF;
4201 
4202 		offset += mlen;
4203 	}
4204 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs);
4205 	return (error);
4206 }
4207 
4208 /*
4209  * Add a fragment header to an IPv6 packet.
4210  * Assumes that all the extension headers are contained in the first mblk.
4211  *
4212  * The fragment header is inserted after an hop-by-hop options header
4213  * and after [an optional destinations header followed by] a routing header.
4214  */
4215 mblk_t *
4216 ip_fraghdr_add_v6(mblk_t *mp, uint32_t ident, ip_xmit_attr_t *ixa)
4217 {
4218 	ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
4219 	ip6_t		*fip6h;
4220 	mblk_t		*hmp;
4221 	ip6_frag_t	*fraghdr;
4222 	size_t		unfragmentable_len;
4223 	uint8_t		nexthdr;
4224 	uint_t		prev_nexthdr_offset;
4225 	uint8_t		*ptr;
4226 	uint_t		priority = mp->b_band;
4227 	ip_stack_t	*ipst = ixa->ixa_ipst;
4228 
4229 	/*
4230 	 * Determine the length of the unfragmentable portion of this
4231 	 * datagram.  This consists of the IPv6 header, a potential
4232 	 * hop-by-hop options header, a potential pre-routing-header
4233 	 * destination options header, and a potential routing header.
4234 	 */
4235 	nexthdr = ip6h->ip6_nxt;
4236 	prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h;
4237 	ptr = (uint8_t *)&ip6h[1];
4238 
4239 	if (nexthdr == IPPROTO_HOPOPTS) {
4240 		ip6_hbh_t	*hbh_hdr;
4241 		uint_t		hdr_len;
4242 
4243 		hbh_hdr = (ip6_hbh_t *)ptr;
4244 		hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
4245 		nexthdr = hbh_hdr->ip6h_nxt;
4246 		prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt
4247 		    - (uint8_t *)ip6h;
4248 		ptr += hdr_len;
4249 	}
4250 	if (nexthdr == IPPROTO_DSTOPTS) {
4251 		ip6_dest_t	*dest_hdr;
4252 		uint_t		hdr_len;
4253 
4254 		dest_hdr = (ip6_dest_t *)ptr;
4255 		if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) {
4256 			hdr_len = 8 * (dest_hdr->ip6d_len + 1);
4257 			nexthdr = dest_hdr->ip6d_nxt;
4258 			prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt
4259 			    - (uint8_t *)ip6h;
4260 			ptr += hdr_len;
4261 		}
4262 	}
4263 	if (nexthdr == IPPROTO_ROUTING) {
4264 		ip6_rthdr_t	*rthdr;
4265 		uint_t		hdr_len;
4266 
4267 		rthdr = (ip6_rthdr_t *)ptr;
4268 		nexthdr = rthdr->ip6r_nxt;
4269 		prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt
4270 		    - (uint8_t *)ip6h;
4271 		hdr_len = 8 * (rthdr->ip6r_len + 1);
4272 		ptr += hdr_len;
4273 	}
4274 	unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h);
4275 
4276 	/*
4277 	 * Allocate an mblk with enough room for the link-layer
4278 	 * header, the unfragmentable part of the datagram, and the
4279 	 * fragment header.
4280 	 */
4281 	hmp = allocb_tmpl(unfragmentable_len + sizeof (ip6_frag_t) +
4282 	    ipst->ips_ip_wroff_extra, mp);
4283 	if (hmp == NULL) {
4284 		ill_t *ill = ixa->ixa_nce->nce_ill;
4285 
4286 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
4287 		ip_drop_output("ipIfStatsOutDiscards: allocb failure", mp, ill);
4288 		freemsg(mp);
4289 		return (NULL);
4290 	}
4291 	hmp->b_rptr += ipst->ips_ip_wroff_extra;
4292 	hmp->b_wptr = hmp->b_rptr + unfragmentable_len + sizeof (ip6_frag_t);
4293 
4294 	fip6h = (ip6_t *)hmp->b_rptr;
4295 	fraghdr = (ip6_frag_t *)(hmp->b_rptr + unfragmentable_len);
4296 
4297 	bcopy(ip6h, fip6h, unfragmentable_len);
4298 	fip6h->ip6_plen = htons(ntohs(fip6h->ip6_plen) + sizeof (ip6_frag_t));
4299 	hmp->b_rptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT;
4300 
4301 	fraghdr->ip6f_nxt = nexthdr;
4302 	fraghdr->ip6f_reserved = 0;
4303 	fraghdr->ip6f_offlg = 0;
4304 	fraghdr->ip6f_ident = htonl(ident);
4305 
4306 	/* Get the priority marking, if any */
4307 	hmp->b_band = priority;
4308 
4309 	/*
4310 	 * Move read ptr past unfragmentable portion, we don't want this part
4311 	 * of the data in our fragments.
4312 	 */
4313 	mp->b_rptr += unfragmentable_len;
4314 	hmp->b_cont = mp;
4315 	return (hmp);
4316 }
4317 
4318 /*
4319  * Determine if the ill and multicast aspects of that packets
4320  * "matches" the conn.
4321  */
4322 boolean_t
4323 conn_wantpacket_v6(conn_t *connp, ip_recv_attr_t *ira, ip6_t *ip6h)
4324 {
4325 	ill_t		*ill = ira->ira_rill;
4326 	zoneid_t	zoneid = ira->ira_zoneid;
4327 	uint_t		in_ifindex;
4328 	in6_addr_t	*v6dst_ptr = &ip6h->ip6_dst;
4329 	in6_addr_t	*v6src_ptr = &ip6h->ip6_src;
4330 
4331 	/*
4332 	 * conn_incoming_ifindex is set by IPV6_BOUND_IF and as link-local
4333 	 * scopeid. This is used to limit
4334 	 * unicast and multicast reception to conn_incoming_ifindex.
4335 	 * conn_wantpacket_v6 is called both for unicast and
4336 	 * multicast packets.
4337 	 */
4338 	in_ifindex = connp->conn_incoming_ifindex;
4339 
4340 	/* mpathd can bind to the under IPMP interface, which we allow */
4341 	if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) {
4342 		if (!IS_UNDER_IPMP(ill))
4343 			return (B_FALSE);
4344 
4345 		if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill))
4346 			return (B_FALSE);
4347 	}
4348 
4349 	if (!IPCL_ZONE_MATCH(connp, zoneid))
4350 		return (B_FALSE);
4351 
4352 	if (!(ira->ira_flags & IRAF_MULTICAST))
4353 		return (B_TRUE);
4354 
4355 	if (connp->conn_multi_router)
4356 		return (B_TRUE);
4357 
4358 	if (ira->ira_protocol == IPPROTO_RSVP)
4359 		return (B_TRUE);
4360 
4361 	return (conn_hasmembers_ill_withsrc_v6(connp, v6dst_ptr, v6src_ptr,
4362 	    ira->ira_ill));
4363 }
4364 
4365 /*
4366  * pr_addr_dbg function provides the needed buffer space to call
4367  * inet_ntop() function's 3rd argument. This function should be
4368  * used by any kernel routine which wants to save INET6_ADDRSTRLEN
4369  * stack buffer space in it's own stack frame. This function uses
4370  * a buffer from it's own stack and prints the information.
4371  * Example: pr_addr_dbg("func: no route for %s\n ", AF_INET, addr)
4372  *
4373  * Note:    This function can call inet_ntop() once.
4374  */
4375 void
4376 pr_addr_dbg(char *fmt1, int af, const void *addr)
4377 {
4378 	char	buf[INET6_ADDRSTRLEN];
4379 
4380 	if (fmt1 == NULL) {
4381 		ip0dbg(("pr_addr_dbg: Wrong arguments\n"));
4382 		return;
4383 	}
4384 
4385 	/*
4386 	 * This does not compare debug level and just prints
4387 	 * out. Thus it is the responsibility of the caller
4388 	 * to check the appropriate debug-level before calling
4389 	 * this function.
4390 	 */
4391 	if (ip_debug > 0) {
4392 		printf(fmt1, inet_ntop(af, addr, buf, sizeof (buf)));
4393 	}
4394 
4395 
4396 }
4397 
4398 
4399 /*
4400  * Return the length in bytes of the IPv6 headers (base header
4401  * extension headers) that will be needed based on the
4402  * ip_pkt_t structure passed by the caller.
4403  *
4404  * The returned length does not include the length of the upper level
4405  * protocol (ULP) header.
4406  */
4407 int
4408 ip_total_hdrs_len_v6(const ip_pkt_t *ipp)
4409 {
4410 	int len;
4411 
4412 	len = IPV6_HDR_LEN;
4413 
4414 	/*
4415 	 * If there's a security label here, then we ignore any hop-by-hop
4416 	 * options the user may try to set.
4417 	 */
4418 	if (ipp->ipp_fields & IPPF_LABEL_V6) {
4419 		uint_t hopoptslen;
4420 		/*
4421 		 * Note that ipp_label_len_v6 is just the option - not
4422 		 * the hopopts extension header. It also needs to be padded
4423 		 * to a multiple of 8 bytes.
4424 		 */
4425 		ASSERT(ipp->ipp_label_len_v6 != 0);
4426 		hopoptslen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t);
4427 		hopoptslen = (hopoptslen + 7)/8 * 8;
4428 		len += hopoptslen;
4429 	} else if (ipp->ipp_fields & IPPF_HOPOPTS) {
4430 		ASSERT(ipp->ipp_hopoptslen != 0);
4431 		len += ipp->ipp_hopoptslen;
4432 	}
4433 
4434 	/*
4435 	 * En-route destination options
4436 	 * Only do them if there's a routing header as well
4437 	 */
4438 	if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) ==
4439 	    (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) {
4440 		ASSERT(ipp->ipp_rthdrdstoptslen != 0);
4441 		len += ipp->ipp_rthdrdstoptslen;
4442 	}
4443 	if (ipp->ipp_fields & IPPF_RTHDR) {
4444 		ASSERT(ipp->ipp_rthdrlen != 0);
4445 		len += ipp->ipp_rthdrlen;
4446 	}
4447 	if (ipp->ipp_fields & IPPF_DSTOPTS) {
4448 		ASSERT(ipp->ipp_dstoptslen != 0);
4449 		len += ipp->ipp_dstoptslen;
4450 	}
4451 	return (len);
4452 }
4453 
4454 /*
4455  * All-purpose routine to build a header chain of an IPv6 header
4456  * followed by any required extension headers and a proto header.
4457  *
4458  * The caller has to set the source and destination address as well as
4459  * ip6_plen. The caller has to massage any routing header and compensate
4460  * for the ULP pseudo-header checksum due to the source route.
4461  *
4462  * The extension headers will all be fully filled in.
4463  */
4464 void
4465 ip_build_hdrs_v6(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp,
4466     uint8_t protocol, uint32_t flowinfo)
4467 {
4468 	uint8_t *nxthdr_ptr;
4469 	uint8_t *cp;
4470 	ip6_t	*ip6h = (ip6_t *)buf;
4471 
4472 	/* Initialize IPv6 header */
4473 	ip6h->ip6_vcf =
4474 	    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
4475 	    (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
4476 
4477 	if (ipp->ipp_fields & IPPF_TCLASS) {
4478 		/* Overrides the class part of flowinfo */
4479 		ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
4480 		    ipp->ipp_tclass);
4481 	}
4482 
4483 	if (ipp->ipp_fields & IPPF_HOPLIMIT)
4484 		ip6h->ip6_hops = ipp->ipp_hoplimit;
4485 	else
4486 		ip6h->ip6_hops = ipp->ipp_unicast_hops;
4487 
4488 	if ((ipp->ipp_fields & IPPF_ADDR) &&
4489 	    !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4490 		ip6h->ip6_src = ipp->ipp_addr;
4491 
4492 	nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt;
4493 	cp = (uint8_t *)&ip6h[1];
4494 	/*
4495 	 * Here's where we have to start stringing together
4496 	 * any extension headers in the right order:
4497 	 * Hop-by-hop, destination, routing, and final destination opts.
4498 	 */
4499 	/*
4500 	 * If there's a security label here, then we ignore any hop-by-hop
4501 	 * options the user may try to set.
4502 	 */
4503 	if (ipp->ipp_fields & IPPF_LABEL_V6) {
4504 		/*
4505 		 * Hop-by-hop options with the label.
4506 		 * Note that ipp_label_v6 is just the option - not
4507 		 * the hopopts extension header. It also needs to be padded
4508 		 * to a multiple of 8 bytes.
4509 		 */
4510 		ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
4511 		uint_t hopoptslen;
4512 		uint_t padlen;
4513 
4514 		padlen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t);
4515 		hopoptslen = (padlen + 7)/8 * 8;
4516 		padlen = hopoptslen - padlen;
4517 
4518 		*nxthdr_ptr = IPPROTO_HOPOPTS;
4519 		nxthdr_ptr = &hbh->ip6h_nxt;
4520 		hbh->ip6h_len = hopoptslen/8 - 1;
4521 		cp += sizeof (ip6_hbh_t);
4522 		bcopy(ipp->ipp_label_v6, cp, ipp->ipp_label_len_v6);
4523 		cp += ipp->ipp_label_len_v6;
4524 
4525 		ASSERT(padlen <= 7);
4526 		switch (padlen) {
4527 		case 0:
4528 			break;
4529 		case 1:
4530 			cp[0] = IP6OPT_PAD1;
4531 			break;
4532 		default:
4533 			cp[0] = IP6OPT_PADN;
4534 			cp[1] = padlen - 2;
4535 			bzero(&cp[2], padlen - 2);
4536 			break;
4537 		}
4538 		cp += padlen;
4539 	} else if (ipp->ipp_fields & IPPF_HOPOPTS) {
4540 		/* Hop-by-hop options */
4541 		ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
4542 
4543 		*nxthdr_ptr = IPPROTO_HOPOPTS;
4544 		nxthdr_ptr = &hbh->ip6h_nxt;
4545 
4546 		bcopy(ipp->ipp_hopopts, cp, ipp->ipp_hopoptslen);
4547 		cp += ipp->ipp_hopoptslen;
4548 	}
4549 	/*
4550 	 * En-route destination options
4551 	 * Only do them if there's a routing header as well
4552 	 */
4553 	if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) ==
4554 	    (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) {
4555 		ip6_dest_t *dst = (ip6_dest_t *)cp;
4556 
4557 		*nxthdr_ptr = IPPROTO_DSTOPTS;
4558 		nxthdr_ptr = &dst->ip6d_nxt;
4559 
4560 		bcopy(ipp->ipp_rthdrdstopts, cp, ipp->ipp_rthdrdstoptslen);
4561 		cp += ipp->ipp_rthdrdstoptslen;
4562 	}
4563 	/*
4564 	 * Routing header next
4565 	 */
4566 	if (ipp->ipp_fields & IPPF_RTHDR) {
4567 		ip6_rthdr_t *rt = (ip6_rthdr_t *)cp;
4568 
4569 		*nxthdr_ptr = IPPROTO_ROUTING;
4570 		nxthdr_ptr = &rt->ip6r_nxt;
4571 
4572 		bcopy(ipp->ipp_rthdr, cp, ipp->ipp_rthdrlen);
4573 		cp += ipp->ipp_rthdrlen;
4574 	}
4575 	/*
4576 	 * Do ultimate destination options
4577 	 */
4578 	if (ipp->ipp_fields & IPPF_DSTOPTS) {
4579 		ip6_dest_t *dest = (ip6_dest_t *)cp;
4580 
4581 		*nxthdr_ptr = IPPROTO_DSTOPTS;
4582 		nxthdr_ptr = &dest->ip6d_nxt;
4583 
4584 		bcopy(ipp->ipp_dstopts, cp, ipp->ipp_dstoptslen);
4585 		cp += ipp->ipp_dstoptslen;
4586 	}
4587 	/*
4588 	 * Now set the last header pointer to the proto passed in
4589 	 */
4590 	*nxthdr_ptr = protocol;
4591 	ASSERT((int)(cp - buf) == buf_len);
4592 }
4593 
4594 /*
4595  * Return a pointer to the routing header extension header
4596  * in the IPv6 header(s) chain passed in.
4597  * If none found, return NULL
4598  * Assumes that all extension headers are in same mblk as the v6 header
4599  */
4600 ip6_rthdr_t *
4601 ip_find_rthdr_v6(ip6_t *ip6h, uint8_t *endptr)
4602 {
4603 	ip6_dest_t	*desthdr;
4604 	ip6_frag_t	*fraghdr;
4605 	uint_t		hdrlen;
4606 	uint8_t		nexthdr;
4607 	uint8_t		*ptr = (uint8_t *)&ip6h[1];
4608 
4609 	if (ip6h->ip6_nxt == IPPROTO_ROUTING)
4610 		return ((ip6_rthdr_t *)ptr);
4611 
4612 	/*
4613 	 * The routing header will precede all extension headers
4614 	 * other than the hop-by-hop and destination options
4615 	 * extension headers, so if we see anything other than those,
4616 	 * we're done and didn't find it.
4617 	 * We could see a destination options header alone but no
4618 	 * routing header, in which case we'll return NULL as soon as
4619 	 * we see anything after that.
4620 	 * Hop-by-hop and destination option headers are identical,
4621 	 * so we can use either one we want as a template.
4622 	 */
4623 	nexthdr = ip6h->ip6_nxt;
4624 	while (ptr < endptr) {
4625 		/* Is there enough left for len + nexthdr? */
4626 		if (ptr + MIN_EHDR_LEN > endptr)
4627 			return (NULL);
4628 
4629 		switch (nexthdr) {
4630 		case IPPROTO_HOPOPTS:
4631 		case IPPROTO_DSTOPTS:
4632 			/* Assumes the headers are identical for hbh and dst */
4633 			desthdr = (ip6_dest_t *)ptr;
4634 			hdrlen = 8 * (desthdr->ip6d_len + 1);
4635 			nexthdr = desthdr->ip6d_nxt;
4636 			break;
4637 
4638 		case IPPROTO_ROUTING:
4639 			return ((ip6_rthdr_t *)ptr);
4640 
4641 		case IPPROTO_FRAGMENT:
4642 			fraghdr = (ip6_frag_t *)ptr;
4643 			hdrlen = sizeof (ip6_frag_t);
4644 			nexthdr = fraghdr->ip6f_nxt;
4645 			break;
4646 
4647 		default:
4648 			return (NULL);
4649 		}
4650 		ptr += hdrlen;
4651 	}
4652 	return (NULL);
4653 }
4654 
4655 /*
4656  * Called for source-routed packets originating on this node.
4657  * Manipulates the original routing header by moving every entry up
4658  * one slot, placing the first entry in the v6 header's v6_dst field,
4659  * and placing the ultimate destination in the routing header's last
4660  * slot.
4661  *
4662  * Returns the checksum diference between the ultimate destination
4663  * (last hop in the routing header when the packet is sent) and
4664  * the first hop (ip6_dst when the packet is sent)
4665  */
4666 /* ARGSUSED2 */
4667 uint32_t
4668 ip_massage_options_v6(ip6_t *ip6h, ip6_rthdr_t *rth, netstack_t *ns)
4669 {
4670 	uint_t		numaddr;
4671 	uint_t		i;
4672 	in6_addr_t	*addrptr;
4673 	in6_addr_t	tmp;
4674 	ip6_rthdr0_t	*rthdr = (ip6_rthdr0_t *)rth;
4675 	uint32_t	cksm;
4676 	uint32_t	addrsum = 0;
4677 	uint16_t	*ptr;
4678 
4679 	/*
4680 	 * Perform any processing needed for source routing.
4681 	 * We know that all extension headers will be in the same mblk
4682 	 * as the IPv6 header.
4683 	 */
4684 
4685 	/*
4686 	 * If no segments left in header, or the header length field is zero,
4687 	 * don't move hop addresses around;
4688 	 * Checksum difference is zero.
4689 	 */
4690 	if ((rthdr->ip6r0_segleft == 0) || (rthdr->ip6r0_len == 0))
4691 		return (0);
4692 
4693 	ptr = (uint16_t *)&ip6h->ip6_dst;
4694 	cksm = 0;
4695 	for (i = 0; i < (sizeof (in6_addr_t) / sizeof (uint16_t)); i++) {
4696 		cksm += ptr[i];
4697 	}
4698 	cksm = (cksm & 0xFFFF) + (cksm >> 16);
4699 
4700 	/*
4701 	 * Here's where the fun begins - we have to
4702 	 * move all addresses up one spot, take the
4703 	 * first hop and make it our first ip6_dst,
4704 	 * and place the ultimate destination in the
4705 	 * newly-opened last slot.
4706 	 */
4707 	addrptr = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr));
4708 	numaddr = rthdr->ip6r0_len / 2;
4709 	tmp = *addrptr;
4710 	for (i = 0; i < (numaddr - 1); addrptr++, i++) {
4711 		*addrptr = addrptr[1];
4712 	}
4713 	*addrptr = ip6h->ip6_dst;
4714 	ip6h->ip6_dst = tmp;
4715 
4716 	/*
4717 	 * From the checksummed ultimate destination subtract the checksummed
4718 	 * current ip6_dst (the first hop address). Return that number.
4719 	 * (In the v4 case, the second part of this is done in each routine
4720 	 *  that calls ip_massage_options(). We do it all in this one place
4721 	 *  for v6).
4722 	 */
4723 	ptr = (uint16_t *)&ip6h->ip6_dst;
4724 	for (i = 0; i < (sizeof (in6_addr_t) / sizeof (uint16_t)); i++) {
4725 		addrsum += ptr[i];
4726 	}
4727 	cksm -= ((addrsum >> 16) + (addrsum & 0xFFFF));
4728 	if ((int)cksm < 0)
4729 		cksm--;
4730 	cksm = (cksm & 0xFFFF) + (cksm >> 16);
4731 
4732 	return (cksm);
4733 }
4734 
4735 void
4736 *ip6_kstat_init(netstackid_t stackid, ip6_stat_t *ip6_statisticsp)
4737 {
4738 	kstat_t *ksp;
4739 
4740 	ip6_stat_t template = {
4741 		{ "ip6_udp_fannorm",	KSTAT_DATA_UINT64 },
4742 		{ "ip6_udp_fanmb",	KSTAT_DATA_UINT64 },
4743 		{ "ip6_recv_pullup",		KSTAT_DATA_UINT64 },
4744 		{ "ip6_db_ref",			KSTAT_DATA_UINT64 },
4745 		{ "ip6_notaligned",		KSTAT_DATA_UINT64 },
4746 		{ "ip6_multimblk",		KSTAT_DATA_UINT64 },
4747 		{ "ipsec_proto_ahesp",		KSTAT_DATA_UINT64 },
4748 		{ "ip6_out_sw_cksum",			KSTAT_DATA_UINT64 },
4749 		{ "ip6_out_sw_cksum_bytes",		KSTAT_DATA_UINT64 },
4750 		{ "ip6_in_sw_cksum",			KSTAT_DATA_UINT64 },
4751 		{ "ip6_tcp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
4752 		{ "ip6_tcp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
4753 		{ "ip6_tcp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
4754 		{ "ip6_udp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
4755 		{ "ip6_udp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
4756 		{ "ip6_udp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
4757 	};
4758 	ksp = kstat_create_netstack("ip", 0, "ip6stat", "net",
4759 	    KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t),
4760 	    KSTAT_FLAG_VIRTUAL, stackid);
4761 
4762 	if (ksp == NULL)
4763 		return (NULL);
4764 
4765 	bcopy(&template, ip6_statisticsp, sizeof (template));
4766 	ksp->ks_data = (void *)ip6_statisticsp;
4767 	ksp->ks_private = (void *)(uintptr_t)stackid;
4768 
4769 	kstat_install(ksp);
4770 	return (ksp);
4771 }
4772 
4773 void
4774 ip6_kstat_fini(netstackid_t stackid, kstat_t *ksp)
4775 {
4776 	if (ksp != NULL) {
4777 		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
4778 		kstat_delete_netstack(ksp, stackid);
4779 	}
4780 }
4781 
4782 /*
4783  * The following two functions set and get the value for the
4784  * IPV6_SRC_PREFERENCES socket option.
4785  */
4786 int
4787 ip6_set_src_preferences(ip_xmit_attr_t *ixa, uint32_t prefs)
4788 {
4789 	/*
4790 	 * We only support preferences that are covered by
4791 	 * IPV6_PREFER_SRC_MASK.
4792 	 */
4793 	if (prefs & ~IPV6_PREFER_SRC_MASK)
4794 		return (EINVAL);
4795 
4796 	/*
4797 	 * Look for conflicting preferences or default preferences.  If
4798 	 * both bits of a related pair are clear, the application wants the
4799 	 * system's default value for that pair.  Both bits in a pair can't
4800 	 * be set.
4801 	 */
4802 	if ((prefs & IPV6_PREFER_SRC_MIPMASK) == 0) {
4803 		prefs |= IPV6_PREFER_SRC_MIPDEFAULT;
4804 	} else if ((prefs & IPV6_PREFER_SRC_MIPMASK) ==
4805 	    IPV6_PREFER_SRC_MIPMASK) {
4806 		return (EINVAL);
4807 	}
4808 	if ((prefs & IPV6_PREFER_SRC_TMPMASK) == 0) {
4809 		prefs |= IPV6_PREFER_SRC_TMPDEFAULT;
4810 	} else if ((prefs & IPV6_PREFER_SRC_TMPMASK) ==
4811 	    IPV6_PREFER_SRC_TMPMASK) {
4812 		return (EINVAL);
4813 	}
4814 	if ((prefs & IPV6_PREFER_SRC_CGAMASK) == 0) {
4815 		prefs |= IPV6_PREFER_SRC_CGADEFAULT;
4816 	} else if ((prefs & IPV6_PREFER_SRC_CGAMASK) ==
4817 	    IPV6_PREFER_SRC_CGAMASK) {
4818 		return (EINVAL);
4819 	}
4820 
4821 	ixa->ixa_src_preferences = prefs;
4822 	return (0);
4823 }
4824 
4825 size_t
4826 ip6_get_src_preferences(ip_xmit_attr_t *ixa, uint32_t *val)
4827 {
4828 	*val = ixa->ixa_src_preferences;
4829 	return (sizeof (ixa->ixa_src_preferences));
4830 }
4831 
4832 /*
4833  * Get the size of the IP options (including the IP headers size)
4834  * without including the AH header's size. If till_ah is B_FALSE,
4835  * and if AH header is present, dest options beyond AH header will
4836  * also be included in the returned size.
4837  */
4838 int
4839 ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah)
4840 {
4841 	ip6_t *ip6h;
4842 	uint8_t nexthdr;
4843 	uint8_t *whereptr;
4844 	ip6_hbh_t *hbhhdr;
4845 	ip6_dest_t *dsthdr;
4846 	ip6_rthdr_t *rthdr;
4847 	int ehdrlen;
4848 	int size;
4849 	ah_t *ah;
4850 
4851 	ip6h = (ip6_t *)mp->b_rptr;
4852 	size = IPV6_HDR_LEN;
4853 	nexthdr = ip6h->ip6_nxt;
4854 	whereptr = (uint8_t *)&ip6h[1];
4855 	for (;;) {
4856 		/* Assume IP has already stripped it */
4857 		ASSERT(nexthdr != IPPROTO_FRAGMENT);
4858 		switch (nexthdr) {
4859 		case IPPROTO_HOPOPTS:
4860 			hbhhdr = (ip6_hbh_t *)whereptr;
4861 			nexthdr = hbhhdr->ip6h_nxt;
4862 			ehdrlen = 8 * (hbhhdr->ip6h_len + 1);
4863 			break;
4864 		case IPPROTO_DSTOPTS:
4865 			dsthdr = (ip6_dest_t *)whereptr;
4866 			nexthdr = dsthdr->ip6d_nxt;
4867 			ehdrlen = 8 * (dsthdr->ip6d_len + 1);
4868 			break;
4869 		case IPPROTO_ROUTING:
4870 			rthdr = (ip6_rthdr_t *)whereptr;
4871 			nexthdr = rthdr->ip6r_nxt;
4872 			ehdrlen = 8 * (rthdr->ip6r_len + 1);
4873 			break;
4874 		default :
4875 			if (till_ah) {
4876 				ASSERT(nexthdr == IPPROTO_AH);
4877 				return (size);
4878 			}
4879 			/*
4880 			 * If we don't have a AH header to traverse,
4881 			 * return now. This happens normally for
4882 			 * outbound datagrams where we have not inserted
4883 			 * the AH header.
4884 			 */
4885 			if (nexthdr != IPPROTO_AH) {
4886 				return (size);
4887 			}
4888 
4889 			/*
4890 			 * We don't include the AH header's size
4891 			 * to be symmetrical with other cases where
4892 			 * we either don't have a AH header (outbound)
4893 			 * or peek into the AH header yet (inbound and
4894 			 * not pulled up yet).
4895 			 */
4896 			ah = (ah_t *)whereptr;
4897 			nexthdr = ah->ah_nexthdr;
4898 			ehdrlen = (ah->ah_length << 2) + 8;
4899 
4900 			if (nexthdr == IPPROTO_DSTOPTS) {
4901 				if (whereptr + ehdrlen >= mp->b_wptr) {
4902 					/*
4903 					 * The destination options header
4904 					 * is not part of the first mblk.
4905 					 */
4906 					whereptr = mp->b_cont->b_rptr;
4907 				} else {
4908 					whereptr += ehdrlen;
4909 				}
4910 
4911 				dsthdr = (ip6_dest_t *)whereptr;
4912 				ehdrlen = 8 * (dsthdr->ip6d_len + 1);
4913 				size += ehdrlen;
4914 			}
4915 			return (size);
4916 		}
4917 		whereptr += ehdrlen;
4918 		size += ehdrlen;
4919 	}
4920 }
4921 
4922 /*
4923  * Utility routine that checks if `v6srcp' is a valid address on underlying
4924  * interface `ill'.  If `ipifp' is non-NULL, it's set to a held ipif
4925  * associated with `v6srcp' on success.  NOTE: if this is not called from
4926  * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the
4927  * group during or after this lookup.
4928  */
4929 boolean_t
4930 ipif_lookup_testaddr_v6(ill_t *ill, const in6_addr_t *v6srcp, ipif_t **ipifp)
4931 {
4932 	ipif_t *ipif;
4933 
4934 
4935 	ipif = ipif_lookup_addr_exact_v6(v6srcp, ill, ill->ill_ipst);
4936 	if (ipif != NULL) {
4937 		if (ipifp != NULL)
4938 			*ipifp = ipif;
4939 		else
4940 			ipif_refrele(ipif);
4941 		return (B_TRUE);
4942 	}
4943 
4944 	if (ip_debug > 2) {
4945 		pr_addr_dbg("ipif_lookup_testaddr_v6: cannot find ipif for "
4946 		    "src %s\n", AF_INET6, v6srcp);
4947 	}
4948 	return (B_FALSE);
4949 }
4950