xref: /illumos-gate/usr/src/uts/common/inet/ip/ip6.c (revision f641a59486e69100969f92a7ec309574f76b238d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 1990 Mentat Inc.
24  * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
25  * Copyright 2019 Joyent, Inc.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/dlpi.h>
31 #include <sys/stropts.h>
32 #include <sys/sysmacros.h>
33 #include <sys/strsun.h>
34 #include <sys/strlog.h>
35 #include <sys/strsubr.h>
36 #define	_SUN_TPI_VERSION	2
37 #include <sys/tihdr.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/cmn_err.h>
41 #include <sys/debug.h>
42 #include <sys/sdt.h>
43 #include <sys/kobj.h>
44 #include <sys/zone.h>
45 #include <sys/neti.h>
46 #include <sys/hook.h>
47 
48 #include <sys/kmem.h>
49 #include <sys/systm.h>
50 #include <sys/param.h>
51 #include <sys/socket.h>
52 #include <sys/vtrace.h>
53 #include <sys/isa_defs.h>
54 #include <sys/atomic.h>
55 #include <sys/policy.h>
56 #include <sys/mac.h>
57 #include <net/if.h>
58 #include <net/if_types.h>
59 #include <net/route.h>
60 #include <net/if_dl.h>
61 #include <sys/sockio.h>
62 #include <netinet/in.h>
63 #include <netinet/ip6.h>
64 #include <netinet/icmp6.h>
65 #include <netinet/sctp.h>
66 
67 #include <inet/common.h>
68 #include <inet/mi.h>
69 #include <inet/optcom.h>
70 #include <inet/mib2.h>
71 #include <inet/nd.h>
72 #include <inet/arp.h>
73 
74 #include <inet/ip.h>
75 #include <inet/ip_impl.h>
76 #include <inet/ip6.h>
77 #include <inet/ip6_asp.h>
78 #include <inet/tcp.h>
79 #include <inet/tcp_impl.h>
80 #include <inet/udp_impl.h>
81 #include <inet/ipp_common.h>
82 
83 #include <inet/ip_multi.h>
84 #include <inet/ip_if.h>
85 #include <inet/ip_ire.h>
86 #include <inet/ip_rts.h>
87 #include <inet/ip_ndp.h>
88 #include <net/pfkeyv2.h>
89 #include <inet/sadb.h>
90 #include <inet/ipsec_impl.h>
91 #include <inet/iptun/iptun_impl.h>
92 #include <inet/sctp_ip.h>
93 #include <sys/pattr.h>
94 #include <inet/ipclassifier.h>
95 #include <inet/ipsecah.h>
96 #include <inet/rawip_impl.h>
97 #include <inet/rts_impl.h>
98 #include <sys/squeue_impl.h>
99 #include <sys/squeue.h>
100 
101 #include <sys/tsol/label.h>
102 #include <sys/tsol/tnet.h>
103 
104 /* Temporary; for CR 6451644 work-around */
105 #include <sys/ethernet.h>
106 
107 /*
108  * Naming conventions:
109  *      These rules should be judiciously applied
110  *	if there is a need to identify something as IPv6 versus IPv4
111  *	IPv6 functions will end with _v6 in the ip module.
112  *	IPv6 functions will end with _ipv6 in the transport modules.
113  *	IPv6 macros:
114  *		Some macros end with _V6; e.g. ILL_FRAG_HASH_V6
115  *		Some macros start with V6_; e.g. V6_OR_V4_INADDR_ANY
116  *		And then there are ..V4_PART_OF_V6.
117  *		The intent is that macros in the ip module end with _V6.
118  *	IPv6 global variables will start with ipv6_
119  *	IPv6 structures will start with ipv6
120  *	IPv6 defined constants should start with IPV6_
121  *		(but then there are NDP_DEFAULT_VERS_PRI_AND_FLOW, etc)
122  */
123 
124 /*
125  * ip6opt_ls is used to enable IPv6 (via /etc/system on TX systems).
126  * We need to do this because we didn't obtain the IP6OPT_LS (0x0a)
127  * from IANA. This mechanism will remain in effect until an official
128  * number is obtained.
129  */
130 uchar_t ip6opt_ls;
131 
132 const in6_addr_t ipv6_all_ones =
133 	{ 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU };
134 const in6_addr_t ipv6_all_zeros = { 0, 0, 0, 0 };
135 
136 #ifdef	_BIG_ENDIAN
137 const in6_addr_t ipv6_unspecified_group = { 0xff000000U, 0, 0, 0 };
138 #else	/* _BIG_ENDIAN */
139 const in6_addr_t ipv6_unspecified_group = { 0x000000ffU, 0, 0, 0 };
140 #endif	/* _BIG_ENDIAN */
141 
142 #ifdef	_BIG_ENDIAN
143 const in6_addr_t ipv6_loopback = { 0, 0, 0, 0x00000001U };
144 #else  /* _BIG_ENDIAN */
145 const in6_addr_t ipv6_loopback = { 0, 0, 0, 0x01000000U };
146 #endif /* _BIG_ENDIAN */
147 
148 #ifdef _BIG_ENDIAN
149 const in6_addr_t ipv6_all_hosts_mcast = { 0xff020000U, 0, 0, 0x00000001U };
150 #else  /* _BIG_ENDIAN */
151 const in6_addr_t ipv6_all_hosts_mcast = { 0x000002ffU, 0, 0, 0x01000000U };
152 #endif /* _BIG_ENDIAN */
153 
154 #ifdef _BIG_ENDIAN
155 const in6_addr_t ipv6_all_rtrs_mcast = { 0xff020000U, 0, 0, 0x00000002U };
156 #else  /* _BIG_ENDIAN */
157 const in6_addr_t ipv6_all_rtrs_mcast = { 0x000002ffU, 0, 0, 0x02000000U };
158 #endif /* _BIG_ENDIAN */
159 
160 #ifdef _BIG_ENDIAN
161 const in6_addr_t ipv6_all_v2rtrs_mcast = { 0xff020000U, 0, 0, 0x00000016U };
162 #else  /* _BIG_ENDIAN */
163 const in6_addr_t ipv6_all_v2rtrs_mcast = { 0x000002ffU, 0, 0, 0x16000000U };
164 #endif /* _BIG_ENDIAN */
165 
166 #ifdef _BIG_ENDIAN
167 const in6_addr_t ipv6_solicited_node_mcast =
168 			{ 0xff020000U, 0, 0x00000001U, 0xff000000U };
169 #else  /* _BIG_ENDIAN */
170 const in6_addr_t ipv6_solicited_node_mcast =
171 			{ 0x000002ffU, 0, 0x01000000U, 0x000000ffU };
172 #endif /* _BIG_ENDIAN */
173 
174 static boolean_t icmp_inbound_verify_v6(mblk_t *, icmp6_t *, ip_recv_attr_t *);
175 static void	icmp_inbound_too_big_v6(icmp6_t *, ip_recv_attr_t *);
176 static void	icmp_pkt_v6(mblk_t *, void *, size_t, const in6_addr_t *,
177     ip_recv_attr_t *);
178 static void	icmp_redirect_v6(mblk_t *, ip6_t *, nd_redirect_t *,
179     ip_recv_attr_t *);
180 static void	icmp_send_redirect_v6(mblk_t *, in6_addr_t *,
181     in6_addr_t *, ip_recv_attr_t *);
182 static void	icmp_send_reply_v6(mblk_t *, ip6_t *, icmp6_t *,
183     ip_recv_attr_t *);
184 static boolean_t	ip_source_routed_v6(ip6_t *, mblk_t *, ip_stack_t *);
185 
186 /*
187  * icmp_inbound_v6 deals with ICMP messages that are handled by IP.
188  * If the ICMP message is consumed by IP, i.e., it should not be delivered
189  * to any IPPROTO_ICMP raw sockets, then it returns NULL.
190  * Likewise, if the ICMP error is misformed (too short, etc), then it
191  * returns NULL. The caller uses this to determine whether or not to send
192  * to raw sockets.
193  *
194  * All error messages are passed to the matching transport stream.
195  *
196  * See comment for icmp_inbound_v4() on how IPsec is handled.
197  */
198 mblk_t *
199 icmp_inbound_v6(mblk_t *mp, ip_recv_attr_t *ira)
200 {
201 	icmp6_t		*icmp6;
202 	ip6_t		*ip6h;		/* Outer header */
203 	int		ip_hdr_length;	/* Outer header length */
204 	boolean_t	interested;
205 	ill_t		*ill = ira->ira_ill;
206 	ip_stack_t	*ipst = ill->ill_ipst;
207 	mblk_t		*mp_ret = NULL;
208 
209 	ip6h = (ip6_t *)mp->b_rptr;
210 
211 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs);
212 
213 	/* Check for Martian packets  */
214 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) {
215 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
216 		ip_drop_input("ipIfStatsInAddrErrors: mcast src", mp, ill);
217 		freemsg(mp);
218 		return (NULL);
219 	}
220 
221 	/* Make sure ira_l2src is set for ndp_input */
222 	if (!(ira->ira_flags & IRAF_L2SRC_SET))
223 		ip_setl2src(mp, ira, ira->ira_rill);
224 
225 	ip_hdr_length = ira->ira_ip_hdr_length;
226 	if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMP6_MINLEN)) {
227 		if (ira->ira_pktlen < (ip_hdr_length + ICMP6_MINLEN)) {
228 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
229 			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
230 			freemsg(mp);
231 			return (NULL);
232 		}
233 		ip6h = ip_pullup(mp, ip_hdr_length + ICMP6_MINLEN, ira);
234 		if (ip6h == NULL) {
235 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
236 			freemsg(mp);
237 			return (NULL);
238 		}
239 	}
240 
241 	icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
242 	DTRACE_PROBE2(icmp__inbound__v6, ip6_t *, ip6h, icmp6_t *, icmp6);
243 	ip2dbg(("icmp_inbound_v6: type %d code %d\n", icmp6->icmp6_type,
244 	    icmp6->icmp6_code));
245 
246 	/*
247 	 * We will set "interested" to "true" if we should pass a copy to
248 	 * the transport i.e., if it is an error message.
249 	 */
250 	interested = !(icmp6->icmp6_type & ICMP6_INFOMSG_MASK);
251 
252 	switch (icmp6->icmp6_type) {
253 	case ICMP6_DST_UNREACH:
254 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInDestUnreachs);
255 		if (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN)
256 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInAdminProhibs);
257 		break;
258 
259 	case ICMP6_TIME_EXCEEDED:
260 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInTimeExcds);
261 		break;
262 
263 	case ICMP6_PARAM_PROB:
264 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInParmProblems);
265 		break;
266 
267 	case ICMP6_PACKET_TOO_BIG:
268 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInPktTooBigs);
269 		break;
270 
271 	case ICMP6_ECHO_REQUEST:
272 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchos);
273 		if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
274 		    !ipst->ips_ipv6_resp_echo_mcast)
275 			break;
276 
277 		/*
278 		 * We must have exclusive use of the mblk to convert it to
279 		 * a response.
280 		 * If not, we copy it.
281 		 */
282 		if (mp->b_datap->db_ref > 1) {
283 			mblk_t	*mp1;
284 
285 			mp1 = copymsg(mp);
286 			if (mp1 == NULL) {
287 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
288 				ip_drop_input("ipIfStatsInDiscards - copymsg",
289 				    mp, ill);
290 				freemsg(mp);
291 				return (NULL);
292 			}
293 			freemsg(mp);
294 			mp = mp1;
295 			ip6h = (ip6_t *)mp->b_rptr;
296 			icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
297 		}
298 
299 		icmp6->icmp6_type = ICMP6_ECHO_REPLY;
300 		icmp_send_reply_v6(mp, ip6h, icmp6, ira);
301 		return (NULL);
302 
303 	case ICMP6_ECHO_REPLY:
304 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchoReplies);
305 		break;
306 
307 	case ND_ROUTER_SOLICIT:
308 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRouterSolicits);
309 		break;
310 
311 	case ND_ROUTER_ADVERT:
312 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRouterAdvertisements);
313 		break;
314 
315 	case ND_NEIGHBOR_SOLICIT:
316 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInNeighborSolicits);
317 		ndp_input(mp, ira);
318 		return (NULL);
319 
320 	case ND_NEIGHBOR_ADVERT:
321 		BUMP_MIB(ill->ill_icmp6_mib,
322 		    ipv6IfIcmpInNeighborAdvertisements);
323 		ndp_input(mp, ira);
324 		return (NULL);
325 
326 	case ND_REDIRECT:
327 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRedirects);
328 
329 		if (ipst->ips_ipv6_ignore_redirect)
330 			break;
331 
332 		/* We now allow a RAW socket to receive this. */
333 		interested = B_TRUE;
334 		break;
335 
336 	/*
337 	 * The next three icmp messages will be handled by MLD.
338 	 * Pass all valid MLD packets up to any process(es)
339 	 * listening on a raw ICMP socket.
340 	 */
341 	case MLD_LISTENER_QUERY:
342 	case MLD_LISTENER_REPORT:
343 	case MLD_LISTENER_REDUCTION:
344 		mp = mld_input(mp, ira);
345 		return (mp);
346 	default:
347 		break;
348 	}
349 	/*
350 	 * See if there is an ICMP client to avoid an extra copymsg/freemsg
351 	 * if there isn't one.
352 	 */
353 	if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_ICMPV6].connf_head != NULL) {
354 		/* If there is an ICMP client and we want one too, copy it. */
355 
356 		if (!interested) {
357 			/* Caller will deliver to RAW sockets */
358 			return (mp);
359 		}
360 		mp_ret = copymsg(mp);
361 		if (mp_ret == NULL) {
362 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
363 			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
364 		}
365 	} else if (!interested) {
366 		/* Neither we nor raw sockets are interested. Drop packet now */
367 		freemsg(mp);
368 		return (NULL);
369 	}
370 
371 	/*
372 	 * ICMP error or redirect packet. Make sure we have enough of
373 	 * the header and that db_ref == 1 since we might end up modifying
374 	 * the packet.
375 	 */
376 	if (mp->b_cont != NULL) {
377 		if (ip_pullup(mp, -1, ira) == NULL) {
378 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
379 			ip_drop_input("ipIfStatsInDiscards - ip_pullup",
380 			    mp, ill);
381 			freemsg(mp);
382 			return (mp_ret);
383 		}
384 	}
385 
386 	if (mp->b_datap->db_ref > 1) {
387 		mblk_t	*mp1;
388 
389 		mp1 = copymsg(mp);
390 		if (mp1 == NULL) {
391 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
392 			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
393 			freemsg(mp);
394 			return (mp_ret);
395 		}
396 		freemsg(mp);
397 		mp = mp1;
398 	}
399 
400 	/*
401 	 * In case mp has changed, verify the message before any further
402 	 * processes.
403 	 */
404 	ip6h = (ip6_t *)mp->b_rptr;
405 	icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
406 	if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
407 		freemsg(mp);
408 		return (mp_ret);
409 	}
410 
411 	switch (icmp6->icmp6_type) {
412 	case ND_REDIRECT:
413 		icmp_redirect_v6(mp, ip6h, (nd_redirect_t *)icmp6, ira);
414 		break;
415 	case ICMP6_PACKET_TOO_BIG:
416 		/* Update DCE and adjust MTU is icmp header if needed */
417 		icmp_inbound_too_big_v6(icmp6, ira);
418 		/* FALLTHROUGH */
419 	default:
420 		icmp_inbound_error_fanout_v6(mp, icmp6, ira);
421 		break;
422 	}
423 
424 	return (mp_ret);
425 }
426 
427 /*
428  * Send an ICMP echo reply.
429  * The caller has already updated the payload part of the packet.
430  * We handle the ICMP checksum, IP source address selection and feed
431  * the packet into ip_output_simple.
432  */
433 static void
434 icmp_send_reply_v6(mblk_t *mp, ip6_t *ip6h, icmp6_t *icmp6,
435     ip_recv_attr_t *ira)
436 {
437 	uint_t		ip_hdr_length = ira->ira_ip_hdr_length;
438 	ill_t		*ill = ira->ira_ill;
439 	ip_stack_t	*ipst = ill->ill_ipst;
440 	ip_xmit_attr_t	ixas;
441 	in6_addr_t	origsrc;
442 
443 	/*
444 	 * Remove any extension headers (do not reverse a source route)
445 	 * and clear the flow id (keep traffic class for now).
446 	 */
447 	if (ip_hdr_length != IPV6_HDR_LEN) {
448 		int	i;
449 
450 		for (i = 0; i < IPV6_HDR_LEN; i++) {
451 			mp->b_rptr[ip_hdr_length - i - 1] =
452 			    mp->b_rptr[IPV6_HDR_LEN - i - 1];
453 		}
454 		mp->b_rptr += (ip_hdr_length - IPV6_HDR_LEN);
455 		ip6h = (ip6_t *)mp->b_rptr;
456 		ip6h->ip6_nxt = IPPROTO_ICMPV6;
457 		i = ntohs(ip6h->ip6_plen);
458 		i -= (ip_hdr_length - IPV6_HDR_LEN);
459 		ip6h->ip6_plen = htons(i);
460 		ip_hdr_length = IPV6_HDR_LEN;
461 		ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == msgdsize(mp));
462 	}
463 	ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
464 
465 	/* Reverse the source and destination addresses. */
466 	origsrc = ip6h->ip6_src;
467 	ip6h->ip6_src = ip6h->ip6_dst;
468 	ip6h->ip6_dst = origsrc;
469 
470 	/* set the hop limit */
471 	ip6h->ip6_hops = ipst->ips_ipv6_def_hops;
472 
473 	/*
474 	 * Prepare for checksum by putting icmp length in the icmp
475 	 * checksum field. The checksum is calculated in ip_output
476 	 */
477 	icmp6->icmp6_cksum = ip6h->ip6_plen;
478 
479 	bzero(&ixas, sizeof (ixas));
480 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
481 	ixas.ixa_zoneid = ira->ira_zoneid;
482 	ixas.ixa_cred = kcred;
483 	ixas.ixa_cpid = NOPID;
484 	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
485 	ixas.ixa_ifindex = 0;
486 	ixas.ixa_ipst = ipst;
487 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
488 
489 	if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
490 		/*
491 		 * This packet should go out the same way as it
492 		 * came in i.e in clear, independent of the IPsec
493 		 * policy for transmitting packets.
494 		 */
495 		ixas.ixa_flags |= IXAF_NO_IPSEC;
496 	} else {
497 		if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) {
498 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
499 			/* Note: mp already consumed and ip_drop_packet done */
500 			return;
501 		}
502 	}
503 
504 	/* Was the destination (now source) link-local? Send out same group */
505 	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
506 		ixas.ixa_flags |= IXAF_SCOPEID_SET;
507 		if (IS_UNDER_IPMP(ill))
508 			ixas.ixa_scopeid = ill_get_upper_ifindex(ill);
509 		else
510 			ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex;
511 	}
512 
513 	if (ira->ira_flags & IRAF_MULTIBROADCAST) {
514 		/*
515 		 * Not one or our addresses (IRE_LOCALs), thus we let
516 		 * ip_output_simple pick the source.
517 		 */
518 		ip6h->ip6_src = ipv6_all_zeros;
519 		ixas.ixa_flags |= IXAF_SET_SOURCE;
520 	}
521 
522 	/* Should we send using dce_pmtu? */
523 	if (ipst->ips_ipv6_icmp_return_pmtu)
524 		ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
525 
526 	(void) ip_output_simple(mp, &ixas);
527 	ixa_cleanup(&ixas);
528 
529 }
530 
531 /*
532  * Verify the ICMP messages for either for ICMP error or redirect packet.
533  * The caller should have fully pulled up the message. If it's a redirect
534  * packet, only basic checks on IP header will be done; otherwise, verify
535  * the packet by looking at the included ULP header.
536  *
537  * Called before icmp_inbound_error_fanout_v6 is called.
538  */
539 static boolean_t
540 icmp_inbound_verify_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira)
541 {
542 	ill_t		*ill = ira->ira_ill;
543 	uint16_t	hdr_length;
544 	uint8_t		*nexthdrp;
545 	uint8_t		nexthdr;
546 	ip_stack_t	*ipst = ill->ill_ipst;
547 	conn_t		*connp;
548 	ip6_t		*ip6h;	/* Inner header */
549 
550 	ip6h = (ip6_t *)&icmp6[1];
551 	if ((uchar_t *)ip6h + IPV6_HDR_LEN > mp->b_wptr)
552 		goto truncated;
553 
554 	if (icmp6->icmp6_type == ND_REDIRECT) {
555 		hdr_length = sizeof (nd_redirect_t);
556 	} else {
557 		if ((IPH_HDR_VERSION(ip6h) != IPV6_VERSION))
558 			goto discard_pkt;
559 		hdr_length = IPV6_HDR_LEN;
560 	}
561 
562 	if ((uchar_t *)ip6h + hdr_length > mp->b_wptr)
563 		goto truncated;
564 
565 	/*
566 	 * Stop here for ICMP_REDIRECT.
567 	 */
568 	if (icmp6->icmp6_type == ND_REDIRECT)
569 		return (B_TRUE);
570 
571 	/*
572 	 * ICMP errors only.
573 	 */
574 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp))
575 		goto discard_pkt;
576 	nexthdr = *nexthdrp;
577 
578 	/* Try to pass the ICMP message to clients who need it */
579 	switch (nexthdr) {
580 	case IPPROTO_UDP:
581 		/*
582 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
583 		 * transport header.
584 		 */
585 		if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
586 		    mp->b_wptr)
587 			goto truncated;
588 		break;
589 	case IPPROTO_TCP: {
590 		tcpha_t		*tcpha;
591 
592 		/*
593 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
594 		 * transport header.
595 		 */
596 		if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
597 		    mp->b_wptr)
598 			goto truncated;
599 
600 		tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length);
601 		/*
602 		 * With IPMP we need to match across group, which we do
603 		 * since we have the upper ill from ira_ill.
604 		 */
605 		connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha, TCPS_LISTEN,
606 		    ill->ill_phyint->phyint_ifindex, ipst);
607 		if (connp == NULL)
608 			goto discard_pkt;
609 
610 		if ((connp->conn_verifyicmp != NULL) &&
611 		    !connp->conn_verifyicmp(connp, tcpha, NULL, icmp6, ira)) {
612 			CONN_DEC_REF(connp);
613 			goto discard_pkt;
614 		}
615 		CONN_DEC_REF(connp);
616 		break;
617 	}
618 	case IPPROTO_SCTP:
619 		/*
620 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
621 		 * transport header.
622 		 */
623 		if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
624 		    mp->b_wptr)
625 			goto truncated;
626 		break;
627 	case IPPROTO_ESP:
628 	case IPPROTO_AH:
629 		break;
630 	case IPPROTO_ENCAP:
631 	case IPPROTO_IPV6: {
632 		/* Look for self-encapsulated packets that caused an error */
633 		ip6_t *in_ip6h;
634 
635 		in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length);
636 		if ((uint8_t *)in_ip6h + (nexthdr == IPPROTO_ENCAP ?
637 		    sizeof (ipha_t) : sizeof (ip6_t)) > mp->b_wptr)
638 			goto truncated;
639 		break;
640 	}
641 	default:
642 		break;
643 	}
644 
645 	return (B_TRUE);
646 
647 discard_pkt:
648 	/* Bogus ICMP error. */
649 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
650 	return (B_FALSE);
651 
652 truncated:
653 	/* We pulled up everthing already. Must be truncated */
654 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
655 	return (B_FALSE);
656 }
657 
658 /*
659  * Process received IPv6 ICMP Packet too big.
660  * The caller is responsible for validating the packet before passing it in
661  * and also to fanout the ICMP error to any matching transport conns. Assumes
662  * the message has been fully pulled up.
663  *
664  * Before getting here, the caller has called icmp_inbound_verify_v6()
665  * that should have verified with ULP to prevent undoing the changes we're
666  * going to make to DCE. For example, TCP might have verified that the packet
667  * which generated error is in the send window.
668  *
669  * In some cases modified this MTU in the ICMP header packet; the caller
670  * should pass to the matching ULP after this returns.
671  */
672 static void
673 icmp_inbound_too_big_v6(icmp6_t *icmp6, ip_recv_attr_t *ira)
674 {
675 	uint32_t	mtu;
676 	dce_t		*dce;
677 	ill_t		*ill = ira->ira_ill;	/* Upper ill if IPMP */
678 	ip_stack_t	*ipst = ill->ill_ipst;
679 	int		old_max_frag;
680 	in6_addr_t	final_dst;
681 	ip6_t		*ip6h;	/* Inner IP header */
682 
683 	/* Caller has already pulled up everything. */
684 	ip6h = (ip6_t *)&icmp6[1];
685 	final_dst = ip_get_dst_v6(ip6h, NULL, NULL);
686 
687 	mtu = ntohl(icmp6->icmp6_mtu);
688 	if (mtu < IPV6_MIN_MTU) {
689 		/*
690 		 * RFC 8021 suggests to ignore messages where mtu is
691 		 * less than the IPv6 minimum.
692 		 */
693 		ip1dbg(("Received mtu less than IPv6 "
694 		    "min mtu %d: %d\n", IPV6_MIN_MTU, mtu));
695 		DTRACE_PROBE1(icmp6__too__small__mtu, uint32_t, mtu);
696 		return;
697 	}
698 
699 	/*
700 	 * For link local destinations matching simply on address is not
701 	 * sufficient. Same link local addresses for different ILL's is
702 	 * possible.
703 	 */
704 	if (IN6_IS_ADDR_LINKSCOPE(&final_dst)) {
705 		dce = dce_lookup_and_add_v6(&final_dst,
706 		    ill->ill_phyint->phyint_ifindex, ipst);
707 	} else {
708 		dce = dce_lookup_and_add_v6(&final_dst, 0, ipst);
709 	}
710 	if (dce == NULL) {
711 		/* Couldn't add a unique one - ENOMEM */
712 		if (ip_debug > 2) {
713 			/* ip1dbg */
714 			pr_addr_dbg("icmp_inbound_too_big_v6:"
715 			    "no dce for dst %s\n", AF_INET6,
716 			    &final_dst);
717 		}
718 		return;
719 	}
720 
721 	mutex_enter(&dce->dce_lock);
722 	if (dce->dce_flags & DCEF_PMTU)
723 		old_max_frag = dce->dce_pmtu;
724 	else if (IN6_IS_ADDR_MULTICAST(&final_dst))
725 		old_max_frag = ill->ill_mc_mtu;
726 	else
727 		old_max_frag = ill->ill_mtu;
728 
729 	ip1dbg(("Received mtu from router: %d\n", mtu));
730 	DTRACE_PROBE1(icmp6__received__mtu, uint32_t, mtu);
731 	dce->dce_pmtu = MIN(old_max_frag, mtu);
732 	icmp6->icmp6_mtu = htonl(dce->dce_pmtu);
733 
734 	/* We now have a PMTU for sure */
735 	dce->dce_flags |= DCEF_PMTU;
736 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
737 
738 	mutex_exit(&dce->dce_lock);
739 	/*
740 	 * After dropping the lock the new value is visible to everyone.
741 	 * Then we bump the generation number so any cached values reinspect
742 	 * the dce_t.
743 	 */
744 	dce_increment_generation(dce);
745 	dce_refrele(dce);
746 }
747 
748 /*
749  * Fanout received ICMPv6 error packets to the transports.
750  * Assumes the IPv6 plus ICMPv6 headers have been pulled up but nothing else.
751  *
752  * The caller must have called icmp_inbound_verify_v6.
753  */
754 void
755 icmp_inbound_error_fanout_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira)
756 {
757 	uint16_t	*up;	/* Pointer to ports in ULP header */
758 	uint32_t	ports;	/* reversed ports for fanout */
759 	ip6_t		rip6h;	/* With reversed addresses */
760 	ip6_t		*ip6h;	/* Inner IP header */
761 	uint16_t	hdr_length; /* Inner IP header length */
762 	uint8_t		*nexthdrp;
763 	uint8_t		nexthdr;
764 	tcpha_t		*tcpha;
765 	conn_t		*connp;
766 	ill_t		*ill = ira->ira_ill;	/* Upper in the case of IPMP */
767 	ip_stack_t	*ipst = ill->ill_ipst;
768 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
769 
770 	/* Caller has already pulled up everything. */
771 	ip6h = (ip6_t *)&icmp6[1];
772 	ASSERT(mp->b_cont == NULL);
773 	ASSERT((uchar_t *)&ip6h[1] <= mp->b_wptr);
774 
775 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp))
776 		goto drop_pkt;
777 	nexthdr = *nexthdrp;
778 	ira->ira_protocol = nexthdr;
779 
780 	/*
781 	 * We need a separate IP header with the source and destination
782 	 * addresses reversed to do fanout/classification because the ip6h in
783 	 * the ICMPv6 error is in the form we sent it out.
784 	 */
785 	rip6h.ip6_src = ip6h->ip6_dst;
786 	rip6h.ip6_dst = ip6h->ip6_src;
787 	rip6h.ip6_nxt = nexthdr;
788 
789 	/* Try to pass the ICMP message to clients who need it */
790 	switch (nexthdr) {
791 	case IPPROTO_UDP: {
792 		/* Attempt to find a client stream based on port. */
793 		up = (uint16_t *)((uchar_t *)ip6h + hdr_length);
794 
795 		/* Note that we send error to all matches. */
796 		ira->ira_flags |= IRAF_ICMP_ERROR;
797 		ip_fanout_udp_multi_v6(mp, &rip6h, up[0], up[1], ira);
798 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
799 		return;
800 	}
801 	case IPPROTO_TCP: {
802 		/*
803 		 * Attempt to find a client stream based on port.
804 		 * Note that we do a reverse lookup since the header is
805 		 * in the form we sent it out.
806 		 */
807 		tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length);
808 		/*
809 		 * With IPMP we need to match across group, which we do
810 		 * since we have the upper ill from ira_ill.
811 		 */
812 		connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha,
813 		    TCPS_LISTEN, ill->ill_phyint->phyint_ifindex, ipst);
814 		if (connp == NULL) {
815 			goto drop_pkt;
816 		}
817 
818 		if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) ||
819 		    (ira->ira_flags & IRAF_IPSEC_SECURE)) {
820 			mp = ipsec_check_inbound_policy(mp, connp,
821 			    NULL, ip6h, ira);
822 			if (mp == NULL) {
823 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
824 				/* Note that mp is NULL */
825 				ip_drop_input("ipIfStatsInDiscards", mp, ill);
826 				CONN_DEC_REF(connp);
827 				return;
828 			}
829 		}
830 
831 		ira->ira_flags |= IRAF_ICMP_ERROR;
832 		if (IPCL_IS_TCP(connp)) {
833 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
834 			    connp->conn_recvicmp, connp, ira, SQ_FILL,
835 			    SQTAG_TCP6_INPUT_ICMP_ERR);
836 		} else {
837 			/* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
838 			ill_t *rill = ira->ira_rill;
839 
840 			ira->ira_ill = ira->ira_rill = NULL;
841 			(connp->conn_recv)(connp, mp, NULL, ira);
842 			CONN_DEC_REF(connp);
843 			ira->ira_ill = ill;
844 			ira->ira_rill = rill;
845 		}
846 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
847 		return;
848 
849 	}
850 	case IPPROTO_SCTP:
851 		up = (uint16_t *)((uchar_t *)ip6h + hdr_length);
852 		/* Find a SCTP client stream for this packet. */
853 		((uint16_t *)&ports)[0] = up[1];
854 		((uint16_t *)&ports)[1] = up[0];
855 
856 		ira->ira_flags |= IRAF_ICMP_ERROR;
857 		ip_fanout_sctp(mp, NULL, &rip6h, ports, ira);
858 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
859 		return;
860 
861 	case IPPROTO_ESP:
862 	case IPPROTO_AH:
863 		if (!ipsec_loaded(ipss)) {
864 			ip_proto_not_sup(mp, ira);
865 			return;
866 		}
867 
868 		if (nexthdr == IPPROTO_ESP)
869 			mp = ipsecesp_icmp_error(mp, ira);
870 		else
871 			mp = ipsecah_icmp_error(mp, ira);
872 		if (mp == NULL)
873 			return;
874 
875 		/* Just in case ipsec didn't preserve the NULL b_cont */
876 		if (mp->b_cont != NULL) {
877 			if (!pullupmsg(mp, -1))
878 				goto drop_pkt;
879 		}
880 
881 		/*
882 		 * If succesful, the mp has been modified to not include
883 		 * the ESP/AH header so we can fanout to the ULP's icmp
884 		 * error handler.
885 		 */
886 		if (mp->b_wptr - mp->b_rptr < IPV6_HDR_LEN)
887 			goto drop_pkt;
888 
889 		ip6h = (ip6_t *)mp->b_rptr;
890 		/* Don't call hdr_length_v6() unless you have to. */
891 		if (ip6h->ip6_nxt != IPPROTO_ICMPV6)
892 			hdr_length = ip_hdr_length_v6(mp, ip6h);
893 		else
894 			hdr_length = IPV6_HDR_LEN;
895 
896 		/* Verify the modified message before any further processes. */
897 		icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
898 		if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
899 			freemsg(mp);
900 			return;
901 		}
902 
903 		icmp_inbound_error_fanout_v6(mp, icmp6, ira);
904 		return;
905 
906 	case IPPROTO_IPV6: {
907 		/* Look for self-encapsulated packets that caused an error */
908 		ip6_t *in_ip6h;
909 
910 		in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length);
911 
912 		if (IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_src, &ip6h->ip6_src) &&
913 		    IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_dst, &ip6h->ip6_dst)) {
914 			/*
915 			 * Self-encapsulated case. As in the ipv4 case,
916 			 * we need to strip the 2nd IP header. Since mp
917 			 * is already pulled-up, we can simply bcopy
918 			 * the 3rd header + data over the 2nd header.
919 			 */
920 			uint16_t unused_len;
921 
922 			/*
923 			 * Make sure we don't do recursion more than once.
924 			 */
925 			if (!ip_hdr_length_nexthdr_v6(mp, in_ip6h,
926 			    &unused_len, &nexthdrp) ||
927 			    *nexthdrp == IPPROTO_IPV6) {
928 				goto drop_pkt;
929 			}
930 
931 			/*
932 			 * Copy the 3rd header + remaining data on top
933 			 * of the 2nd header.
934 			 */
935 			bcopy(in_ip6h, ip6h, mp->b_wptr - (uchar_t *)in_ip6h);
936 
937 			/*
938 			 * Subtract length of the 2nd header.
939 			 */
940 			mp->b_wptr -= hdr_length;
941 
942 			ip6h = (ip6_t *)mp->b_rptr;
943 			/* Don't call hdr_length_v6() unless you have to. */
944 			if (ip6h->ip6_nxt != IPPROTO_ICMPV6)
945 				hdr_length = ip_hdr_length_v6(mp, ip6h);
946 			else
947 				hdr_length = IPV6_HDR_LEN;
948 
949 			/*
950 			 * Verify the modified message before any further
951 			 * processes.
952 			 */
953 			icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
954 			if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
955 				freemsg(mp);
956 				return;
957 			}
958 
959 			/*
960 			 * Now recurse, and see what I _really_ should be
961 			 * doing here.
962 			 */
963 			icmp_inbound_error_fanout_v6(mp, icmp6, ira);
964 			return;
965 		}
966 	}
967 	/* FALLTHROUGH */
968 	case IPPROTO_ENCAP:
969 		if ((connp = ipcl_iptun_classify_v6(&rip6h.ip6_src,
970 		    &rip6h.ip6_dst, ipst)) != NULL) {
971 			ira->ira_flags |= IRAF_ICMP_ERROR;
972 			connp->conn_recvicmp(connp, mp, NULL, ira);
973 			CONN_DEC_REF(connp);
974 			ira->ira_flags &= ~IRAF_ICMP_ERROR;
975 			return;
976 		}
977 		/*
978 		 * No IP tunnel is interested, fallthrough and see
979 		 * if a raw socket will want it.
980 		 */
981 		/* FALLTHROUGH */
982 	default:
983 		ira->ira_flags |= IRAF_ICMP_ERROR;
984 		ASSERT(ira->ira_protocol == nexthdr);
985 		ip_fanout_proto_v6(mp, &rip6h, ira);
986 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
987 		return;
988 	}
989 	/* NOTREACHED */
990 drop_pkt:
991 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
992 	ip1dbg(("icmp_inbound_error_fanout_v6: drop pkt\n"));
993 	freemsg(mp);
994 }
995 
996 /*
997  * Process received IPv6 ICMP Redirect messages.
998  * Assumes the caller has verified that the headers are in the pulled up mblk.
999  * Consumes mp.
1000  */
1001 /* ARGSUSED */
1002 static void
1003 icmp_redirect_v6(mblk_t *mp, ip6_t *ip6h, nd_redirect_t *rd,
1004     ip_recv_attr_t *ira)
1005 {
1006 	ire_t		*ire, *nire;
1007 	ire_t		*prev_ire = NULL;
1008 	ire_t		*redir_ire;
1009 	in6_addr_t	*src, *dst, *gateway;
1010 	nd_opt_hdr_t	*opt;
1011 	nce_t		*nce;
1012 	int		ncec_flags = 0;
1013 	int		err = 0;
1014 	boolean_t	redirect_to_router = B_FALSE;
1015 	int		len;
1016 	int		optlen;
1017 	ill_t		*ill = ira->ira_rill;
1018 	ill_t		*rill = ira->ira_rill;
1019 	ip_stack_t	*ipst = ill->ill_ipst;
1020 
1021 	/*
1022 	 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
1023 	 * and make it be the IPMP upper so avoid being confused by a packet
1024 	 * addressed to a unicast address on a different ill.
1025 	 */
1026 	if (IS_UNDER_IPMP(rill)) {
1027 		rill = ipmp_ill_hold_ipmp_ill(rill);
1028 		if (rill == NULL) {
1029 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1030 			ip_drop_input("ipv6IfIcmpInBadRedirects - IPMP ill",
1031 			    mp, ill);
1032 			freemsg(mp);
1033 			return;
1034 		}
1035 		ASSERT(rill != ira->ira_rill);
1036 	}
1037 
1038 	len = mp->b_wptr - (uchar_t *)rd;
1039 	src = &ip6h->ip6_src;
1040 	dst = &rd->nd_rd_dst;
1041 	gateway = &rd->nd_rd_target;
1042 
1043 	/* Verify if it is a valid redirect */
1044 	if (!IN6_IS_ADDR_LINKLOCAL(src) ||
1045 	    (ip6h->ip6_hops != IPV6_MAX_HOPS) ||
1046 	    (rd->nd_rd_code != 0) ||
1047 	    (len < sizeof (nd_redirect_t)) ||
1048 	    (IN6_IS_ADDR_V4MAPPED(dst)) ||
1049 	    (IN6_IS_ADDR_MULTICAST(dst))) {
1050 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1051 		ip_drop_input("ipv6IfIcmpInBadRedirects - addr/len", mp, ill);
1052 		goto fail_redirect;
1053 	}
1054 
1055 	if (!(IN6_IS_ADDR_LINKLOCAL(gateway) ||
1056 	    IN6_ARE_ADDR_EQUAL(gateway, dst))) {
1057 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1058 		ip_drop_input("ipv6IfIcmpInBadRedirects - bad gateway",
1059 		    mp, ill);
1060 		goto fail_redirect;
1061 	}
1062 
1063 	optlen = len - sizeof (nd_redirect_t);
1064 	if (optlen != 0) {
1065 		if (!ndp_verify_optlen((nd_opt_hdr_t *)&rd[1], optlen)) {
1066 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1067 			ip_drop_input("ipv6IfIcmpInBadRedirects - options",
1068 			    mp, ill);
1069 			goto fail_redirect;
1070 		}
1071 	}
1072 
1073 	if (!IN6_ARE_ADDR_EQUAL(gateway, dst)) {
1074 		redirect_to_router = B_TRUE;
1075 		ncec_flags |= NCE_F_ISROUTER;
1076 	} else {
1077 		gateway = dst;	/* Add nce for dst */
1078 	}
1079 
1080 
1081 	/*
1082 	 * Verify that the IP source address of the redirect is
1083 	 * the same as the current first-hop router for the specified
1084 	 * ICMP destination address.
1085 	 * Also, Make sure we had a route for the dest in question and
1086 	 * that route was pointing to the old gateway (the source of the
1087 	 * redirect packet.)
1088 	 * We do longest match and then compare ire_gateway_addr_v6 below.
1089 	 */
1090 	prev_ire = ire_ftable_lookup_v6(dst, 0, 0, 0, rill,
1091 	    ALL_ZONES, NULL, MATCH_IRE_ILL, 0, ipst, NULL);
1092 
1093 	/*
1094 	 * Check that
1095 	 *	the redirect was not from ourselves
1096 	 *	old gateway is still directly reachable
1097 	 */
1098 	if (prev_ire == NULL ||
1099 	    (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) ||
1100 	    (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1101 	    !IN6_ARE_ADDR_EQUAL(src, &prev_ire->ire_gateway_addr_v6)) {
1102 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1103 		ip_drop_input("ipv6IfIcmpInBadRedirects - ire", mp, ill);
1104 		goto fail_redirect;
1105 	}
1106 
1107 	ASSERT(prev_ire->ire_ill != NULL);
1108 	if (prev_ire->ire_ill->ill_flags & ILLF_NONUD)
1109 		ncec_flags |= NCE_F_NONUD;
1110 
1111 	opt = (nd_opt_hdr_t *)&rd[1];
1112 	opt = ndp_get_option(opt, optlen, ND_OPT_TARGET_LINKADDR);
1113 	if (opt != NULL) {
1114 		err = nce_lookup_then_add_v6(rill,
1115 		    (uchar_t *)&opt[1],		/* Link layer address */
1116 		    rill->ill_phys_addr_length,
1117 		    gateway, ncec_flags, ND_STALE, &nce);
1118 		switch (err) {
1119 		case 0:
1120 			nce_refrele(nce);
1121 			break;
1122 		case EEXIST:
1123 			/*
1124 			 * Check to see if link layer address has changed and
1125 			 * process the ncec_state accordingly.
1126 			 */
1127 			nce_process(nce->nce_common,
1128 			    (uchar_t *)&opt[1], 0, B_FALSE);
1129 			nce_refrele(nce);
1130 			break;
1131 		default:
1132 			ip1dbg(("icmp_redirect_v6: NCE create failed %d\n",
1133 			    err));
1134 			goto fail_redirect;
1135 		}
1136 	}
1137 	if (redirect_to_router) {
1138 		ASSERT(IN6_IS_ADDR_LINKLOCAL(gateway));
1139 
1140 		/*
1141 		 * Create a Route Association.  This will allow us to remember
1142 		 * a router told us to use the particular gateway.
1143 		 */
1144 		ire = ire_create_v6(
1145 		    dst,
1146 		    &ipv6_all_ones,		/* mask */
1147 		    gateway,			/* gateway addr */
1148 		    IRE_HOST,
1149 		    prev_ire->ire_ill,
1150 		    ALL_ZONES,
1151 		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
1152 		    NULL,
1153 		    ipst);
1154 	} else {
1155 		ipif_t *ipif;
1156 		in6_addr_t gw;
1157 
1158 		/*
1159 		 * Just create an on link entry, i.e. interface route.
1160 		 * The gateway field is our link-local on the ill.
1161 		 */
1162 		mutex_enter(&rill->ill_lock);
1163 		for (ipif = rill->ill_ipif; ipif != NULL;
1164 		    ipif = ipif->ipif_next) {
1165 			if (!(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1166 			    IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))
1167 				break;
1168 		}
1169 		if (ipif == NULL) {
1170 			/* We have no link-local address! */
1171 			mutex_exit(&rill->ill_lock);
1172 			goto fail_redirect;
1173 		}
1174 		gw = ipif->ipif_v6lcl_addr;
1175 		mutex_exit(&rill->ill_lock);
1176 
1177 		ire = ire_create_v6(
1178 		    dst,				/* gateway == dst */
1179 		    &ipv6_all_ones,			/* mask */
1180 		    &gw,				/* gateway addr */
1181 		    rill->ill_net_type,			/* IF_[NO]RESOLVER */
1182 		    prev_ire->ire_ill,
1183 		    ALL_ZONES,
1184 		    (RTF_DYNAMIC | RTF_HOST),
1185 		    NULL,
1186 		    ipst);
1187 	}
1188 
1189 	if (ire == NULL)
1190 		goto fail_redirect;
1191 
1192 	nire = ire_add(ire);
1193 	/* Check if it was a duplicate entry */
1194 	if (nire != NULL && nire != ire) {
1195 		ASSERT(nire->ire_identical_ref > 1);
1196 		ire_delete(nire);
1197 		ire_refrele(nire);
1198 		nire = NULL;
1199 	}
1200 	ire = nire;
1201 	if (ire != NULL) {
1202 		ire_refrele(ire);		/* Held in ire_add */
1203 
1204 		/* tell routing sockets that we received a redirect */
1205 		ip_rts_change_v6(RTM_REDIRECT,
1206 		    &rd->nd_rd_dst,
1207 		    &rd->nd_rd_target,
1208 		    &ipv6_all_ones, 0, src,
1209 		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
1210 		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
1211 
1212 		/*
1213 		 * Delete any existing IRE_HOST type ires for this destination.
1214 		 * This together with the added IRE has the effect of
1215 		 * modifying an existing redirect.
1216 		 */
1217 		redir_ire = ire_ftable_lookup_v6(dst, 0, src, IRE_HOST,
1218 		    prev_ire->ire_ill, ALL_ZONES, NULL,
1219 		    (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst,
1220 		    NULL);
1221 
1222 		if (redir_ire != NULL) {
1223 			if (redir_ire->ire_flags & RTF_DYNAMIC)
1224 				ire_delete(redir_ire);
1225 			ire_refrele(redir_ire);
1226 		}
1227 	}
1228 
1229 	ire_refrele(prev_ire);
1230 	prev_ire = NULL;
1231 
1232 fail_redirect:
1233 	if (prev_ire != NULL)
1234 		ire_refrele(prev_ire);
1235 	freemsg(mp);
1236 	if (rill != ira->ira_rill)
1237 		ill_refrele(rill);
1238 }
1239 
1240 /*
1241  * Build and ship an IPv6 ICMP message using the packet data in mp,
1242  * and the ICMP header pointed to by "stuff".  (May be called as
1243  * writer.)
1244  * Note: assumes that icmp_pkt_err_ok_v6 has been called to
1245  * verify that an icmp error packet can be sent.
1246  *
1247  * If v6src_ptr is set use it as a source. Otherwise select a reasonable
1248  * source address (see above function).
1249  */
1250 static void
1251 icmp_pkt_v6(mblk_t *mp, void *stuff, size_t len,
1252     const in6_addr_t *v6src_ptr, ip_recv_attr_t *ira)
1253 {
1254 	ip6_t		*ip6h;
1255 	in6_addr_t	v6dst;
1256 	size_t		len_needed;
1257 	size_t		msg_len;
1258 	mblk_t		*mp1;
1259 	icmp6_t		*icmp6;
1260 	in6_addr_t	v6src;
1261 	ill_t		*ill = ira->ira_ill;
1262 	ip_stack_t	*ipst = ill->ill_ipst;
1263 	ip_xmit_attr_t	ixas;
1264 
1265 	ip6h = (ip6_t *)mp->b_rptr;
1266 
1267 	bzero(&ixas, sizeof (ixas));
1268 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
1269 	ixas.ixa_zoneid = ira->ira_zoneid;
1270 	ixas.ixa_ifindex = 0;
1271 	ixas.ixa_ipst = ipst;
1272 	ixas.ixa_cred = kcred;
1273 	ixas.ixa_cpid = NOPID;
1274 	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
1275 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1276 
1277 	/*
1278 	 * If the source of the original packet was link-local, then
1279 	 * make sure we send on the same ill (group) as we received it on.
1280 	 */
1281 	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
1282 		ixas.ixa_flags |= IXAF_SCOPEID_SET;
1283 		if (IS_UNDER_IPMP(ill))
1284 			ixas.ixa_scopeid = ill_get_upper_ifindex(ill);
1285 		else
1286 			ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex;
1287 	}
1288 
1289 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
1290 		/*
1291 		 * Apply IPsec based on how IPsec was applied to
1292 		 * the packet that had the error.
1293 		 *
1294 		 * If it was an outbound packet that caused the ICMP
1295 		 * error, then the caller will have setup the IRA
1296 		 * appropriately.
1297 		 */
1298 		if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) {
1299 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
1300 			/* Note: mp already consumed and ip_drop_packet done */
1301 			return;
1302 		}
1303 	} else {
1304 		/*
1305 		 * This is in clear. The icmp message we are building
1306 		 * here should go out in clear, independent of our policy.
1307 		 */
1308 		ixas.ixa_flags |= IXAF_NO_IPSEC;
1309 	}
1310 
1311 	/*
1312 	 * If the caller specified the source we use that.
1313 	 * Otherwise, if the packet was for one of our unicast addresses, make
1314 	 * sure we respond with that as the source. Otherwise
1315 	 * have ip_output_simple pick the source address.
1316 	 */
1317 	if (v6src_ptr != NULL) {
1318 		v6src = *v6src_ptr;
1319 	} else {
1320 		ire_t *ire;
1321 		uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY;
1322 
1323 		if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src) ||
1324 		    IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst))
1325 			match_flags |= MATCH_IRE_ILL;
1326 
1327 		ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0,
1328 		    (IRE_LOCAL|IRE_LOOPBACK), ill, ira->ira_zoneid, NULL,
1329 		    match_flags, 0, ipst, NULL);
1330 		if (ire != NULL) {
1331 			v6src = ip6h->ip6_dst;
1332 			ire_refrele(ire);
1333 		} else {
1334 			v6src = ipv6_all_zeros;
1335 			ixas.ixa_flags |= IXAF_SET_SOURCE;
1336 		}
1337 	}
1338 	v6dst = ip6h->ip6_src;
1339 	len_needed = ipst->ips_ipv6_icmp_return - IPV6_HDR_LEN - len;
1340 	msg_len = msgdsize(mp);
1341 	if (msg_len > len_needed) {
1342 		if (!adjmsg(mp, len_needed - msg_len)) {
1343 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
1344 			freemsg(mp);
1345 			return;
1346 		}
1347 		msg_len = len_needed;
1348 	}
1349 	mp1 = allocb(IPV6_HDR_LEN + len, BPRI_MED);
1350 	if (mp1 == NULL) {
1351 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
1352 		freemsg(mp);
1353 		return;
1354 	}
1355 	mp1->b_cont = mp;
1356 	mp = mp1;
1357 
1358 	/*
1359 	 * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this
1360 	 * node generates be accepted in peace by all on-host destinations.
1361 	 * If we do NOT assume that all on-host destinations trust
1362 	 * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
1363 	 * (Look for IXAF_TRUSTED_ICMP).
1364 	 */
1365 	ixas.ixa_flags |= IXAF_TRUSTED_ICMP;
1366 
1367 	ip6h = (ip6_t *)mp->b_rptr;
1368 	mp1->b_wptr = (uchar_t *)ip6h + (IPV6_HDR_LEN + len);
1369 
1370 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
1371 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
1372 	ip6h->ip6_hops = ipst->ips_ipv6_def_hops;
1373 	ip6h->ip6_dst = v6dst;
1374 	ip6h->ip6_src = v6src;
1375 	msg_len += IPV6_HDR_LEN + len;
1376 	if (msg_len > IP_MAXPACKET + IPV6_HDR_LEN) {
1377 		(void) adjmsg(mp, IP_MAXPACKET + IPV6_HDR_LEN - msg_len);
1378 		msg_len = IP_MAXPACKET + IPV6_HDR_LEN;
1379 	}
1380 	ip6h->ip6_plen = htons((uint16_t)(msgdsize(mp) - IPV6_HDR_LEN));
1381 	icmp6 = (icmp6_t *)&ip6h[1];
1382 	bcopy(stuff, (char *)icmp6, len);
1383 	/*
1384 	 * Prepare for checksum by putting icmp length in the icmp
1385 	 * checksum field. The checksum is calculated in ip_output_wire_v6.
1386 	 */
1387 	icmp6->icmp6_cksum = ip6h->ip6_plen;
1388 	if (icmp6->icmp6_type == ND_REDIRECT) {
1389 		ip6h->ip6_hops = IPV6_MAX_HOPS;
1390 	}
1391 
1392 	(void) ip_output_simple(mp, &ixas);
1393 	ixa_cleanup(&ixas);
1394 }
1395 
1396 /*
1397  * Update the output mib when ICMPv6 packets are sent.
1398  */
1399 void
1400 icmp_update_out_mib_v6(ill_t *ill, icmp6_t *icmp6)
1401 {
1402 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutMsgs);
1403 
1404 	switch (icmp6->icmp6_type) {
1405 	case ICMP6_DST_UNREACH:
1406 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutDestUnreachs);
1407 		if (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN)
1408 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutAdminProhibs);
1409 		break;
1410 
1411 	case ICMP6_TIME_EXCEEDED:
1412 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutTimeExcds);
1413 		break;
1414 
1415 	case ICMP6_PARAM_PROB:
1416 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutParmProblems);
1417 		break;
1418 
1419 	case ICMP6_PACKET_TOO_BIG:
1420 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutPktTooBigs);
1421 		break;
1422 
1423 	case ICMP6_ECHO_REQUEST:
1424 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutEchos);
1425 		break;
1426 
1427 	case ICMP6_ECHO_REPLY:
1428 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutEchoReplies);
1429 		break;
1430 
1431 	case ND_ROUTER_SOLICIT:
1432 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutRouterSolicits);
1433 		break;
1434 
1435 	case ND_ROUTER_ADVERT:
1436 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutRouterAdvertisements);
1437 		break;
1438 
1439 	case ND_NEIGHBOR_SOLICIT:
1440 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutNeighborSolicits);
1441 		break;
1442 
1443 	case ND_NEIGHBOR_ADVERT:
1444 		BUMP_MIB(ill->ill_icmp6_mib,
1445 		    ipv6IfIcmpOutNeighborAdvertisements);
1446 		break;
1447 
1448 	case ND_REDIRECT:
1449 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutRedirects);
1450 		break;
1451 
1452 	case MLD_LISTENER_QUERY:
1453 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutGroupMembQueries);
1454 		break;
1455 
1456 	case MLD_LISTENER_REPORT:
1457 	case MLD_V2_LISTENER_REPORT:
1458 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutGroupMembResponses);
1459 		break;
1460 
1461 	case MLD_LISTENER_REDUCTION:
1462 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutGroupMembReductions);
1463 		break;
1464 	}
1465 }
1466 
1467 /*
1468  * Check if it is ok to send an ICMPv6 error packet in
1469  * response to the IP packet in mp.
1470  * Free the message and return null if no
1471  * ICMP error packet should be sent.
1472  */
1473 static mblk_t *
1474 icmp_pkt_err_ok_v6(mblk_t *mp, boolean_t mcast_ok, ip_recv_attr_t *ira)
1475 {
1476 	ill_t		*ill = ira->ira_ill;
1477 	ip_stack_t	*ipst = ill->ill_ipst;
1478 	boolean_t	llbcast;
1479 	ip6_t		*ip6h;
1480 
1481 	if (!mp)
1482 		return (NULL);
1483 
1484 	/* We view multicast and broadcast as the same.. */
1485 	llbcast = (ira->ira_flags &
1486 	    (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) != 0;
1487 	ip6h = (ip6_t *)mp->b_rptr;
1488 
1489 	/* Check if source address uniquely identifies the host */
1490 
1491 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src) ||
1492 	    IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_src) ||
1493 	    IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
1494 		freemsg(mp);
1495 		return (NULL);
1496 	}
1497 
1498 	if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
1499 		size_t	len_needed = IPV6_HDR_LEN + ICMP6_MINLEN;
1500 		icmp6_t		*icmp6;
1501 
1502 		if (mp->b_wptr - mp->b_rptr < len_needed) {
1503 			if (!pullupmsg(mp, len_needed)) {
1504 				BUMP_MIB(ill->ill_icmp6_mib,
1505 				    ipv6IfIcmpInErrors);
1506 				freemsg(mp);
1507 				return (NULL);
1508 			}
1509 			ip6h = (ip6_t *)mp->b_rptr;
1510 		}
1511 		icmp6 = (icmp6_t *)&ip6h[1];
1512 		/* Explicitly do not generate errors in response to redirects */
1513 		if (ICMP6_IS_ERROR(icmp6->icmp6_type) ||
1514 		    icmp6->icmp6_type == ND_REDIRECT) {
1515 			freemsg(mp);
1516 			return (NULL);
1517 		}
1518 	}
1519 	/*
1520 	 * Check that the destination is not multicast and that the packet
1521 	 * was not sent on link layer broadcast or multicast.  (Exception
1522 	 * is Packet too big message as per the draft - when mcast_ok is set.)
1523 	 */
1524 	if (!mcast_ok &&
1525 	    (llbcast || IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))) {
1526 		freemsg(mp);
1527 		return (NULL);
1528 	}
1529 	/*
1530 	 * If this is a labeled system, then check to see if we're allowed to
1531 	 * send a response to this particular sender.  If not, then just drop.
1532 	 */
1533 	if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
1534 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
1535 		freemsg(mp);
1536 		return (NULL);
1537 	}
1538 
1539 	if (icmp_err_rate_limit(ipst)) {
1540 		/*
1541 		 * Only send ICMP error packets every so often.
1542 		 * This should be done on a per port/source basis,
1543 		 * but for now this will suffice.
1544 		 */
1545 		freemsg(mp);
1546 		return (NULL);
1547 	}
1548 	return (mp);
1549 }
1550 
1551 /*
1552  * Called when a packet was sent out the same link that it arrived on.
1553  * Check if it is ok to send a redirect and then send it.
1554  */
1555 void
1556 ip_send_potential_redirect_v6(mblk_t *mp, ip6_t *ip6h, ire_t *ire,
1557     ip_recv_attr_t *ira)
1558 {
1559 	ill_t		*ill = ira->ira_ill;
1560 	ip_stack_t	*ipst = ill->ill_ipst;
1561 	in6_addr_t	*v6targ;
1562 	ire_t		*src_ire_v6 = NULL;
1563 	mblk_t		*mp1;
1564 	ire_t		*nhop_ire = NULL;
1565 
1566 	/*
1567 	 * Don't send a redirect when forwarding a source
1568 	 * routed packet.
1569 	 */
1570 	if (ip_source_routed_v6(ip6h, mp, ipst))
1571 		return;
1572 
1573 	if (ire->ire_type & IRE_ONLINK) {
1574 		/* Target is directly connected */
1575 		v6targ = &ip6h->ip6_dst;
1576 	} else {
1577 		/* Determine the most specific IRE used to send the packets */
1578 		nhop_ire = ire_nexthop(ire);
1579 		if (nhop_ire == NULL)
1580 			return;
1581 
1582 		/*
1583 		 * We won't send redirects to a router
1584 		 * that doesn't have a link local
1585 		 * address, but will forward.
1586 		 */
1587 		if (!IN6_IS_ADDR_LINKLOCAL(&nhop_ire->ire_addr_v6)) {
1588 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
1589 			ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
1590 			ire_refrele(nhop_ire);
1591 			return;
1592 		}
1593 		v6targ = &nhop_ire->ire_addr_v6;
1594 	}
1595 	src_ire_v6 = ire_ftable_lookup_v6(&ip6h->ip6_src,
1596 	    NULL, NULL, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
1597 	    MATCH_IRE_ILL | MATCH_IRE_TYPE, 0, ipst, NULL);
1598 
1599 	if (src_ire_v6 == NULL) {
1600 		if (nhop_ire != NULL)
1601 			ire_refrele(nhop_ire);
1602 		return;
1603 	}
1604 
1605 	/*
1606 	 * The source is directly connected.
1607 	 */
1608 	mp1 = copymsg(mp);
1609 	if (mp1 != NULL)
1610 		icmp_send_redirect_v6(mp1, v6targ, &ip6h->ip6_dst, ira);
1611 
1612 	if (nhop_ire != NULL)
1613 		ire_refrele(nhop_ire);
1614 	ire_refrele(src_ire_v6);
1615 }
1616 
1617 /*
1618  * Generate an ICMPv6 redirect message.
1619  * Include target link layer address option if it exits.
1620  * Always include redirect header.
1621  */
1622 static void
1623 icmp_send_redirect_v6(mblk_t *mp, in6_addr_t *targetp, in6_addr_t *dest,
1624     ip_recv_attr_t *ira)
1625 {
1626 	nd_redirect_t	*rd;
1627 	nd_opt_rd_hdr_t	*rdh;
1628 	uchar_t		*buf;
1629 	ncec_t		*ncec = NULL;
1630 	nd_opt_hdr_t	*opt;
1631 	int		len;
1632 	int		ll_opt_len = 0;
1633 	int		max_redir_hdr_data_len;
1634 	int		pkt_len;
1635 	in6_addr_t	*srcp;
1636 	ill_t		*ill;
1637 	boolean_t	need_refrele;
1638 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
1639 
1640 	mp = icmp_pkt_err_ok_v6(mp, B_FALSE, ira);
1641 	if (mp == NULL)
1642 		return;
1643 
1644 	if (IS_UNDER_IPMP(ira->ira_ill)) {
1645 		ill = ipmp_ill_hold_ipmp_ill(ira->ira_ill);
1646 		if (ill == NULL) {
1647 			ill = ira->ira_ill;
1648 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1649 			ip_drop_output("no IPMP ill for sending redirect",
1650 			    mp, ill);
1651 			freemsg(mp);
1652 			return;
1653 		}
1654 		need_refrele = B_TRUE;
1655 	} else {
1656 		ill = ira->ira_ill;
1657 		need_refrele = B_FALSE;
1658 	}
1659 
1660 	ncec = ncec_lookup_illgrp_v6(ill, targetp);
1661 	if (ncec != NULL && ncec->ncec_state != ND_INCOMPLETE &&
1662 	    ncec->ncec_lladdr != NULL) {
1663 		ll_opt_len = (sizeof (nd_opt_hdr_t) +
1664 		    ill->ill_phys_addr_length + 7)/8 * 8;
1665 	}
1666 	len = sizeof (nd_redirect_t) + sizeof (nd_opt_rd_hdr_t) + ll_opt_len;
1667 	ASSERT(len % 4 == 0);
1668 	buf = kmem_alloc(len, KM_NOSLEEP);
1669 	if (buf == NULL) {
1670 		if (ncec != NULL)
1671 			ncec_refrele(ncec);
1672 		if (need_refrele)
1673 			ill_refrele(ill);
1674 		freemsg(mp);
1675 		return;
1676 	}
1677 
1678 	rd = (nd_redirect_t *)buf;
1679 	rd->nd_rd_type = (uint8_t)ND_REDIRECT;
1680 	rd->nd_rd_code = 0;
1681 	rd->nd_rd_reserved = 0;
1682 	rd->nd_rd_target = *targetp;
1683 	rd->nd_rd_dst = *dest;
1684 
1685 	opt = (nd_opt_hdr_t *)(buf + sizeof (nd_redirect_t));
1686 	if (ncec != NULL && ll_opt_len != 0) {
1687 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
1688 		opt->nd_opt_len = ll_opt_len/8;
1689 		bcopy((char *)ncec->ncec_lladdr, &opt[1],
1690 		    ill->ill_phys_addr_length);
1691 	}
1692 	if (ncec != NULL)
1693 		ncec_refrele(ncec);
1694 	rdh = (nd_opt_rd_hdr_t *)(buf + sizeof (nd_redirect_t) + ll_opt_len);
1695 	rdh->nd_opt_rh_type = (uint8_t)ND_OPT_REDIRECTED_HEADER;
1696 	/* max_redir_hdr_data_len and nd_opt_rh_len must be multiple of 8 */
1697 	max_redir_hdr_data_len =
1698 	    (ipst->ips_ipv6_icmp_return - IPV6_HDR_LEN - len)/8*8;
1699 	pkt_len = msgdsize(mp);
1700 	/* Make sure mp is 8 byte aligned */
1701 	if (pkt_len > max_redir_hdr_data_len) {
1702 		rdh->nd_opt_rh_len = (max_redir_hdr_data_len +
1703 		    sizeof (nd_opt_rd_hdr_t))/8;
1704 		(void) adjmsg(mp, max_redir_hdr_data_len - pkt_len);
1705 	} else {
1706 		rdh->nd_opt_rh_len = (pkt_len + sizeof (nd_opt_rd_hdr_t))/8;
1707 		(void) adjmsg(mp, -(pkt_len % 8));
1708 	}
1709 	rdh->nd_opt_rh_reserved1 = 0;
1710 	rdh->nd_opt_rh_reserved2 = 0;
1711 	/* ipif_v6lcl_addr contains the link-local source address */
1712 	srcp = &ill->ill_ipif->ipif_v6lcl_addr;
1713 
1714 	/* Redirects sent by router, and router is global zone */
1715 	ASSERT(ira->ira_zoneid == ALL_ZONES);
1716 	ira->ira_zoneid = GLOBAL_ZONEID;
1717 	icmp_pkt_v6(mp, buf, len, srcp, ira);
1718 	kmem_free(buf, len);
1719 	if (need_refrele)
1720 		ill_refrele(ill);
1721 }
1722 
1723 
1724 /* Generate an ICMP time exceeded message.  (May be called as writer.) */
1725 void
1726 icmp_time_exceeded_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok,
1727     ip_recv_attr_t *ira)
1728 {
1729 	icmp6_t	icmp6;
1730 
1731 	mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1732 	if (mp == NULL)
1733 		return;
1734 
1735 	bzero(&icmp6, sizeof (icmp6_t));
1736 	icmp6.icmp6_type = ICMP6_TIME_EXCEEDED;
1737 	icmp6.icmp6_code = code;
1738 	icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1739 }
1740 
1741 /*
1742  * Generate an ICMP unreachable message.
1743  * When called from ip_output side a minimal ip_recv_attr_t needs to be
1744  * constructed by the caller.
1745  */
1746 void
1747 icmp_unreachable_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok,
1748     ip_recv_attr_t *ira)
1749 {
1750 	icmp6_t	icmp6;
1751 
1752 	mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1753 	if (mp == NULL)
1754 		return;
1755 
1756 	bzero(&icmp6, sizeof (icmp6_t));
1757 	icmp6.icmp6_type = ICMP6_DST_UNREACH;
1758 	icmp6.icmp6_code = code;
1759 	icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1760 }
1761 
1762 /*
1763  * Generate an ICMP pkt too big message.
1764  * When called from ip_output side a minimal ip_recv_attr_t needs to be
1765  * constructed by the caller.
1766  */
1767 void
1768 icmp_pkt2big_v6(mblk_t *mp, uint32_t mtu, boolean_t mcast_ok,
1769     ip_recv_attr_t *ira)
1770 {
1771 	icmp6_t	icmp6;
1772 
1773 	mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1774 	if (mp == NULL)
1775 		return;
1776 
1777 	bzero(&icmp6, sizeof (icmp6_t));
1778 	icmp6.icmp6_type = ICMP6_PACKET_TOO_BIG;
1779 	icmp6.icmp6_code = 0;
1780 	icmp6.icmp6_mtu = htonl(mtu);
1781 
1782 	icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1783 }
1784 
1785 /*
1786  * Generate an ICMP parameter problem message. (May be called as writer.)
1787  * 'offset' is the offset from the beginning of the packet in error.
1788  * When called from ip_output side a minimal ip_recv_attr_t needs to be
1789  * constructed by the caller.
1790  */
1791 static void
1792 icmp_param_problem_v6(mblk_t *mp, uint8_t code, uint32_t offset,
1793     boolean_t mcast_ok, ip_recv_attr_t *ira)
1794 {
1795 	icmp6_t	icmp6;
1796 
1797 	mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1798 	if (mp == NULL)
1799 		return;
1800 
1801 	bzero((char *)&icmp6, sizeof (icmp6_t));
1802 	icmp6.icmp6_type = ICMP6_PARAM_PROB;
1803 	icmp6.icmp6_code = code;
1804 	icmp6.icmp6_pptr = htonl(offset);
1805 	icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1806 }
1807 
1808 void
1809 icmp_param_problem_nexthdr_v6(mblk_t *mp, boolean_t mcast_ok,
1810     ip_recv_attr_t *ira)
1811 {
1812 	ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
1813 	uint16_t	hdr_length;
1814 	uint8_t		*nexthdrp;
1815 	uint32_t	offset;
1816 	ill_t		*ill = ira->ira_ill;
1817 
1818 	/* Determine the offset of the bad nexthdr value */
1819 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h,	&hdr_length, &nexthdrp)) {
1820 		/* Malformed packet */
1821 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1822 		ip_drop_input("ipIfStatsInDiscards", mp, ill);
1823 		freemsg(mp);
1824 		return;
1825 	}
1826 
1827 	offset = nexthdrp - mp->b_rptr;
1828 	icmp_param_problem_v6(mp, ICMP6_PARAMPROB_NEXTHEADER, offset,
1829 	    mcast_ok, ira);
1830 }
1831 
1832 /*
1833  * Verify whether or not the IP address is a valid local address.
1834  * Could be a unicast, including one for a down interface.
1835  * If allow_mcbc then a multicast or broadcast address is also
1836  * acceptable.
1837  *
1838  * In the case of a multicast address, however, the
1839  * upper protocol is expected to reset the src address
1840  * to zero when we return IPVL_MCAST so that
1841  * no packets are emitted with multicast address as
1842  * source address.
1843  * The addresses valid for bind are:
1844  *	(1) - in6addr_any
1845  *	(2) - IP address of an UP interface
1846  *	(3) - IP address of a DOWN interface
1847  *	(4) - a multicast address. In this case
1848  *	the conn will only receive packets destined to
1849  *	the specified multicast address. Note: the
1850  *	application still has to issue an
1851  *	IPV6_JOIN_GROUP socket option.
1852  *
1853  * In all the above cases, the bound address must be valid in the current zone.
1854  * When the address is loopback or multicast, there might be many matching IREs
1855  * so bind has to look up based on the zone.
1856  */
1857 ip_laddr_t
1858 ip_laddr_verify_v6(const in6_addr_t *v6src, zoneid_t zoneid,
1859     ip_stack_t *ipst, boolean_t allow_mcbc, uint_t scopeid)
1860 {
1861 	ire_t		*src_ire;
1862 	uint_t		match_flags;
1863 	ill_t		*ill = NULL;
1864 
1865 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6src));
1866 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(v6src));
1867 
1868 	match_flags = MATCH_IRE_ZONEONLY;
1869 	if (scopeid != 0) {
1870 		ill = ill_lookup_on_ifindex(scopeid, B_TRUE, ipst);
1871 		if (ill == NULL)
1872 			return (IPVL_BAD);
1873 		match_flags |= MATCH_IRE_ILL;
1874 	}
1875 
1876 	src_ire = ire_ftable_lookup_v6(v6src, NULL, NULL, 0,
1877 	    ill, zoneid, NULL, match_flags, 0, ipst, NULL);
1878 	if (ill != NULL)
1879 		ill_refrele(ill);
1880 
1881 	/*
1882 	 * If an address other than in6addr_any is requested,
1883 	 * we verify that it is a valid address for bind
1884 	 * Note: Following code is in if-else-if form for
1885 	 * readability compared to a condition check.
1886 	 */
1887 	if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) {
1888 		/*
1889 		 * (2) Bind to address of local UP interface
1890 		 */
1891 		ire_refrele(src_ire);
1892 		return (IPVL_UNICAST_UP);
1893 	} else if (IN6_IS_ADDR_MULTICAST(v6src)) {
1894 		/* (4) bind to multicast address. */
1895 		if (src_ire != NULL)
1896 			ire_refrele(src_ire);
1897 
1898 		/*
1899 		 * Note: caller should take IPV6_MULTICAST_IF
1900 		 * into account when selecting a real source address.
1901 		 */
1902 		if (allow_mcbc)
1903 			return (IPVL_MCAST);
1904 		else
1905 			return (IPVL_BAD);
1906 	} else {
1907 		ipif_t *ipif;
1908 
1909 		/*
1910 		 * (3) Bind to address of local DOWN interface?
1911 		 * (ipif_lookup_addr() looks up all interfaces
1912 		 * but we do not get here for UP interfaces
1913 		 * - case (2) above)
1914 		 */
1915 		if (src_ire != NULL)
1916 			ire_refrele(src_ire);
1917 
1918 		ipif = ipif_lookup_addr_v6(v6src, NULL, zoneid, ipst);
1919 		if (ipif == NULL)
1920 			return (IPVL_BAD);
1921 
1922 		/* Not a useful source? */
1923 		if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) {
1924 			ipif_refrele(ipif);
1925 			return (IPVL_BAD);
1926 		}
1927 		ipif_refrele(ipif);
1928 		return (IPVL_UNICAST_DOWN);
1929 	}
1930 }
1931 
1932 /*
1933  * Verify that both the source and destination addresses are valid.  If
1934  * IPDF_VERIFY_DST is not set, then the destination address may be unreachable,
1935  * i.e. have no route to it.  Protocols like TCP want to verify destination
1936  * reachability, while tunnels do not.
1937  *
1938  * Determine the route, the interface, and (optionally) the source address
1939  * to use to reach a given destination.
1940  * Note that we allow connect to broadcast and multicast addresses when
1941  * IPDF_ALLOW_MCBC is set.
1942  * first_hop and dst_addr are normally the same, but if source routing
1943  * they will differ; in that case the first_hop is what we'll use for the
1944  * routing lookup but the dce and label checks will be done on dst_addr,
1945  *
1946  * If uinfo is set, then we fill in the best available information
1947  * we have for the destination. This is based on (in priority order) any
1948  * metrics and path MTU stored in a dce_t, route metrics, and finally the
1949  * ill_mtu/ill_mc_mtu.
1950  *
1951  * Tsol note: If we have a source route then dst_addr != firsthop. But we
1952  * always do the label check on dst_addr.
1953  *
1954  * Assumes that the caller has set ixa_scopeid for link-local communication.
1955  */
1956 int
1957 ip_set_destination_v6(in6_addr_t *src_addrp, const in6_addr_t *dst_addr,
1958     const in6_addr_t *firsthop, ip_xmit_attr_t *ixa, iulp_t *uinfo,
1959     uint32_t flags, uint_t mac_mode)
1960 {
1961 	ire_t		*ire;
1962 	int		error = 0;
1963 	in6_addr_t	setsrc;				/* RTF_SETSRC */
1964 	zoneid_t	zoneid = ixa->ixa_zoneid;	/* Honors SO_ALLZONES */
1965 	ip_stack_t	*ipst = ixa->ixa_ipst;
1966 	dce_t		*dce;
1967 	uint_t		pmtu;
1968 	uint_t		ifindex;
1969 	uint_t		generation;
1970 	nce_t		*nce;
1971 	ill_t		*ill = NULL;
1972 	boolean_t	multirt = B_FALSE;
1973 
1974 	ASSERT(!IN6_IS_ADDR_V4MAPPED(dst_addr));
1975 
1976 	ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
1977 
1978 	/*
1979 	 * We never send to zero; the ULPs map it to the loopback address.
1980 	 * We can't allow it since we use zero to mean uninitialized in some
1981 	 * places.
1982 	 */
1983 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(dst_addr));
1984 
1985 	if (is_system_labeled()) {
1986 		ts_label_t *tsl = NULL;
1987 
1988 		error = tsol_check_dest(ixa->ixa_tsl, dst_addr, IPV6_VERSION,
1989 		    mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl);
1990 		if (error != 0)
1991 			return (error);
1992 		if (tsl != NULL) {
1993 			/* Update the label */
1994 			ip_xmit_attr_replace_tsl(ixa, tsl);
1995 		}
1996 	}
1997 
1998 	setsrc = ipv6_all_zeros;
1999 	/*
2000 	 * Select a route; For IPMP interfaces, we would only select
2001 	 * a "hidden" route (i.e., going through a specific under_ill)
2002 	 * if ixa_ifindex has been specified.
2003 	 */
2004 	ire = ip_select_route_v6(firsthop, *src_addrp, ixa, &generation,
2005 	    &setsrc, &error, &multirt);
2006 	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
	/* ire has not been cached in ixa yet, so bad_addr releases it. */
2007 	if (error != 0)
2008 		goto bad_addr;
2009 
2010 	/*
2011 	 * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set.
2012 	 * If IPDF_VERIFY_DST is set, the destination must be reachable.
2013 	 * Otherwise the destination needn't be reachable.
2014 	 *
2015 	 * If we match on a reject or black hole, then we've got a
2016 	 * local failure.  May as well fail out the connect() attempt,
2017 	 * since it's never going to succeed.
2018 	 */
2019 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2020 		/*
2021 		 * If we're verifying destination reachability, we always want
2022 		 * to complain here.
2023 		 *
2024 		 * If we're not verifying destination reachability but the
2025 		 * destination has a route, we still want to fail on the
2026 		 * temporary address and broadcast address tests.
2027 		 *
2028 		 * In both cases we let the code continue so some reasonable
2029 		 * information is returned to the caller. That enables the
2030 		 * caller to use (and even cache) the IRE. conn_ip_output will
2031 		 * use the generation mismatch path to check for the unreachable
2032 		 * case thereby avoiding any specific check in the main path.
2033 		 */
2034 		ASSERT(generation == IRE_GENERATION_VERIFY);
2035 		if (flags & IPDF_VERIFY_DST) {
2036 			/*
2037 			 * Set errno but continue to set up ixa_ire to be
2038 			 * the RTF_REJECT|RTF_BLACKHOLE IRE.
2039 			 * That allows callers to use ip_output to get an
2040 			 * ICMP error back.
2041 			 */
2042 			if (!(ire->ire_type & IRE_HOST))
2043 				error = ENETUNREACH;
2044 			else
2045 				error = EHOSTUNREACH;
2046 		}
2047 	}
2048 
	/*
	 * A broadcast/multicast route that the caller did not ask for is
	 * swapped for the reject IRE; processing continues so that ixa is
	 * still fully set up, and ENETUNREACH is returned at the end.
	 */
2049 	if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) &&
2050 	    !(flags & IPDF_ALLOW_MCBC)) {
2051 		ire_refrele(ire);
2052 		ire = ire_reject(ipst, B_FALSE);
2053 		generation = IRE_GENERATION_VERIFY;
2054 		error = ENETUNREACH;
2055 	}
2056 
2057 	/* Cache things */
2058 	if (ixa->ixa_ire != NULL)
2059 		ire_refrele_notr(ixa->ixa_ire);
	/*
	 * Under DEBUG, exchange the reference we hold for one taken with
	 * ire_refhold_notr(), matching the ire_refrele_notr() that is used
	 * above when a cached ixa_ire is replaced.
	 */
2060 #ifdef DEBUG
2061 	ire_refhold_notr(ire);
2062 	ire_refrele(ire);
2063 #endif
2064 	ixa->ixa_ire = ire;
2065 	ixa->ixa_ire_generation = generation;
2066 
2067 	/*
2068 	 * Ensure that ixa_dce is always set any time that ixa_ire is set,
2069 	 * since some callers will send a packet to conn_ip_output() even if
2070 	 * there's an error.
2071 	 */
2072 	ifindex = 0;
2073 	if (IN6_IS_ADDR_LINKSCOPE(dst_addr)) {
2074 		/* If we are creating a DCE we'd better have an ifindex */
		/*
		 * NOTE(review): ill is initialized to NULL and is only
		 * assigned in the IPDF_SELECT_SRC branch further down, so at
		 * this point the else arm is always taken: ifindex stays 0
		 * and IPDF_UNIQUE_DCE is cleared for link-scope destinations.
		 */
2075 		if (ill != NULL)
2076 			ifindex = ill->ill_phyint->phyint_ifindex;
2077 		else
2078 			flags &= ~IPDF_UNIQUE_DCE;
2079 	}
2080 
2081 	if (flags & IPDF_UNIQUE_DCE) {
2082 		/* Fallback to the default dce if allocation fails */
2083 		dce = dce_lookup_and_add_v6(dst_addr, ifindex, ipst);
2084 		if (dce != NULL) {
2085 			generation = dce->dce_generation;
2086 		} else {
2087 			dce = dce_lookup_v6(dst_addr, ifindex, ipst,
2088 			    &generation);
2089 		}
2090 	} else {
2091 		dce = dce_lookup_v6(dst_addr, ifindex, ipst, &generation);
2092 	}
2093 	ASSERT(dce != NULL);
2094 	if (ixa->ixa_dce != NULL)
2095 		dce_refrele_notr(ixa->ixa_dce);
	/* Same reference exchange as for the ire above, DEBUG only. */
2096 #ifdef DEBUG
2097 	dce_refhold_notr(dce);
2098 	dce_refrele(dce);
2099 #endif
2100 	ixa->ixa_dce = dce;
2101 	ixa->ixa_dce_generation = generation;
2102 
2103 
2104 	/*
2105 	 * For multicast with multirt we have a flag passed back from
2106 	 * ire_lookup_multi_ill_v6 since we don't have an IRE for each
2107 	 * possible multicast address.
2108 	 * We also need a flag for multicast since we can't check
2109 	 * whether RTF_MULTIRT is set in ixa_ire for multicast.
2110 	 */
2111 	if (multirt) {
2112 		ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
2113 		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
2114 	} else {
2115 		ixa->ixa_postfragfn = ire->ire_postfragfn;
2116 		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
2117 	}
2118 	if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
2119 		/* Get an nce to cache. */
2120 		nce = ire_to_nce(ire, 0, firsthop);
2121 		if (nce == NULL) {
2122 			/* Allocation failure? */
			/* Keep any old ixa_nce but force re-verification. */
2123 			ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
2124 		} else {
2125 			if (ixa->ixa_nce != NULL)
2126 				nce_refrele(ixa->ixa_nce);
2127 			ixa->ixa_nce = nce;
2128 		}
2129 	}
2130 
2131 	/*
2132 	 * If the source address is a loopback address, the
2133 	 * destination had best be local or multicast.
2134 	 * If we are sending to an IRE_LOCAL using a loopback source then
2135 	 * it had better be the same zoneid.
2136 	 */
2137 	if (IN6_IS_ADDR_LOOPBACK(src_addrp)) {
2138 		if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) {
2139 			ire = NULL;	/* Stored in ixa_ire */
2140 			error = EADDRNOTAVAIL;
2141 			goto bad_addr;
2142 		}
2143 		if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) {
2144 			ire = NULL;	/* Stored in ixa_ire */
2145 			error = EADDRNOTAVAIL;
2146 			goto bad_addr;
2147 		}
2148 	}
2149 
2150 	/*
2151 	 * Does the caller want us to pick a source address?
2152 	 */
2153 	if (flags & IPDF_SELECT_SRC) {
2154 		in6_addr_t	src_addr;
2155 
2156 		/*
2157 		 * We use ire_nexthop_ill to avoid the under ipmp
2158 		 * interface for source address selection. Note that for ipmp
2159 		 * probe packets, ixa_ifindex would have been specified, and
2160 		 * the ip_select_route() invocation would have picked an ire
2161 		 * with ire_ill pointing at an under interface.
2162 		 */
2163 		ill = ire_nexthop_ill(ire);
2164 
2165 		/* If unreachable we have no ill but need some source */
2166 		if (ill == NULL) {
2167 			src_addr = ipv6_loopback;
2168 			/* Make sure we look for a better source address */
2169 			generation = SRC_GENERATION_VERIFY;
2170 		} else {
2171 			error = ip_select_source_v6(ill, &setsrc, dst_addr,
2172 			    zoneid, ipst, B_FALSE, ixa->ixa_src_preferences,
2173 			    &src_addr, &generation, NULL);
2174 			if (error != 0) {
2175 				ire = NULL;	/* Stored in ixa_ire */
2176 				goto bad_addr;
2177 			}
2178 		}
2179 
2180 		/*
2181 		 * We allow the source address to go down.
2182 		 * However, we check that we don't use the loopback address
2183 		 * as a source when sending out on the wire.
2184 		 */
2185 		if (IN6_IS_ADDR_LOOPBACK(&src_addr) &&
2186 		    !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) &&
2187 		    !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
2188 			ire = NULL;	/* Stored in ixa_ire */
2189 			error = EADDRNOTAVAIL;
2190 			goto bad_addr;
2191 		}
2192 
		/* Hand the chosen source back to the caller. */
2193 		*src_addrp = src_addr;
2194 		ixa->ixa_src_generation = generation;
2195 	}
2196 
2197 	/*
2198 	 * Make sure we don't leave an unreachable ixa_nce in place
2199 	 * since ip_select_route is used when we unplumb i.e., remove
2200 	 * references on ixa_ire, ixa_nce, and ixa_dce.
2201 	 */
2202 	nce = ixa->ixa_nce;
2203 	if (nce != NULL && nce->nce_is_condemned) {
2204 		nce_refrele(nce);
2205 		ixa->ixa_nce = NULL;
2206 		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
2207 	}
2208 
2209 	/*
2210 	 * Note that IPv6 multicast supports PMTU discovery unlike IPv4
2211 	 * multicast. But pmtu discovery is only enabled for connected
2212 	 * sockets in general.
2213 	 */
2214 
2215 	/*
2216 	 * Set initial value for fragmentation limit.  Either conn_ip_output
2217 	 * or ULP might update it when there are routing changes.
2218 	 * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT.
2219 	 */
2220 	pmtu = ip_get_pmtu(ixa);
2221 	ixa->ixa_fragsize = pmtu;
2222 	/* Make sure ixa_fragsize and ixa_pmtu remain identical */
2223 	if (ixa->ixa_flags & IXAF_VERIFY_PMTU)
2224 		ixa->ixa_pmtu = pmtu;
2225 
2226 	/*
2227 	 * Extract information useful for some transports.
2228 	 * First we look for DCE metrics. Then we take what we have in
2229 	 * the metrics in the route, where the offlink is used if we have
2230 	 * one.
2231 	 */
2232 	if (uinfo != NULL) {
2233 		bzero(uinfo, sizeof (*uinfo));
2234 
2235 		if (dce->dce_flags & DCEF_UINFO)
2236 			*uinfo = dce->dce_uinfo;
2237 
2238 		rts_merge_metrics(uinfo, &ire->ire_metrics);
2239 
2240 		/* Allow ire_metrics to decrease the path MTU from above */
2241 		if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu)
2242 			uinfo->iulp_mtu = pmtu;
2243 
2244 		uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0;
2245 		uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0;
2246 		uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0;
2247 	}
2248 
	/* ill is non-NULL only if the IPDF_SELECT_SRC branch obtained one. */
2249 	if (ill != NULL)
2250 		ill_refrele(ill);
2251 
	/*
	 * error may be non-zero here (e.g. ENETUNREACH/EHOSTUNREACH set
	 * above) even though ixa_ire and ixa_dce have been filled in.
	 */
2252 	return (error);
2253 
2254 bad_addr:
	/*
	 * ire is non-NULL here only when its reference was not handed over
	 * to ixa_ire; paths that jump here after caching set ire to NULL.
	 */
2255 	if (ire != NULL)
2256 		ire_refrele(ire);
2257 
2258 	if (ill != NULL)
2259 		ill_refrele(ill);
2260 
2261 	/*
2262 	 * Make sure we don't leave an unreachable ixa_nce in place
2263 	 * since ip_select_route is used when we unplumb i.e., remove
2264 	 * references on ixa_ire, ixa_nce, and ixa_dce.
2265 	 */
2266 	nce = ixa->ixa_nce;
2267 	if (nce != NULL && nce->nce_is_condemned) {
2268 		nce_refrele(nce);
2269 		ixa->ixa_nce = NULL;
2270 		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
2271 	}
2272 
2273 	return (error);
2274 }
2275 
2276 /*
2277  * Handle protocols with which IP is less intimate.  There
2278  * can be more than one stream bound to a particular
2279  * protocol.  When this is the case, normally each one gets a copy
2280  * of any incoming packets.
2281  *
2282  * Zones notes:
2283  * Packets will be distributed to conns in all zones. This is really only
2284  * useful for ICMPv6 as only applications in the global zone can create raw
2285  * sockets for other protocols.
2286  */
2287 void
2288 ip_fanout_proto_v6(mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira)
2289 {
2290 	mblk_t		*mp1;
2291 	in6_addr_t	laddr = ip6h->ip6_dst;
2292 	conn_t		*connp, *first_connp, *next_connp;
2293 	connf_t		*connfp;
2294 	ill_t		*ill = ira->ira_ill;
2295 	ip_stack_t	*ipst = ill->ill_ipst;
2296 
2297 	connfp = &ipst->ips_ipcl_proto_fanout_v6[ira->ira_protocol];
2298 	mutex_enter(&connfp->connf_lock);
2299 	connp = connfp->connf_head;
2300 	for (connp = connfp->connf_head; connp != NULL;
2301 	    connp = connp->conn_next) {
2302 		/* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */
2303 		if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) &&
2304 		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2305 		    tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp)))
2306 			break;
2307 	}
2308 
2309 	if (connp == NULL) {
2310 		/*
2311 		 * No one bound to this port.  Is
2312 		 * there a client that wants all
2313 		 * unclaimed datagrams?
2314 		 */
2315 		mutex_exit(&connfp->connf_lock);
2316 		ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB,
2317 		    ICMP6_PARAMPROB_NEXTHEADER, ira);
2318 		return;
2319 	}
2320 
2321 	ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
2322 
2323 	CONN_INC_REF(connp);
2324 	first_connp = connp;
2325 
2326 	/*
2327 	 * XXX: Fix the multiple protocol listeners case. We should not
2328 	 * be walking the conn->conn_next list here.
2329 	 */
2330 	connp = connp->conn_next;
2331 	for (;;) {
2332 		while (connp != NULL) {
2333 			/* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */
2334 			if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) &&
2335 			    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2336 			    tsol_receive_local(mp, &laddr, IPV6_VERSION,
2337 			    ira, connp)))
2338 				break;
2339 			connp = connp->conn_next;
2340 		}
2341 
2342 		if (connp == NULL) {
2343 			/* No more interested clients */
2344 			connp = first_connp;
2345 			break;
2346 		}
2347 		if (((mp1 = dupmsg(mp)) == NULL) &&
2348 		    ((mp1 = copymsg(mp)) == NULL)) {
2349 			/* Memory allocation failed */
2350 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2351 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
2352 			connp = first_connp;
2353 			break;
2354 		}
2355 
2356 		CONN_INC_REF(connp);
2357 		mutex_exit(&connfp->connf_lock);
2358 
2359 		ip_fanout_proto_conn(connp, mp1, NULL, (ip6_t *)mp1->b_rptr,
2360 		    ira);
2361 
2362 		mutex_enter(&connfp->connf_lock);
2363 		/* Follow the next pointer before releasing the conn. */
2364 		next_connp = connp->conn_next;
2365 		CONN_DEC_REF(connp);
2366 		connp = next_connp;
2367 	}
2368 
2369 	/* Last one.  Send it upstream. */
2370 	mutex_exit(&connfp->connf_lock);
2371 
2372 	ip_fanout_proto_conn(connp, mp, NULL, ip6h, ira);
2373 
2374 	CONN_DEC_REF(connp);
2375 }
2376 
2377 /*
2378  * Called when it is conceptually a ULP that would sent the packet
2379  * e.g., port unreachable and nexthdr unknown. Check that the packet
2380  * would have passed the IPsec global policy before sending the error.
2381  *
2382  * Send an ICMP error after patching up the packet appropriately.
2383  * Uses ip_drop_input and bumps the appropriate MIB.
2384  * For ICMP6_PARAMPROB_NEXTHEADER we determine the offset to use.
2385  */
2386 void
2387 ip_fanout_send_icmp_v6(mblk_t *mp, uint_t icmp_type, uint8_t icmp_code,
2388     ip_recv_attr_t *ira)
2389 {
2390 	ip6_t		*ip6h;
2391 	boolean_t	secure;
2392 	ill_t		*ill = ira->ira_ill;
2393 	ip_stack_t	*ipst = ill->ill_ipst;
2394 	netstack_t	*ns = ipst->ips_netstack;
2395 	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2396 
2397 	secure = ira->ira_flags & IRAF_IPSEC_SECURE;
2398 
2399 	/*
2400 	 * We are generating an icmp error for some inbound packet.
2401 	 * Called from all ip_fanout_(udp, tcp, proto) functions.
2402 	 * Before we generate an error, check with global policy
2403 	 * to see whether this is allowed to enter the system. As
2404 	 * there is no "conn", we are checking with global policy.
2405 	 */
2406 	ip6h = (ip6_t *)mp->b_rptr;
2407 	if (secure || ipss->ipsec_inbound_v6_policy_present) {
2408 		mp = ipsec_check_global_policy(mp, NULL, NULL, ip6h, ira, ns);
2409 		if (mp == NULL)
2410 			return;
2411 	}
2412 
2413 	/* We never send errors for protocols that we do implement */
2414 	if (ira->ira_protocol == IPPROTO_ICMPV6) {
2415 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2416 		ip_drop_input("ip_fanout_send_icmp_v6", mp, ill);
2417 		freemsg(mp);
2418 		return;
2419 	}
2420 
2421 	switch (icmp_type) {
2422 	case ICMP6_DST_UNREACH:
2423 		ASSERT(icmp_code == ICMP6_DST_UNREACH_NOPORT);
2424 
2425 		BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
2426 		ip_drop_input("ipIfStatsNoPorts", mp, ill);
2427 
2428 		icmp_unreachable_v6(mp, icmp_code, B_FALSE, ira);
2429 		break;
2430 	case ICMP6_PARAM_PROB:
2431 		ASSERT(icmp_code == ICMP6_PARAMPROB_NEXTHEADER);
2432 
2433 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos);
2434 		ip_drop_input("ipIfStatsInUnknownProtos", mp, ill);
2435 
2436 		/* Let the system determine the offset for this one */
2437 		icmp_param_problem_nexthdr_v6(mp, B_FALSE, ira);
2438 		break;
2439 	default:
2440 #ifdef DEBUG
2441 		panic("ip_fanout_send_icmp_v6: wrong type");
2442 		/*NOTREACHED*/
2443 #else
2444 		freemsg(mp);
2445 		break;
2446 #endif
2447 	}
2448 }
2449 
2450 /*
2451  * Fanout for UDP packets that are multicast or ICMP errors.
2452  * (Unicast fanout is handled in ip_input_v6.)
2453  *
2454  * If SO_REUSEADDR is set all multicast packets
2455  * will be delivered to all conns bound to the same port.
2456  *
2457  * Fanout for UDP packets.
2458  * The caller puts <fport, lport> in the ports parameter.
2459  * ire_type must be IRE_BROADCAST for multicast and broadcast packets.
2460  *
2461  * If SO_REUSEADDR is set all multicast and broadcast packets
2462  * will be delivered to all conns bound to the same port.
2463  *
2464  * Zones notes:
2465  * Earlier in ip_input on a system with multiple shared-IP zones we
2466  * duplicate the multicast and broadcast packets and send them up
2467  * with each explicit zoneid that exists on that ill.
2468  * This means that here we can match the zoneid with SO_ALLZONES being special.
2469  */
2470 void
2471 ip_fanout_udp_multi_v6(mblk_t *mp, ip6_t *ip6h, uint16_t lport, uint16_t fport,
2472     ip_recv_attr_t *ira)
2473 {
2474 	in6_addr_t	laddr;
2475 	conn_t		*connp;
2476 	connf_t		*connfp;
2477 	in6_addr_t	faddr;
2478 	ill_t		*ill = ira->ira_ill;
2479 	ip_stack_t	*ipst = ill->ill_ipst;
2480 
2481 	ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR));
2482 
2483 	laddr = ip6h->ip6_dst;
2484 	faddr = ip6h->ip6_src;
2485 
2486 	/* Attempt to find a client stream based on destination port. */
2487 	connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
2488 	mutex_enter(&connfp->connf_lock);
2489 	connp = connfp->connf_head;
2490 	while (connp != NULL) {
2491 		if ((IPCL_UDP_MATCH_V6(connp, lport, laddr, fport, faddr)) &&
2492 		    conn_wantpacket_v6(connp, ira, ip6h) &&
2493 		    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2494 		    tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp)))
2495 			break;
2496 		connp = connp->conn_next;
2497 	}
2498 
2499 	if (connp == NULL)
2500 		goto notfound;
2501 
2502 	CONN_INC_REF(connp);
2503 
2504 	if (connp->conn_reuseaddr) {
2505 		conn_t		*first_connp = connp;
2506 		conn_t		*next_connp;
2507 		mblk_t		*mp1;
2508 
2509 		connp = connp->conn_next;
2510 		for (;;) {
2511 			while (connp != NULL) {
2512 				if (IPCL_UDP_MATCH_V6(connp, lport, laddr,
2513 				    fport, faddr) &&
2514 				    conn_wantpacket_v6(connp, ira, ip6h) &&
2515 				    (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2516 				    tsol_receive_local(mp, &laddr, IPV6_VERSION,
2517 				    ira, connp)))
2518 					break;
2519 				connp = connp->conn_next;
2520 			}
2521 			if (connp == NULL) {
2522 				/* No more interested clients */
2523 				connp = first_connp;
2524 				break;
2525 			}
2526 			if (((mp1 = dupmsg(mp)) == NULL) &&
2527 			    ((mp1 = copymsg(mp)) == NULL)) {
2528 				/* Memory allocation failed */
2529 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2530 				ip_drop_input("ipIfStatsInDiscards", mp, ill);
2531 				connp = first_connp;
2532 				break;
2533 			}
2534 
2535 			CONN_INC_REF(connp);
2536 			mutex_exit(&connfp->connf_lock);
2537 
2538 			IP6_STAT(ipst, ip6_udp_fanmb);
2539 			ip_fanout_udp_conn(connp, mp1, NULL,
2540 			    (ip6_t *)mp1->b_rptr, ira);
2541 
2542 			mutex_enter(&connfp->connf_lock);
2543 			/* Follow the next pointer before releasing the conn. */
2544 			next_connp = connp->conn_next;
2545 			IP6_STAT(ipst, ip6_udp_fanmb);
2546 			CONN_DEC_REF(connp);
2547 			connp = next_connp;
2548 		}
2549 	}
2550 
2551 	/* Last one.  Send it upstream. */
2552 	mutex_exit(&connfp->connf_lock);
2553 
2554 	IP6_STAT(ipst, ip6_udp_fanmb);
2555 	ip_fanout_udp_conn(connp, mp, NULL, ip6h, ira);
2556 	CONN_DEC_REF(connp);
2557 	return;
2558 
2559 notfound:
2560 	mutex_exit(&connfp->connf_lock);
2561 	/*
2562 	 * No one bound to this port.  Is
2563 	 * there a client that wants all
2564 	 * unclaimed datagrams?
2565 	 */
2566 	if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_UDP].connf_head != NULL) {
2567 		ASSERT(ira->ira_protocol == IPPROTO_UDP);
2568 		ip_fanout_proto_v6(mp, ip6h, ira);
2569 	} else {
2570 		ip_fanout_send_icmp_v6(mp, ICMP6_DST_UNREACH,
2571 		    ICMP6_DST_UNREACH_NOPORT, ira);
2572 	}
2573 }
2574 
2575 /*
2576  * int ip_find_hdr_v6()
2577  *
2578  * This routine is used by the upper layer protocols, iptun, and IPsec:
2579  * - Set extension header pointers to appropriate locations
2580  * - Determine IPv6 header length and return it
2581  * - Return a pointer to the last nexthdr value
2582  *
2583  * The caller must initialize ipp_fields.
2584  * The upper layer protocols normally set label_separate which makes the
2585  * routine put the TX label in ipp_label_v6. If this is not set then
2586  * the hop-by-hop options including the label are placed in ipp_hopopts.
2587  *
2588  * NOTE: If multiple extension headers of the same type are present,
2589  * ip_find_hdr_v6() will set the respective extension header pointers
2590  * to the first one that it encounters in the IPv6 header.  It also
2591  * skips fragment headers.  This routine deals with malformed packets
2592  * of various sorts in which case the returned length is up to the
2593  * malformed part.
 *
 * Only the data between ip6h and mp->b_wptr is examined.
 *
 * Returns IPV6_HDR_LEN plus the lengths of all extension headers that were
 * walked completely.  If nexthdrp is not NULL, *nexthdrp is set to the
 * next-header value that was current when the walk stopped.
2594  */
2595 int
2596 ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, boolean_t label_separate, ip_pkt_t *ipp,
2597     uint8_t *nexthdrp)
2598 {
2599 	uint_t	length, ehdrlen;
2600 	uint8_t nexthdr;
2601 	uint8_t *whereptr, *endptr;
2602 	ip6_dest_t *tmpdstopts;
2603 	ip6_rthdr_t *tmprthdr;
2604 	ip6_hbh_t *tmphopopts;
2605 	ip6_frag_t *tmpfraghdr;
2606 
	/* These fields come straight from the fixed IPv6 header. */
2607 	ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR;
2608 	ipp->ipp_hoplimit = ip6h->ip6_hops;
2609 	ipp->ipp_tclass = IPV6_FLOW_TCLASS(ip6h->ip6_flow);
2610 	ipp->ipp_addr = ip6h->ip6_dst;
2611 
2612 	length = IPV6_HDR_LEN;
2613 	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
2614 	endptr = mp->b_wptr;
2615 
2616 	nexthdr = ip6h->ip6_nxt;
2617 	while (whereptr < endptr) {
2618 		/* Is there enough left for len + nexthdr? */
2619 		if (whereptr + MIN_EHDR_LEN > endptr)
2620 			goto done;
2621 
		/*
		 * Each case computes ehdrlen, stops the walk if the header
		 * would run past endptr, and advances nexthdr.
		 */
2622 		switch (nexthdr) {
2623 		case IPPROTO_HOPOPTS: {
2624 			/* We check for any CIPSO */
2625 			uchar_t *secopt;
2626 			boolean_t hbh_needed;
2627 			uchar_t *after_secopt;
2628 
2629 			tmphopopts = (ip6_hbh_t *)whereptr;
			/* Length in bytes: (len field + 1) units of 8 */
2630 			ehdrlen = 8 * (tmphopopts->ip6h_len + 1);
2631 			if ((uchar_t *)tmphopopts +  ehdrlen > endptr)
2632 				goto done;
2633 			nexthdr = tmphopopts->ip6h_nxt;
2634 
2635 			if (!label_separate) {
				/*
				 * hbh_needed is not set here; with secopt
				 * NULL the else arm below always sets it.
				 */
2636 				secopt = NULL;
2637 				after_secopt = whereptr;
2638 			} else {
2639 				/*
2640 				 * We have dropped packets with bad options in
2641 				 * ip6_input. No need to check return value
2642 				 * here.
2643 				 */
2644 				(void) tsol_find_secopt_v6(whereptr, ehdrlen,
2645 				    &secopt, &after_secopt, &hbh_needed);
2646 			}
2647 			if (secopt != NULL && after_secopt - whereptr > 0) {
2648 				ipp->ipp_fields |= IPPF_LABEL_V6;
2649 				ipp->ipp_label_v6 = secopt;
2650 				ipp->ipp_label_len_v6 = after_secopt - whereptr;
2651 			} else {
				/* No separate label: report the whole header */
2652 				ipp->ipp_label_len_v6 = 0;
2653 				after_secopt = whereptr;
2654 				hbh_needed = B_TRUE;
2655 			}
2656 			/* return only 1st hbh */
2657 			if (hbh_needed && !(ipp->ipp_fields & IPPF_HOPOPTS)) {
2658 				ipp->ipp_fields |= IPPF_HOPOPTS;
2659 				ipp->ipp_hopopts = (ip6_hbh_t *)after_secopt;
2660 				ipp->ipp_hopoptslen = ehdrlen -
2661 				    ipp->ipp_label_len_v6;
2662 			}
2663 			break;
2664 		}
2665 		case IPPROTO_DSTOPTS:
2666 			tmpdstopts = (ip6_dest_t *)whereptr;
2667 			ehdrlen = 8 * (tmpdstopts->ip6d_len + 1);
2668 			if ((uchar_t *)tmpdstopts +  ehdrlen > endptr)
2669 				goto done;
2670 			nexthdr = tmpdstopts->ip6d_nxt;
2671 			/*
2672 			 * ipp_dstopts is set to the destination header after a
2673 			 * routing header.
2674 			 * Assume it is a post-rthdr destination header
2675 			 * and adjust when we find an rthdr.
2676 			 */
2677 			if (!(ipp->ipp_fields & IPPF_DSTOPTS)) {
2678 				ipp->ipp_fields |= IPPF_DSTOPTS;
2679 				ipp->ipp_dstopts = tmpdstopts;
2680 				ipp->ipp_dstoptslen = ehdrlen;
2681 			}
2682 			break;
2683 		case IPPROTO_ROUTING:
2684 			tmprthdr = (ip6_rthdr_t *)whereptr;
2685 			ehdrlen = 8 * (tmprthdr->ip6r_len + 1);
2686 			if ((uchar_t *)tmprthdr +  ehdrlen > endptr)
2687 				goto done;
2688 			nexthdr = tmprthdr->ip6r_nxt;
2689 			/* return only 1st rthdr */
2690 			if (!(ipp->ipp_fields & IPPF_RTHDR)) {
2691 				ipp->ipp_fields |= IPPF_RTHDR;
2692 				ipp->ipp_rthdr = tmprthdr;
2693 				ipp->ipp_rthdrlen = ehdrlen;
2694 			}
2695 			/*
2696 			 * Make any destination header we've seen be a
2697 			 * pre-rthdr destination header.
2698 			 */
2699 			if (ipp->ipp_fields & IPPF_DSTOPTS) {
2700 				ipp->ipp_fields &= ~IPPF_DSTOPTS;
2701 				ipp->ipp_fields |= IPPF_RTHDRDSTOPTS;
2702 				ipp->ipp_rthdrdstopts = ipp->ipp_dstopts;
2703 				ipp->ipp_dstopts = NULL;
2704 				ipp->ipp_rthdrdstoptslen = ipp->ipp_dstoptslen;
2705 				ipp->ipp_dstoptslen = 0;
2706 			}
2707 			break;
2708 		case IPPROTO_FRAGMENT:
2709 			tmpfraghdr = (ip6_frag_t *)whereptr;
			/* The fragment header has a fixed size. */
2710 			ehdrlen = sizeof (ip6_frag_t);
2711 			if ((uchar_t *)tmpfraghdr + ehdrlen > endptr)
2712 				goto done;
2713 			nexthdr = tmpfraghdr->ip6f_nxt;
			/* Record only the first fragment header seen. */
2714 			if (!(ipp->ipp_fields & IPPF_FRAGHDR)) {
2715 				ipp->ipp_fields |= IPPF_FRAGHDR;
2716 				ipp->ipp_fraghdr = tmpfraghdr;
2717 				ipp->ipp_fraghdrlen = ehdrlen;
2718 			}
2719 			break;
2720 		case IPPROTO_NONE:
2721 		default:
			/* Not an extension header handled here; stop. */
2722 			goto done;
2723 		}
		/* Header walked completely: count it and step past it. */
2724 		length += ehdrlen;
2725 		whereptr += ehdrlen;
2726 	}
2727 done:
2728 	if (nexthdrp != NULL)
2729 		*nexthdrp = nexthdr;
2730 	return (length);
2731 }
2732 
2733 /*
2734  * Return the length of the IPv6 related headers (including extension headers)
2735  * Returns a length even if the packet is malformed.
2736  */
2737 uint16_t
2738 ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h)
2739 {
2740 	uint16_t hdr_len;
2741 
2742 	(void) ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_len, NULL);
2743 	return (hdr_len);
2744 }
2745 
2746 /*
2747  * Parse and process any hop-by-hop or destination options.
2748  *
2749  * Assumes that q is an ill read queue so that ICMP errors for link-local
2750  * destinations are sent out the correct interface.
 * (NOTE(review): the current signature has no q argument; the ill is taken
 * from ira->ira_ill. This sentence presumably predates that - confirm.)
 *
 * optptr/optlen describe the option area to walk; hdr_type is the type of
 * the extension header that contains it (IPPROTO_HOPOPTS or
 * IPPROTO_DSTOPTS) and is used to reject options found in the wrong header.
2751  *
2752  * Returns -1 if there was an error and mp has been consumed.
2753  * Returns 0 if no special action is needed.
2754  * Returns 1 if the packet contained a router alert option for this node
2755  * which is verified to be "interesting/known" for our implementation.
2756  *
2757  * XXX Note: In future as more hbh or dest options are defined,
2758  * it may be better to have different routines for hbh and dest
2759  * options as opt_type fields other than IP6OPT_PAD1 and IP6OPT_PADN
2760  * may have same value in different namespaces. Or is it same namespace ??
2761  * Current code checks for each opt_type (other than pads) if it is in
2762  * the expected  nexthdr (hbh or dest)
2763  */
2764 int
2765 ip_process_options_v6(mblk_t *mp, ip6_t *ip6h,
2766     uint8_t *optptr, uint_t optlen, uint8_t hdr_type, ip_recv_attr_t *ira)
2767 {
2768 	uint8_t opt_type;
2769 	uint_t optused = 0;
2770 	int ret = 0;
2771 	const char *errtype;
2772 	ill_t		*ill = ira->ira_ill;
2773 	ip_stack_t	*ipst = ill->ill_ipst;
2774 
	/*
	 * Each pass handles one option and sets optused to the number of
	 * bytes it occupies, which is consumed at the bottom of the loop.
	 */
2775 	while (optlen != 0) {
2776 		opt_type = *optptr;
2777 		if (opt_type == IP6OPT_PAD1) {
			/* Pad1 is a single byte with no length field. */
2778 			optused = 1;
2779 		} else {
			/* Every other option has a type and a length byte. */
2780 			if (optlen < 2)
2781 				goto bad_opt;
2782 			errtype = "malformed";
			/*
			 * ip6opt_ls is not defined in this block; presumably
			 * the run-time configured label option type, which is
			 * why it cannot be a case label - TODO confirm.
			 * The option is only length-checked and stepped over.
			 */
2783 			if (opt_type == ip6opt_ls) {
2784 				optused = 2 + optptr[1];
2785 				if (optused > optlen)
2786 					goto bad_opt;
2787 			} else switch (opt_type) {
2788 			case IP6OPT_PADN:
2789 				/*
2790 				 * Note:We don't verify that (N-2) pad octets
2791 				 * are zero as required by spec. Adhere to
2792 				 * "be liberal in what you accept..." part of
2793 				 * implementation philosophy (RFC791,RFC1122)
2794 				 */
2795 				optused = 2 + optptr[1];
2796 				if (optused > optlen)
2797 					goto bad_opt;
2798 				break;
2799 
2800 			case IP6OPT_JUMBO:
				/* Both paths currently end up at opt_error. */
2801 				if (hdr_type != IPPROTO_HOPOPTS)
2802 					goto opt_error;
2803 				goto opt_error; /* XXX Not implemented! */
2804 
2805 			case IP6OPT_ROUTER_ALERT: {
2806 				struct ip6_opt_router *or;
2807 
2808 				if (hdr_type != IPPROTO_HOPOPTS)
2809 					goto opt_error;
2810 				optused = 2 + optptr[1];
2811 				if (optused > optlen)
2812 					goto bad_opt;
2813 				or = (struct ip6_opt_router *)optptr;
2814 				/* Check total length and alignment */
2815 				if (optused != sizeof (*or) ||
2816 				    ((uintptr_t)or->ip6or_value & 0x1) != 0)
2817 					goto opt_error;
2818 				/* Check value */
				/* Other alert values leave ret unchanged. */
2819 				switch (*((uint16_t *)or->ip6or_value)) {
2820 				case IP6_ALERT_MLD:
2821 				case IP6_ALERT_RSVP:
2822 					ret = 1;
2823 				}
2824 				break;
2825 			}
2826 			case IP6OPT_HOME_ADDRESS: {
2827 				/*
2828 				 * Minimal support for the home address option
2829 				 * (which is required by all IPv6 nodes).
2830 				 * Implement by just swapping the home address
2831 				 * and source address.
2832 				 * XXX Note: this has IPsec implications since
2833 				 * AH needs to take this into account.
2834 				 * Also, when IPsec is used we need to ensure
2835 				 * that this is only processed once
2836 				 * in the received packet (to avoid swapping
2837 				 * back and forth).
2838 				 * NOTE:This option processing is considered
2839 				 * to be unsafe and prone to a denial of
2840 				 * service attack.
2841 				 * The current processing is not safe even with
2842 				 * IPsec secured IP packets. Since the home
2843 				 * address option processing requirement still
2844 				 * is in the IETF draft and in the process of
2845 				 * being redefined for its usage, it has been
2846 				 * decided to turn off the option by default.
2847 				 * If this section of code needs to be executed,
2848 				 * ndd variable ip6_ignore_home_address_opt
2849 				 * should be set to 0 at the user's own risk.
2850 				 */
2851 				struct ip6_opt_home_address *oh;
2852 				in6_addr_t tmp;
2853 
2854 				if (ipst->ips_ipv6_ignore_home_address_opt)
2855 					goto opt_error;
2856 
2857 				if (hdr_type != IPPROTO_DSTOPTS)
2858 					goto opt_error;
2859 				optused = 2 + optptr[1];
2860 				if (optused > optlen)
2861 					goto bad_opt;
2862 
2863 				/*
2864 				 * We did this dest. opt the first time
2865 				 * around (i.e. before AH processing).
2866 				 * If we've done AH... stop now.
2867 				 */
2868 				if ((ira->ira_flags & IRAF_IPSEC_SECURE) &&
2869 				    ira->ira_ipsec_ah_sa != NULL)
2870 					break;
2871 
2872 				oh = (struct ip6_opt_home_address *)optptr;
2873 				/* Check total length and alignment */
2874 				if (optused < sizeof (*oh) ||
2875 				    ((uintptr_t)oh->ip6oh_addr & 0x7) != 0)
2876 					goto opt_error;
2877 				/* Swap ip6_src and the home address */
2878 				tmp = ip6h->ip6_src;
2879 				/* XXX Note: only 8 byte alignment option */
2880 				ip6h->ip6_src = *(in6_addr_t *)oh->ip6oh_addr;
2881 				*(in6_addr_t *)oh->ip6oh_addr = tmp;
2882 				break;
2883 			}
2884 
2885 			case IP6OPT_TUNNEL_LIMIT:
2886 				if (hdr_type != IPPROTO_DSTOPTS) {
2887 					goto opt_error;
2888 				}
2889 				optused = 2 + optptr[1];
2890 				if (optused > optlen) {
2891 					goto bad_opt;
2892 				}
				/* Type, length and one byte of data. */
2893 				if (optused != 3) {
2894 					goto opt_error;
2895 				}
2896 				break;
2897 
2898 			default:
2899 				errtype = "unknown";
2900 				/* FALLTHROUGH */
			/*
			 * opt_error is also the goto target for known
			 * options that are misplaced or malformed; errtype
			 * is then still "malformed". What happens next is
			 * selected by IP6OPT_TYPE(opt_type).
			 */
2901 			opt_error:
2902 				/* Determine which zone should send error */
2903 				switch (IP6OPT_TYPE(opt_type)) {
2904 				case IP6OPT_TYPE_SKIP:
					/* Step over it and keep parsing. */
2905 					optused = 2 + optptr[1];
2906 					if (optused > optlen)
2907 						goto bad_opt;
2908 					ip1dbg(("ip_process_options_v6: %s "
2909 					    "opt 0x%x skipped\n",
2910 					    errtype, opt_type));
2911 					break;
2912 				case IP6OPT_TYPE_DISCARD:
					/* Drop without sending an error. */
2913 					ip1dbg(("ip_process_options_v6: %s "
2914 					    "opt 0x%x; packet dropped\n",
2915 					    errtype, opt_type));
2916 					BUMP_MIB(ill->ill_ip_mib,
2917 					    ipIfStatsInHdrErrors);
2918 					ip_drop_input("ipIfStatsInHdrErrors",
2919 					    mp, ill);
2920 					freemsg(mp);
2921 					return (-1);
2922 				case IP6OPT_TYPE_ICMP:
					/*
					 * Report the option's offset from the
					 * start of the IPv6 header.
					 */
2923 					BUMP_MIB(ill->ill_ip_mib,
2924 					    ipIfStatsInHdrErrors);
2925 					ip_drop_input("ipIfStatsInHdrErrors",
2926 					    mp, ill);
2927 					icmp_param_problem_v6(mp,
2928 					    ICMP6_PARAMPROB_OPTION,
2929 					    (uint32_t)(optptr -
2930 					    (uint8_t *)ip6h),
2931 					    B_FALSE, ira);
2932 					return (-1);
2933 				case IP6OPT_TYPE_FORCEICMP:
					/*
					 * As above but passing B_TRUE, which
					 * presumably forces the error even for
					 * multicast destinations - confirm.
					 */
2934 					BUMP_MIB(ill->ill_ip_mib,
2935 					    ipIfStatsInHdrErrors);
2936 					ip_drop_input("ipIfStatsInHdrErrors",
2937 					    mp, ill);
2938 					icmp_param_problem_v6(mp,
2939 					    ICMP6_PARAMPROB_OPTION,
2940 					    (uint32_t)(optptr -
2941 					    (uint8_t *)ip6h),
2942 					    B_TRUE, ira);
2943 					return (-1);
2944 				default:
2945 					ASSERT(0);
2946 				}
2947 			}
2948 		}
		/* Step past the option that was just handled. */
2949 		optlen -= optused;
2950 		optptr += optused;
2951 	}
2952 	return (ret);
2953 
	/*
	 * The option is truncated or runs past the option area. Unlike the
	 * opt_error paths above, no MIB counter is bumped here.
	 */
2954 bad_opt:
2955 	/* Determine which zone should send error */
2956 	ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
2957 	icmp_param_problem_v6(mp, ICMP6_PARAMPROB_OPTION,
2958 	    (uint32_t)(optptr - (uint8_t *)ip6h),
2959 	    B_FALSE, ira);
2960 	return (-1);
2961 }
2962 
2963 /*
2964  * Process a routing header that is not yet empty.
2965  * Because of RFC 5095, we now reject all route headers.
2966  */
2967 void
2968 ip_process_rthdr(mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth,
2969     ip_recv_attr_t *ira)
2970 {
2971 	ill_t		*ill = ira->ira_ill;
2972 	ip_stack_t	*ipst = ill->ill_ipst;
2973 
2974 	ASSERT(rth->ip6r_segleft != 0);
2975 
2976 	if (!ipst->ips_ipv6_forward_src_routed) {
2977 		/* XXX Check for source routed out same interface? */
2978 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
2979 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
2980 		ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
2981 		freemsg(mp);
2982 		return;
2983 	}
2984 
2985 	ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
2986 	icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER,
2987 	    (uint32_t)((uchar_t *)&rth->ip6r_type - (uchar_t *)ip6h),
2988 	    B_FALSE, ira);
2989 }
2990 
2991 /*
2992  * Read side put procedure for IPv6 module.
2993  */
2994 int
2995 ip_rput_v6(queue_t *q, mblk_t *mp)
2996 {
2997 	ill_t		*ill;
2998 
2999 	ill = (ill_t *)q->q_ptr;
3000 	if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) {
3001 		union DL_primitives *dl;
3002 
3003 		dl = (union DL_primitives *)mp->b_rptr;
3004 		/*
3005 		 * Things are opening or closing - only accept DLPI
3006 		 * ack messages. If the stream is closing and ip_wsrv
3007 		 * has completed, ip_close is out of the qwait, but has
3008 		 * not yet completed qprocsoff. Don't proceed any further
3009 		 * because the ill has been cleaned up and things hanging
3010 		 * off the ill have been freed.
3011 		 */
3012 		if ((mp->b_datap->db_type != M_PCPROTO) ||
3013 		    (dl->dl_primitive == DL_UNITDATA_IND)) {
3014 			inet_freemsg(mp);
3015 			return (0);
3016 		}
3017 	}
3018 	if (DB_TYPE(mp) == M_DATA) {
3019 		struct mac_header_info_s mhi;
3020 
3021 		ip_mdata_to_mhi(ill, mp, &mhi);
3022 		ip_input_v6(ill, NULL, mp, &mhi);
3023 	} else {
3024 		ip_rput_notdata(ill, mp);
3025 	}
3026 	return (0);
3027 }
3028 
3029 /*
3030  * Walk through the IPv6 packet in mp and see if there's an AH header
3031  * in it.  See if the AH header needs to get done before other headers in
3032  * the packet.  (Worker function for ipsec_early_ah_v6().)
3033  */
3034 #define	IPSEC_HDR_DONT_PROCESS	0
3035 #define	IPSEC_HDR_PROCESS	1
3036 #define	IPSEC_MEMORY_ERROR	2 /* or malformed packet */
3037 static int
3038 ipsec_needs_processing_v6(mblk_t *mp, uint8_t *nexthdr)
3039 {
3040 	uint_t	length;
3041 	uint_t	ehdrlen;
3042 	uint8_t *whereptr;
3043 	uint8_t *endptr;
3044 	uint8_t *nexthdrp;
3045 	ip6_dest_t *desthdr;
3046 	ip6_rthdr_t *rthdr;
3047 	ip6_t	*ip6h;
3048 
3049 	/*
3050 	 * For now just pullup everything.  In general, the less pullups,
3051 	 * the better, but there's so much squirrelling through anyway,
3052 	 * it's just easier this way.
3053 	 */
3054 	if (!pullupmsg(mp, -1)) {
3055 		return (IPSEC_MEMORY_ERROR);
3056 	}
3057 
3058 	ip6h = (ip6_t *)mp->b_rptr;
3059 	length = IPV6_HDR_LEN;
3060 	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
3061 	endptr = mp->b_wptr;
3062 
3063 	/*
3064 	 * We can't just use the argument nexthdr in the place
3065 	 * of nexthdrp becaue we don't dereference nexthdrp
3066 	 * till we confirm whether it is a valid address.
3067 	 */
3068 	nexthdrp = &ip6h->ip6_nxt;
3069 	while (whereptr < endptr) {
3070 		/* Is there enough left for len + nexthdr? */
3071 		if (whereptr + MIN_EHDR_LEN > endptr)
3072 			return (IPSEC_MEMORY_ERROR);
3073 
3074 		switch (*nexthdrp) {
3075 		case IPPROTO_HOPOPTS:
3076 		case IPPROTO_DSTOPTS:
3077 			/* Assumes the headers are identical for hbh and dst */
3078 			desthdr = (ip6_dest_t *)whereptr;
3079 			ehdrlen = 8 * (desthdr->ip6d_len + 1);
3080 			if ((uchar_t *)desthdr +  ehdrlen > endptr)
3081 				return (IPSEC_MEMORY_ERROR);
3082 			/*
3083 			 * Return DONT_PROCESS because the destination
3084 			 * options header may be for each hop in a
3085 			 * routing-header, and we only want AH if we're
3086 			 * finished with routing headers.
3087 			 */
3088 			if (*nexthdrp == IPPROTO_DSTOPTS)
3089 				return (IPSEC_HDR_DONT_PROCESS);
3090 			nexthdrp = &desthdr->ip6d_nxt;
3091 			break;
3092 		case IPPROTO_ROUTING:
3093 			rthdr = (ip6_rthdr_t *)whereptr;
3094 
3095 			/*
3096 			 * If there's more hops left on the routing header,
3097 			 * return now with DON'T PROCESS.
3098 			 */
3099 			if (rthdr->ip6r_segleft > 0)
3100 				return (IPSEC_HDR_DONT_PROCESS);
3101 
3102 			ehdrlen =  8 * (rthdr->ip6r_len + 1);
3103 			if ((uchar_t *)rthdr +  ehdrlen > endptr)
3104 				return (IPSEC_MEMORY_ERROR);
3105 			nexthdrp = &rthdr->ip6r_nxt;
3106 			break;
3107 		case IPPROTO_FRAGMENT:
3108 			/* Wait for reassembly */
3109 			return (IPSEC_HDR_DONT_PROCESS);
3110 		case IPPROTO_AH:
3111 			*nexthdr = IPPROTO_AH;
3112 			return (IPSEC_HDR_PROCESS);
3113 		case IPPROTO_NONE:
3114 			/* No next header means we're finished */
3115 		default:
3116 			return (IPSEC_HDR_DONT_PROCESS);
3117 		}
3118 		length += ehdrlen;
3119 		whereptr += ehdrlen;
3120 	}
3121 	/*
3122 	 * Malformed/truncated packet.
3123 	 */
3124 	return (IPSEC_MEMORY_ERROR);
3125 }
3126 
3127 /*
3128  * Path for AH if options are present.
3129  * Returns NULL if the mblk was consumed.
3130  *
3131  * Sometimes AH needs to be done before other IPv6 headers for security
3132  * reasons.  This function (and its ipsec_needs_processing_v6() above)
3133  * indicates if that is so, and fans out to the appropriate IPsec protocol
3134  * for the datagram passed in.
3135  */
3136 mblk_t *
3137 ipsec_early_ah_v6(mblk_t *mp, ip_recv_attr_t *ira)
3138 {
3139 	uint8_t nexthdr;
3140 	ah_t *ah;
3141 	ill_t		*ill = ira->ira_ill;
3142 	ip_stack_t	*ipst = ill->ill_ipst;
3143 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
3144 
3145 	switch (ipsec_needs_processing_v6(mp, &nexthdr)) {
3146 	case IPSEC_MEMORY_ERROR:
3147 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3148 		ip_drop_input("ipIfStatsInDiscards", mp, ill);
3149 		freemsg(mp);
3150 		return (NULL);
3151 	case IPSEC_HDR_DONT_PROCESS:
3152 		return (mp);
3153 	}
3154 
3155 	/* Default means send it to AH! */
3156 	ASSERT(nexthdr == IPPROTO_AH);
3157 
3158 	if (!ipsec_loaded(ipss)) {
3159 		ip_proto_not_sup(mp, ira);
3160 		return (NULL);
3161 	}
3162 
3163 	mp = ipsec_inbound_ah_sa(mp, ira, &ah);
3164 	if (mp == NULL)
3165 		return (NULL);
3166 	ASSERT(ah != NULL);
3167 	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
3168 	ASSERT(ira->ira_ipsec_ah_sa != NULL);
3169 	ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL);
3170 	mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, ira);
3171 
3172 	if (mp == NULL) {
3173 		/*
3174 		 * Either it failed or is pending. In the former case
3175 		 * ipIfStatsInDiscards was increased.
3176 		 */
3177 		return (NULL);
3178 	}
3179 
3180 	/* we're done with IPsec processing, send it up */
3181 	ip_input_post_ipsec(mp, ira);
3182 	return (NULL);
3183 }
3184 
3185 /*
3186  * Reassemble fragment.
3187  * When it returns a completed message the first mblk will only contain
3188  * the headers prior to the fragment header, with the nexthdr value updated
3189  * to be the header after the fragment header.
3190  */
3191 mblk_t *
3192 ip_input_fragment_v6(mblk_t *mp, ip6_t *ip6h,
3193     ip6_frag_t *fraghdr, uint_t remlen, ip_recv_attr_t *ira)
3194 {
3195 	uint32_t	ident = ntohl(fraghdr->ip6f_ident);
3196 	uint16_t	offset;
3197 	boolean_t	more_frags;
3198 	uint8_t		nexthdr = fraghdr->ip6f_nxt;
3199 	in6_addr_t	*v6dst_ptr;
3200 	in6_addr_t	*v6src_ptr;
3201 	uint_t		end;
3202 	uint_t		hdr_length;
3203 	size_t		count;
3204 	ipf_t		*ipf;
3205 	ipf_t		**ipfp;
3206 	ipfb_t		*ipfb;
3207 	mblk_t		*mp1;
3208 	uint8_t		ecn_info = 0;
3209 	size_t		msg_len;
3210 	mblk_t		*tail_mp;
3211 	mblk_t		*t_mp;
3212 	boolean_t	pruned = B_FALSE;
3213 	uint32_t	sum_val;
3214 	uint16_t	sum_flags;
3215 	ill_t		*ill = ira->ira_ill;
3216 	ip_stack_t	*ipst = ill->ill_ipst;
3217 	uint_t		prev_nexthdr_offset;
3218 	uint8_t		prev_nexthdr;
3219 	uint8_t		*ptr;
3220 	uint32_t	packet_size;
3221 
3222 	/*
3223 	 * We utilize hardware computed checksum info only for UDP since
3224 	 * IP fragmentation is a normal occurence for the protocol.  In
3225 	 * addition, checksum offload support for IP fragments carrying
3226 	 * UDP payload is commonly implemented across network adapters.
3227 	 */
3228 	ASSERT(ira->ira_rill != NULL);
3229 	if (nexthdr == IPPROTO_UDP && dohwcksum &&
3230 	    ILL_HCKSUM_CAPABLE(ira->ira_rill) &&
3231 	    (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
3232 		mblk_t *mp1 = mp->b_cont;
3233 		int32_t len;
3234 
3235 		/* Record checksum information from the packet */
3236 		sum_val = (uint32_t)DB_CKSUM16(mp);
3237 		sum_flags = DB_CKSUMFLAGS(mp);
3238 
3239 		/* fragmented payload offset from beginning of mblk */
3240 		offset = (uint16_t)((uchar_t *)&fraghdr[1] - mp->b_rptr);
3241 
3242 		if ((sum_flags & HCK_PARTIALCKSUM) &&
3243 		    (mp1 == NULL || mp1->b_cont == NULL) &&
3244 		    offset >= DB_CKSUMSTART(mp) &&
3245 		    ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
3246 			uint32_t adj;
3247 			/*
3248 			 * Partial checksum has been calculated by hardware
3249 			 * and attached to the packet; in addition, any
3250 			 * prepended extraneous data is even byte aligned.
3251 			 * If any such data exists, we adjust the checksum;
3252 			 * this would also handle any postpended data.
3253 			 */
3254 			IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
3255 			    mp, mp1, len, adj);
3256 
3257 			/* One's complement subtract extraneous checksum */
3258 			if (adj >= sum_val)
3259 				sum_val = ~(adj - sum_val) & 0xFFFF;
3260 			else
3261 				sum_val -= adj;
3262 		}
3263 	} else {
3264 		sum_val = 0;
3265 		sum_flags = 0;
3266 	}
3267 
3268 	/* Clear hardware checksumming flag */
3269 	DB_CKSUMFLAGS(mp) = 0;
3270 
3271 	/*
3272 	 * Determine the offset (from the begining of the IP header)
3273 	 * of the nexthdr value which has IPPROTO_FRAGMENT. We use
3274 	 * this when removing the fragment header from the packet.
3275 	 * This packet consists of the IPv6 header, a potential
3276 	 * hop-by-hop options header, a potential pre-routing-header
3277 	 * destination options header, and a potential routing header.
3278 	 */
3279 	prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h;
3280 	prev_nexthdr = ip6h->ip6_nxt;
3281 	ptr = (uint8_t *)&ip6h[1];
3282 
3283 	if (prev_nexthdr == IPPROTO_HOPOPTS) {
3284 		ip6_hbh_t	*hbh_hdr;
3285 		uint_t		hdr_len;
3286 
3287 		hbh_hdr = (ip6_hbh_t *)ptr;
3288 		hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
3289 		prev_nexthdr = hbh_hdr->ip6h_nxt;
3290 		prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt
3291 		    - (uint8_t *)ip6h;
3292 		ptr += hdr_len;
3293 	}
3294 	if (prev_nexthdr == IPPROTO_DSTOPTS) {
3295 		ip6_dest_t	*dest_hdr;
3296 		uint_t		hdr_len;
3297 
3298 		dest_hdr = (ip6_dest_t *)ptr;
3299 		hdr_len = 8 * (dest_hdr->ip6d_len + 1);
3300 		prev_nexthdr = dest_hdr->ip6d_nxt;
3301 		prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt
3302 		    - (uint8_t *)ip6h;
3303 		ptr += hdr_len;
3304 	}
3305 	if (prev_nexthdr == IPPROTO_ROUTING) {
3306 		ip6_rthdr_t	*rthdr;
3307 		uint_t		hdr_len;
3308 
3309 		rthdr = (ip6_rthdr_t *)ptr;
3310 		prev_nexthdr = rthdr->ip6r_nxt;
3311 		prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt
3312 		    - (uint8_t *)ip6h;
3313 		hdr_len = 8 * (rthdr->ip6r_len + 1);
3314 		ptr += hdr_len;
3315 	}
3316 	if (prev_nexthdr != IPPROTO_FRAGMENT) {
3317 		/* Can't handle other headers before the fragment header */
3318 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
3319 		ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
3320 		freemsg(mp);
3321 		return (NULL);
3322 	}
3323 
3324 	/*
3325 	 * Note: Fragment offset in header is in 8-octet units.
3326 	 * Clearing least significant 3 bits not only extracts
3327 	 * it but also gets it in units of octets.
3328 	 */
3329 	offset = ntohs(fraghdr->ip6f_offlg) & ~7;
3330 	more_frags = (fraghdr->ip6f_offlg & IP6F_MORE_FRAG);
3331 
3332 	/*
3333 	 * Is the more frags flag on and the payload length not a multiple
3334 	 * of eight?
3335 	 */
3336 	if (more_frags && (ntohs(ip6h->ip6_plen) & 7)) {
3337 		ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
3338 		icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER,
3339 		    (uint32_t)((char *)&ip6h->ip6_plen -
3340 		    (char *)ip6h), B_FALSE, ira);
3341 		return (NULL);
3342 	}
3343 
3344 	v6src_ptr = &ip6h->ip6_src;
3345 	v6dst_ptr = &ip6h->ip6_dst;
3346 	end = remlen;
3347 
3348 	hdr_length = (uint_t)((char *)&fraghdr[1] - (char *)ip6h);
3349 	end += offset;
3350 
3351 	/*
3352 	 * Would fragment cause reassembled packet to have a payload length
3353 	 * greater than IP_MAXPACKET - the max payload size?
3354 	 */
3355 	if (end > IP_MAXPACKET) {
3356 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
3357 		ip_drop_input("Reassembled packet too large", mp, ill);
3358 		icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER,
3359 		    (uint32_t)((char *)&fraghdr->ip6f_offlg -
3360 		    (char *)ip6h), B_FALSE, ira);
3361 		return (NULL);
3362 	}
3363 
3364 	/*
3365 	 * This packet just has one fragment. Reassembly not
3366 	 * needed.
3367 	 */
3368 	if (!more_frags && offset == 0) {
3369 		goto reass_done;
3370 	}
3371 
3372 	/*
3373 	 * Drop the fragmented as early as possible, if
3374 	 * we don't have resource(s) to re-assemble.
3375 	 */
3376 	if (ipst->ips_ip_reass_queue_bytes == 0) {
3377 		freemsg(mp);
3378 		return (NULL);
3379 	}
3380 
3381 	/* Record the ECN field info. */
3382 	ecn_info = (uint8_t)(ntohl(ip6h->ip6_vcf & htonl(~0xFFCFFFFF)) >> 20);
3383 	/*
3384 	 * If this is not the first fragment, dump the unfragmentable
3385 	 * portion of the packet.
3386 	 */
3387 	if (offset)
3388 		mp->b_rptr = (uchar_t *)&fraghdr[1];
3389 
3390 	/*
3391 	 * Fragmentation reassembly.  Each ILL has a hash table for
3392 	 * queueing packets undergoing reassembly for all IPIFs
3393 	 * associated with the ILL.  The hash is based on the packet
3394 	 * IP ident field.  The ILL frag hash table was allocated
3395 	 * as a timer block at the time the ILL was created.  Whenever
3396 	 * there is anything on the reassembly queue, the timer will
3397 	 * be running.
3398 	 */
3399 	/* Handle vnic loopback of fragments */
3400 	if (mp->b_datap->db_ref > 2)
3401 		msg_len = 0;
3402 	else
3403 		msg_len = MBLKSIZE(mp);
3404 
3405 	tail_mp = mp;
3406 	while (tail_mp->b_cont != NULL) {
3407 		tail_mp = tail_mp->b_cont;
3408 		if (tail_mp->b_datap->db_ref <= 2)
3409 			msg_len += MBLKSIZE(tail_mp);
3410 	}
3411 	/*
3412 	 * If the reassembly list for this ILL will get too big
3413 	 * prune it.
3414 	 */
3415 
3416 	if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
3417 	    ipst->ips_ip_reass_queue_bytes) {
3418 		DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len,
3419 		    uint_t, ill->ill_frag_count,
3420 		    uint_t, ipst->ips_ip_reass_queue_bytes);
3421 		ill_frag_prune(ill,
3422 		    (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 :
3423 		    (ipst->ips_ip_reass_queue_bytes - msg_len));
3424 		pruned = B_TRUE;
3425 	}
3426 
3427 	ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH_V6(*v6src_ptr, ident)];
3428 	mutex_enter(&ipfb->ipfb_lock);
3429 
3430 	ipfp = &ipfb->ipfb_ipf;
3431 	/* Try to find an existing fragment queue for this packet. */
3432 	for (;;) {
3433 		ipf = ipfp[0];
3434 		if (ipf) {
3435 			/*
3436 			 * It has to match on ident, source address, and
3437 			 * dest address.
3438 			 */
3439 			if (ipf->ipf_ident == ident &&
3440 			    IN6_ARE_ADDR_EQUAL(&ipf->ipf_v6src, v6src_ptr) &&
3441 			    IN6_ARE_ADDR_EQUAL(&ipf->ipf_v6dst, v6dst_ptr)) {
3442 
3443 				/*
3444 				 * If we have received too many
3445 				 * duplicate fragments for this packet
3446 				 * free it.
3447 				 */
3448 				if (ipf->ipf_num_dups > ip_max_frag_dups) {
3449 					ill_frag_free_pkts(ill, ipfb, ipf, 1);
3450 					freemsg(mp);
3451 					mutex_exit(&ipfb->ipfb_lock);
3452 					return (NULL);
3453 				}
3454 
3455 				break;
3456 			}
3457 			ipfp = &ipf->ipf_hash_next;
3458 			continue;
3459 		}
3460 
3461 
3462 		/*
3463 		 * If we pruned the list, do we want to store this new
3464 		 * fragment?. We apply an optimization here based on the
3465 		 * fact that most fragments will be received in order.
3466 		 * So if the offset of this incoming fragment is zero,
3467 		 * it is the first fragment of a new packet. We will
3468 		 * keep it.  Otherwise drop the fragment, as we have
3469 		 * probably pruned the packet already (since the
3470 		 * packet cannot be found).
3471 		 */
3472 
3473 		if (pruned && offset != 0) {
3474 			mutex_exit(&ipfb->ipfb_lock);
3475 			freemsg(mp);
3476 			return (NULL);
3477 		}
3478 
3479 		/* New guy.  Allocate a frag message. */
3480 		mp1 = allocb(sizeof (*ipf), BPRI_MED);
3481 		if (!mp1) {
3482 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3483 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
3484 			freemsg(mp);
3485 	partial_reass_done:
3486 			mutex_exit(&ipfb->ipfb_lock);
3487 			return (NULL);
3488 		}
3489 
3490 		if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst))  {
3491 			/*
3492 			 * Too many fragmented packets in this hash bucket.
3493 			 * Free the oldest.
3494 			 */
3495 			ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
3496 		}
3497 
3498 		mp1->b_cont = mp;
3499 
3500 		/* Initialize the fragment header. */
3501 		ipf = (ipf_t *)mp1->b_rptr;
3502 		ipf->ipf_mp = mp1;
3503 		ipf->ipf_ptphn = ipfp;
3504 		ipfp[0] = ipf;
3505 		ipf->ipf_hash_next = NULL;
3506 		ipf->ipf_ident = ident;
3507 		ipf->ipf_v6src = *v6src_ptr;
3508 		ipf->ipf_v6dst = *v6dst_ptr;
3509 		/* Record reassembly start time. */
3510 		ipf->ipf_timestamp = gethrestime_sec();
3511 		/* Record ipf generation and account for frag header */
3512 		ipf->ipf_gen = ill->ill_ipf_gen++;
3513 		ipf->ipf_count = MBLKSIZE(mp1);
3514 		ipf->ipf_protocol = nexthdr;
3515 		ipf->ipf_nf_hdr_len = 0;
3516 		ipf->ipf_prev_nexthdr_offset = 0;
3517 		ipf->ipf_last_frag_seen = B_FALSE;
3518 		ipf->ipf_ecn = ecn_info;
3519 		ipf->ipf_num_dups = 0;
3520 		ipfb->ipfb_frag_pkts++;
3521 		ipf->ipf_checksum = 0;
3522 		ipf->ipf_checksum_flags = 0;
3523 
3524 		/* Store checksum value in fragment header */
3525 		if (sum_flags != 0) {
3526 			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3527 			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3528 			ipf->ipf_checksum = sum_val;
3529 			ipf->ipf_checksum_flags = sum_flags;
3530 		}
3531 
3532 		/*
3533 		 * We handle reassembly two ways.  In the easy case,
3534 		 * where all the fragments show up in order, we do
3535 		 * minimal bookkeeping, and just clip new pieces on
3536 		 * the end.  If we ever see a hole, then we go off
3537 		 * to ip_reassemble which has to mark the pieces and
3538 		 * keep track of the number of holes, etc.  Obviously,
3539 		 * the point of having both mechanisms is so we can
3540 		 * handle the easy case as efficiently as possible.
3541 		 */
3542 		if (offset == 0) {
3543 			/* Easy case, in-order reassembly so far. */
3544 			/* Update the byte count */
3545 			ipf->ipf_count += msg_len;
3546 			ipf->ipf_tail_mp = tail_mp;
3547 			/*
3548 			 * Keep track of next expected offset in
3549 			 * ipf_end.
3550 			 */
3551 			ipf->ipf_end = end;
3552 			ipf->ipf_nf_hdr_len = hdr_length;
3553 			ipf->ipf_prev_nexthdr_offset = prev_nexthdr_offset;
3554 		} else {
3555 			/* Hard case, hole at the beginning. */
3556 			ipf->ipf_tail_mp = NULL;
3557 			/*
3558 			 * ipf_end == 0 means that we have given up
3559 			 * on easy reassembly.
3560 			 */
3561 			ipf->ipf_end = 0;
3562 
3563 			/* Forget checksum offload from now on */
3564 			ipf->ipf_checksum_flags = 0;
3565 
3566 			/*
3567 			 * ipf_hole_cnt is set by ip_reassemble.
3568 			 * ipf_count is updated by ip_reassemble.
3569 			 * No need to check for return value here
3570 			 * as we don't expect reassembly to complete or
3571 			 * fail for the first fragment itself.
3572 			 */
3573 			(void) ip_reassemble(mp, ipf, offset, more_frags, ill,
3574 			    msg_len);
3575 		}
3576 		/* Update per ipfb and ill byte counts */
3577 		ipfb->ipfb_count += ipf->ipf_count;
3578 		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
3579 		atomic_add_32(&ill->ill_frag_count, ipf->ipf_count);
3580 		/* If the frag timer wasn't already going, start it. */
3581 		mutex_enter(&ill->ill_lock);
3582 		ill_frag_timer_start(ill);
3583 		mutex_exit(&ill->ill_lock);
3584 		goto partial_reass_done;
3585 	}
3586 
3587 	/*
3588 	 * If the packet's flag has changed (it could be coming up
3589 	 * from an interface different than the previous, therefore
3590 	 * possibly different checksum capability), then forget about
3591 	 * any stored checksum states.  Otherwise add the value to
3592 	 * the existing one stored in the fragment header.
3593 	 */
3594 	if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
3595 		sum_val += ipf->ipf_checksum;
3596 		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3597 		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3598 		ipf->ipf_checksum = sum_val;
3599 	} else if (ipf->ipf_checksum_flags != 0) {
3600 		/* Forget checksum offload from now on */
3601 		ipf->ipf_checksum_flags = 0;
3602 	}
3603 
3604 	/*
3605 	 * We have a new piece of a datagram which is already being
3606 	 * reassembled.  Update the ECN info if all IP fragments
3607 	 * are ECN capable.  If there is one which is not, clear
3608 	 * all the info.  If there is at least one which has CE
3609 	 * code point, IP needs to report that up to transport.
3610 	 */
3611 	if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
3612 		if (ecn_info == IPH_ECN_CE)
3613 			ipf->ipf_ecn = IPH_ECN_CE;
3614 	} else {
3615 		ipf->ipf_ecn = IPH_ECN_NECT;
3616 	}
3617 
3618 	if (offset && ipf->ipf_end == offset) {
3619 		/* The new fragment fits at the end */
3620 		ipf->ipf_tail_mp->b_cont = mp;
3621 		/* Update the byte count */
3622 		ipf->ipf_count += msg_len;
3623 		/* Update per ipfb and ill byte counts */
3624 		ipfb->ipfb_count += msg_len;
3625 		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
3626 		atomic_add_32(&ill->ill_frag_count, msg_len);
3627 		if (more_frags) {
3628 			/* More to come. */
3629 			ipf->ipf_end = end;
3630 			ipf->ipf_tail_mp = tail_mp;
3631 			goto partial_reass_done;
3632 		}
3633 	} else {
3634 		/*
3635 		 * Go do the hard cases.
3636 		 * Call ip_reassemble().
3637 		 */
3638 		int ret;
3639 
3640 		if (offset == 0) {
3641 			if (ipf->ipf_prev_nexthdr_offset == 0) {
3642 				ipf->ipf_nf_hdr_len = hdr_length;
3643 				ipf->ipf_prev_nexthdr_offset =
3644 				    prev_nexthdr_offset;
3645 			}
3646 		}
3647 		/* Save current byte count */
3648 		count = ipf->ipf_count;
3649 		ret = ip_reassemble(mp, ipf, offset, more_frags, ill, msg_len);
3650 
3651 		/* Count of bytes added and subtracted (freeb()ed) */
3652 		count = ipf->ipf_count - count;
3653 		if (count) {
3654 			/* Update per ipfb and ill byte counts */
3655 			ipfb->ipfb_count += count;
3656 			ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
3657 			atomic_add_32(&ill->ill_frag_count, count);
3658 		}
3659 		if (ret == IP_REASS_PARTIAL) {
3660 			goto partial_reass_done;
3661 		} else if (ret == IP_REASS_FAILED) {
3662 			/* Reassembly failed. Free up all resources */
3663 			ill_frag_free_pkts(ill, ipfb, ipf, 1);
3664 			for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
3665 				IP_REASS_SET_START(t_mp, 0);
3666 				IP_REASS_SET_END(t_mp, 0);
3667 			}
3668 			freemsg(mp);
3669 			goto partial_reass_done;
3670 		}
3671 
3672 		/* We will reach here iff 'ret' is IP_REASS_COMPLETE */
3673 	}
3674 	/*
3675 	 * We have completed reassembly.  Unhook the frag header from
3676 	 * the reassembly list.
3677 	 *
3678 	 * Grab the unfragmentable header length next header value out
3679 	 * of the first fragment
3680 	 */
3681 	ASSERT(ipf->ipf_nf_hdr_len != 0);
3682 	hdr_length = ipf->ipf_nf_hdr_len;
3683 
3684 	/*
3685 	 * Before we free the frag header, record the ECN info
3686 	 * to report back to the transport.
3687 	 */
3688 	ecn_info = ipf->ipf_ecn;
3689 
3690 	/*
3691 	 * Store the nextheader field in the header preceding the fragment
3692 	 * header
3693 	 */
3694 	nexthdr = ipf->ipf_protocol;
3695 	prev_nexthdr_offset = ipf->ipf_prev_nexthdr_offset;
3696 	ipfp = ipf->ipf_ptphn;
3697 
3698 	/* We need to supply these to caller */
3699 	if ((sum_flags = ipf->ipf_checksum_flags) != 0)
3700 		sum_val = ipf->ipf_checksum;
3701 	else
3702 		sum_val = 0;
3703 
3704 	mp1 = ipf->ipf_mp;
3705 	count = ipf->ipf_count;
3706 	ipf = ipf->ipf_hash_next;
3707 	if (ipf)
3708 		ipf->ipf_ptphn = ipfp;
3709 	ipfp[0] = ipf;
3710 	atomic_add_32(&ill->ill_frag_count, -count);
3711 	ASSERT(ipfb->ipfb_count >= count);
3712 	ipfb->ipfb_count -= count;
3713 	ipfb->ipfb_frag_pkts--;
3714 	mutex_exit(&ipfb->ipfb_lock);
3715 	/* Ditch the frag header. */
3716 	mp = mp1->b_cont;
3717 	freeb(mp1);
3718 
3719 	/*
3720 	 * Make sure the packet is good by doing some sanity
3721 	 * check. If bad we can silentely drop the packet.
3722 	 */
3723 reass_done:
3724 	if (hdr_length < sizeof (ip6_frag_t)) {
3725 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
3726 		ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
3727 		ip1dbg(("ip_input_fragment_v6: bad packet\n"));
3728 		freemsg(mp);
3729 		return (NULL);
3730 	}
3731 
3732 	/*
3733 	 * Remove the fragment header from the initial header by
3734 	 * splitting the mblk into the non-fragmentable header and
3735 	 * everthing after the fragment extension header.  This has the
3736 	 * side effect of putting all the headers that need destination
3737 	 * processing into the b_cont block-- on return this fact is
3738 	 * used in order to avoid having to look at the extensions
3739 	 * already processed.
3740 	 *
3741 	 * Note that this code assumes that the unfragmentable portion
3742 	 * of the header is in the first mblk and increments
3743 	 * the read pointer past it.  If this assumption is broken
3744 	 * this code fails badly.
3745 	 */
3746 	if (mp->b_rptr + hdr_length != mp->b_wptr) {
3747 		mblk_t *nmp;
3748 
3749 		if (!(nmp = dupb(mp))) {
3750 			ip1dbg(("ip_input_fragment_v6: dupb failed\n"));
3751 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3752 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
3753 			freemsg(mp);
3754 			return (NULL);
3755 		}
3756 		nmp->b_cont = mp->b_cont;
3757 		mp->b_cont = nmp;
3758 		nmp->b_rptr += hdr_length;
3759 	}
3760 	mp->b_wptr = mp->b_rptr + hdr_length - sizeof (ip6_frag_t);
3761 
3762 	ip6h = (ip6_t *)mp->b_rptr;
3763 	((char *)ip6h)[prev_nexthdr_offset] = nexthdr;
3764 
3765 	/* Restore original IP length in header. */
3766 	packet_size = msgdsize(mp);
3767 	ip6h->ip6_plen = htons((uint16_t)(packet_size - IPV6_HDR_LEN));
3768 	/* Record the ECN info. */
3769 	ip6h->ip6_vcf &= htonl(0xFFCFFFFF);
3770 	ip6h->ip6_vcf |= htonl(ecn_info << 20);
3771 
3772 	/* Update the receive attributes */
3773 	ira->ira_pktlen = packet_size;
3774 	ira->ira_ip_hdr_length = hdr_length - sizeof (ip6_frag_t);
3775 	ira->ira_protocol = nexthdr;
3776 
3777 	/* Reassembly is successful; set checksum information in packet */
3778 	DB_CKSUM16(mp) = (uint16_t)sum_val;
3779 	DB_CKSUMFLAGS(mp) = sum_flags;
3780 	DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length;
3781 
3782 	return (mp);
3783 }
3784 
3785 /*
3786  * Given an mblk and a ptr, find the destination address in an IPv6 routing
3787  * header.
3788  */
3789 static in6_addr_t
3790 pluck_out_dst(const mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv)
3791 {
3792 	ip6_rthdr0_t *rt0;
3793 	int segleft, numaddr;
3794 	in6_addr_t *ap, rv = oldrv;
3795 
3796 	rt0 = (ip6_rthdr0_t *)whereptr;
3797 	if (rt0->ip6r0_type != 0 && rt0->ip6r0_type != 2) {
3798 		DTRACE_PROBE2(pluck_out_dst_unknown_type, mblk_t *, mp,
3799 		    uint8_t *, whereptr);
3800 		return (rv);
3801 	}
3802 	segleft = rt0->ip6r0_segleft;
3803 	numaddr = rt0->ip6r0_len / 2;
3804 
3805 	if ((rt0->ip6r0_len & 0x1) ||
3806 	    (mp != NULL && whereptr + (rt0->ip6r0_len + 1) * 8 > mp->b_wptr) ||
3807 	    (segleft > rt0->ip6r0_len / 2)) {
3808 		/*
3809 		 * Corrupt packet.  Either the routing header length is odd
3810 		 * (can't happen) or mismatched compared to the packet, or the
3811 		 * number of addresses is.  Return what we can.  This will
3812 		 * only be a problem on forwarded packets that get squeezed
3813 		 * through an outbound tunnel enforcing IPsec Tunnel Mode.
3814 		 */
3815 		DTRACE_PROBE2(pluck_out_dst_badpkt, mblk_t *, mp, uint8_t *,
3816 		    whereptr);
3817 		return (rv);
3818 	}
3819 
3820 	if (segleft != 0) {
3821 		ap = (in6_addr_t *)((char *)rt0 + sizeof (*rt0));
3822 		rv = ap[numaddr - 1];
3823 	}
3824 
3825 	return (rv);
3826 }
3827 
3828 /*
3829  * Walk through the options to see if there is a routing header.
3830  * If present get the destination which is the last address of
3831  * the option.
3832  * mp needs to be provided in cases when the extension headers might span
3833  * b_cont; mp is never modified by this function.
3834  */
3835 in6_addr_t
3836 ip_get_dst_v6(ip6_t *ip6h, const mblk_t *mp, boolean_t *is_fragment)
3837 {
3838 	const mblk_t *current_mp = mp;
3839 	uint8_t nexthdr;
3840 	uint8_t *whereptr;
3841 	int ehdrlen;
3842 	in6_addr_t rv;
3843 
3844 	whereptr = (uint8_t *)ip6h;
3845 	ehdrlen = sizeof (ip6_t);
3846 
3847 	/* We assume at least the IPv6 base header is within one mblk. */
3848 	ASSERT(mp == NULL ||
3849 	    (mp->b_rptr <= whereptr && mp->b_wptr >= whereptr + ehdrlen));
3850 
3851 	rv = ip6h->ip6_dst;
3852 	nexthdr = ip6h->ip6_nxt;
3853 	if (is_fragment != NULL)
3854 		*is_fragment = B_FALSE;
3855 
3856 	/*
3857 	 * We also assume (thanks to ipsec_tun_outbound()'s pullup) that
3858 	 * no extension headers will be split across mblks.
3859 	 */
3860 
3861 	while (nexthdr == IPPROTO_HOPOPTS || nexthdr == IPPROTO_DSTOPTS ||
3862 	    nexthdr == IPPROTO_ROUTING) {
3863 		if (nexthdr == IPPROTO_ROUTING)
3864 			rv = pluck_out_dst(current_mp, whereptr, rv);
3865 
3866 		/*
3867 		 * All IPv6 extension headers have the next-header in byte
3868 		 * 0, and the (length - 8) in 8-byte-words.
3869 		 */
3870 		while (current_mp != NULL &&
3871 		    whereptr + ehdrlen >= current_mp->b_wptr) {
3872 			ehdrlen -= (current_mp->b_wptr - whereptr);
3873 			current_mp = current_mp->b_cont;
3874 			if (current_mp == NULL) {
3875 				/* Bad packet.  Return what we can. */
3876 				DTRACE_PROBE3(ip_get_dst_v6_badpkt, mblk_t *,
3877 				    mp, mblk_t *, current_mp, ip6_t *, ip6h);
3878 				goto done;
3879 			}
3880 			whereptr = current_mp->b_rptr;
3881 		}
3882 		whereptr += ehdrlen;
3883 
3884 		nexthdr = *whereptr;
3885 		ASSERT(current_mp == NULL || whereptr + 1 < current_mp->b_wptr);
3886 		ehdrlen = (*(whereptr + 1) + 1) * 8;
3887 	}
3888 
3889 done:
3890 	if (nexthdr == IPPROTO_FRAGMENT && is_fragment != NULL)
3891 		*is_fragment = B_TRUE;
3892 	return (rv);
3893 }
3894 
3895 /*
3896  * ip_source_routed_v6:
3897  * This function is called by redirect code (called from ip_input_v6) to
3898  * know whether this packet is source routed through this node i.e
3899  * whether this node (router) is part of the journey. This
3900  * function is called under two cases :
3901  *
3902  * case 1 : Routing header was processed by this node and
3903  *	    ip_process_rthdr replaced ip6_dst with the next hop
3904  *	    and we are forwarding the packet to the next hop.
3905  *
3906  * case 2 : Routing header was not processed by this node and we
3907  *	    are just forwarding the packet.
3908  *
3909  * For case (1) we don't want to send redirects. For case(2) we
3910  * want to send redirects.
3911  */
3912 static boolean_t
3913 ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst)
3914 {
3915 	uint8_t		nexthdr;
3916 	in6_addr_t	*addrptr;
3917 	ip6_rthdr0_t	*rthdr;
3918 	uint8_t		numaddr;
3919 	ip6_hbh_t	*hbhhdr;
3920 	uint_t		ehdrlen;
3921 	uint8_t		*byteptr;
3922 
3923 	ip2dbg(("ip_source_routed_v6\n"));
3924 	nexthdr = ip6h->ip6_nxt;
3925 	ehdrlen = IPV6_HDR_LEN;
3926 
3927 	/* if a routing hdr is preceeded by HOPOPT or DSTOPT */
3928 	while (nexthdr == IPPROTO_HOPOPTS ||
3929 	    nexthdr == IPPROTO_DSTOPTS) {
3930 		byteptr = (uint8_t *)ip6h + ehdrlen;
3931 		/*
3932 		 * Check if we have already processed
3933 		 * packets or we are just a forwarding
3934 		 * router which only pulled up msgs up
3935 		 * to IPV6HDR and  one HBH ext header
3936 		 */
3937 		if (byteptr + MIN_EHDR_LEN > mp->b_wptr) {
3938 			ip2dbg(("ip_source_routed_v6: Extension"
3939 			    " headers not processed\n"));
3940 			return (B_FALSE);
3941 		}
3942 		hbhhdr = (ip6_hbh_t *)byteptr;
3943 		nexthdr = hbhhdr->ip6h_nxt;
3944 		ehdrlen = ehdrlen + 8 * (hbhhdr->ip6h_len + 1);
3945 	}
3946 	switch (nexthdr) {
3947 	case IPPROTO_ROUTING:
3948 		byteptr = (uint8_t *)ip6h + ehdrlen;
3949 		/*
3950 		 * If for some reason, we haven't pulled up
3951 		 * the routing hdr data mblk, then we must
3952 		 * not have processed it at all. So for sure
3953 		 * we are not part of the source routed journey.
3954 		 */
3955 		if (byteptr + MIN_EHDR_LEN > mp->b_wptr) {
3956 			ip2dbg(("ip_source_routed_v6: Routing"
3957 			    " header not processed\n"));
3958 			return (B_FALSE);
3959 		}
3960 		rthdr = (ip6_rthdr0_t *)byteptr;
3961 		/*
3962 		 * Either we are an intermediate router or the
3963 		 * last hop before destination and we have
3964 		 * already processed the routing header.
3965 		 * If segment_left is greater than or equal to zero,
3966 		 * then we must be the (numaddr - segleft) entry
3967 		 * of the routing header. Although ip6r0_segleft
3968 		 * is a unit8_t variable, we still check for zero
3969 		 * or greater value, if in case the data type
3970 		 * is changed someday in future.
3971 		 */
3972 		if (rthdr->ip6r0_segleft > 0 ||
3973 		    rthdr->ip6r0_segleft == 0) {
3974 			numaddr = rthdr->ip6r0_len / 2;
3975 			addrptr = (in6_addr_t *)((char *)rthdr +
3976 			    sizeof (*rthdr));
3977 			addrptr += (numaddr - (rthdr->ip6r0_segleft + 1));
3978 			if (addrptr != NULL) {
3979 				if (ip_type_v6(addrptr, ipst) == IRE_LOCAL)
3980 					return (B_TRUE);
3981 				ip1dbg(("ip_source_routed_v6: Not local\n"));
3982 			}
3983 		}
3984 	/* FALLTHROUGH */
3985 	default:
3986 		ip2dbg(("ip_source_routed_v6: Not source routed here\n"));
3987 		return (B_FALSE);
3988 	}
3989 }
3990 
3991 /*
3992  * IPv6 fragmentation.  Essentially the same as IPv4 fragmentation.
3993  * We have not optimized this in terms of number of mblks
3994  * allocated. For instance, for each fragment sent we always allocate a
3995  * mblk to hold the IPv6 header and fragment header.
3996  *
3997  * Assumes that all the extension headers are contained in the first mblk
3998  * and that the fragment header has has already been added by calling
3999  * ip_fraghdr_add_v6.
4000  */
4001 int
4002 ip_fragment_v6(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len,
4003     uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
4004     pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie)
4005 {
4006 	ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
4007 	ip6_t		*fip6h;
4008 	mblk_t		*hmp;
4009 	mblk_t		*hmp0;
4010 	mblk_t		*dmp;
4011 	ip6_frag_t	*fraghdr;
4012 	size_t		unfragmentable_len;
4013 	size_t		mlen;
4014 	size_t		max_chunk;
4015 	uint16_t	off_flags;
4016 	uint16_t	offset = 0;
4017 	ill_t		*ill = nce->nce_ill;
4018 	uint8_t		nexthdr;
4019 	uint8_t		*ptr;
4020 	ip_stack_t	*ipst = ill->ill_ipst;
4021 	uint_t		priority = mp->b_band;
4022 	int		error = 0;
4023 
4024 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds);
4025 	if (max_frag == 0) {
4026 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4027 		ip_drop_output("FragFails: zero max_frag", mp, ill);
4028 		freemsg(mp);
4029 		return (EINVAL);
4030 	}
4031 
4032 	/*
4033 	 * Caller should have added fraghdr_t to pkt_len, and also
4034 	 * updated ip6_plen.
4035 	 */
4036 	ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == pkt_len);
4037 	ASSERT(msgdsize(mp) == pkt_len);
4038 
4039 	/*
4040 	 * Determine the length of the unfragmentable portion of this
4041 	 * datagram.  This consists of the IPv6 header, a potential
4042 	 * hop-by-hop options header, a potential pre-routing-header
4043 	 * destination options header, and a potential routing header.
4044 	 */
4045 	nexthdr = ip6h->ip6_nxt;
4046 	ptr = (uint8_t *)&ip6h[1];
4047 
4048 	if (nexthdr == IPPROTO_HOPOPTS) {
4049 		ip6_hbh_t	*hbh_hdr;
4050 		uint_t		hdr_len;
4051 
4052 		hbh_hdr = (ip6_hbh_t *)ptr;
4053 		hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
4054 		nexthdr = hbh_hdr->ip6h_nxt;
4055 		ptr += hdr_len;
4056 	}
4057 	if (nexthdr == IPPROTO_DSTOPTS) {
4058 		ip6_dest_t	*dest_hdr;
4059 		uint_t		hdr_len;
4060 
4061 		dest_hdr = (ip6_dest_t *)ptr;
4062 		if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) {
4063 			hdr_len = 8 * (dest_hdr->ip6d_len + 1);
4064 			nexthdr = dest_hdr->ip6d_nxt;
4065 			ptr += hdr_len;
4066 		}
4067 	}
4068 	if (nexthdr == IPPROTO_ROUTING) {
4069 		ip6_rthdr_t	*rthdr;
4070 		uint_t		hdr_len;
4071 
4072 		rthdr = (ip6_rthdr_t *)ptr;
4073 		nexthdr = rthdr->ip6r_nxt;
4074 		hdr_len = 8 * (rthdr->ip6r_len + 1);
4075 		ptr += hdr_len;
4076 	}
4077 	if (nexthdr != IPPROTO_FRAGMENT) {
4078 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4079 		ip_drop_output("FragFails: bad nexthdr", mp, ill);
4080 		freemsg(mp);
4081 		return (EINVAL);
4082 	}
4083 	unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h);
4084 	unfragmentable_len += sizeof (ip6_frag_t);
4085 
4086 	max_chunk = (max_frag - unfragmentable_len) & ~7;
4087 
4088 	/*
4089 	 * Allocate an mblk with enough room for the link-layer
4090 	 * header and the unfragmentable part of the datagram, which includes
4091 	 * the fragment header.  This (or a copy) will be used as the
4092 	 * first mblk for each fragment we send.
4093 	 */
4094 	hmp = allocb_tmpl(unfragmentable_len + ipst->ips_ip_wroff_extra, mp);
4095 	if (hmp == NULL) {
4096 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4097 		ip_drop_output("FragFails: no hmp", mp, ill);
4098 		freemsg(mp);
4099 		return (ENOBUFS);
4100 	}
4101 	hmp->b_rptr += ipst->ips_ip_wroff_extra;
4102 	hmp->b_wptr = hmp->b_rptr + unfragmentable_len;
4103 
4104 	fip6h = (ip6_t *)hmp->b_rptr;
4105 	bcopy(ip6h, fip6h, unfragmentable_len);
4106 
4107 	/*
4108 	 * pkt_len is set to the total length of the fragmentable data in this
4109 	 * datagram.  For each fragment sent, we will decrement pkt_len
4110 	 * by the amount of fragmentable data sent in that fragment
4111 	 * until len reaches zero.
4112 	 */
4113 	pkt_len -= unfragmentable_len;
4114 
4115 	/*
4116 	 * Move read ptr past unfragmentable portion, we don't want this part
4117 	 * of the data in our fragments.
4118 	 */
4119 	mp->b_rptr += unfragmentable_len;
4120 	if (mp->b_rptr == mp->b_wptr) {
4121 		mblk_t *mp1 = mp->b_cont;
4122 		freeb(mp);
4123 		mp = mp1;
4124 	}
4125 
4126 	while (pkt_len != 0) {
4127 		mlen = MIN(pkt_len, max_chunk);
4128 		pkt_len -= mlen;
4129 		if (pkt_len != 0) {
4130 			/* Not last */
4131 			hmp0 = copyb(hmp);
4132 			if (hmp0 == NULL) {
4133 				BUMP_MIB(ill->ill_ip_mib,
4134 				    ipIfStatsOutFragFails);
4135 				ip_drop_output("FragFails: copyb failed",
4136 				    mp, ill);
4137 				freeb(hmp);
4138 				freemsg(mp);
4139 				ip1dbg(("ip_fragment_v6: copyb failed\n"));
4140 				return (ENOBUFS);
4141 			}
4142 			off_flags = IP6F_MORE_FRAG;
4143 		} else {
4144 			/* Last fragment */
4145 			hmp0 = hmp;
4146 			hmp = NULL;
4147 			off_flags = 0;
4148 		}
4149 		fip6h = (ip6_t *)(hmp0->b_rptr);
4150 		fraghdr = (ip6_frag_t *)(hmp0->b_rptr + unfragmentable_len -
4151 		    sizeof (ip6_frag_t));
4152 
4153 		fip6h->ip6_plen = htons((uint16_t)(mlen +
4154 		    unfragmentable_len - IPV6_HDR_LEN));
4155 		/*
4156 		 * Note: Optimization alert.
4157 		 * In IPv6 (and IPv4) protocol header, Fragment Offset
4158 		 * ("offset") is 13 bits wide and in 8-octet units.
4159 		 * In IPv6 protocol header (unlike IPv4) in a 16 bit field,
4160 		 * it occupies the most significant 13 bits.
4161 		 * (least significant 13 bits in IPv4).
4162 		 * We do not do any shifts here. Not shifting is same effect
4163 		 * as taking offset value in octet units, dividing by 8 and
4164 		 * then shifting 3 bits left to line it up in place in proper
4165 		 * place protocol header.
4166 		 */
4167 		fraghdr->ip6f_offlg = htons(offset) | off_flags;
4168 
4169 		if (!(dmp = ip_carve_mp(&mp, mlen))) {
4170 			/* mp has already been freed by ip_carve_mp() */
4171 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4172 			ip_drop_output("FragFails: could not carve mp",
4173 			    hmp0, ill);
4174 			if (hmp != NULL)
4175 				freeb(hmp);
4176 			freeb(hmp0);
4177 			ip1dbg(("ip_carve_mp: failed\n"));
4178 			return (ENOBUFS);
4179 		}
4180 		hmp0->b_cont = dmp;
4181 		/* Get the priority marking, if any */
4182 		hmp0->b_band = priority;
4183 
4184 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
4185 
4186 		error = postfragfn(hmp0, nce, ixaflags,
4187 		    mlen + unfragmentable_len, xmit_hint, szone, nolzid,
4188 		    ixa_cookie);
4189 		if (error != 0 && error != EWOULDBLOCK && hmp != NULL) {
4190 			/* No point in sending the other fragments */
4191 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4192 			ip_drop_output("FragFails: postfragfn failed",
4193 			    hmp, ill);
4194 			freeb(hmp);
4195 			freemsg(mp);
4196 			return (error);
4197 		}
4198 		/* No need to redo state machine in loop */
4199 		ixaflags &= ~IXAF_REACH_CONF;
4200 
4201 		offset += mlen;
4202 	}
4203 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs);
4204 	return (error);
4205 }
4206 
4207 /*
4208  * Add a fragment header to an IPv6 packet.
4209  * Assumes that all the extension headers are contained in the first mblk.
4210  *
4211  * The fragment header is inserted after an hop-by-hop options header
4212  * and after [an optional destinations header followed by] a routing header.
4213  */
4214 mblk_t *
4215 ip_fraghdr_add_v6(mblk_t *mp, uint32_t ident, ip_xmit_attr_t *ixa)
4216 {
4217 	ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
4218 	ip6_t		*fip6h;
4219 	mblk_t		*hmp;
4220 	ip6_frag_t	*fraghdr;
4221 	size_t		unfragmentable_len;
4222 	uint8_t		nexthdr;
4223 	uint_t		prev_nexthdr_offset;
4224 	uint8_t		*ptr;
4225 	uint_t		priority = mp->b_band;
4226 	ip_stack_t	*ipst = ixa->ixa_ipst;
4227 
4228 	/*
4229 	 * Determine the length of the unfragmentable portion of this
4230 	 * datagram.  This consists of the IPv6 header, a potential
4231 	 * hop-by-hop options header, a potential pre-routing-header
4232 	 * destination options header, and a potential routing header.
4233 	 */
4234 	nexthdr = ip6h->ip6_nxt;
4235 	prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h;
4236 	ptr = (uint8_t *)&ip6h[1];
4237 
4238 	if (nexthdr == IPPROTO_HOPOPTS) {
4239 		ip6_hbh_t	*hbh_hdr;
4240 		uint_t		hdr_len;
4241 
4242 		hbh_hdr = (ip6_hbh_t *)ptr;
4243 		hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
4244 		nexthdr = hbh_hdr->ip6h_nxt;
4245 		prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt
4246 		    - (uint8_t *)ip6h;
4247 		ptr += hdr_len;
4248 	}
4249 	if (nexthdr == IPPROTO_DSTOPTS) {
4250 		ip6_dest_t	*dest_hdr;
4251 		uint_t		hdr_len;
4252 
4253 		dest_hdr = (ip6_dest_t *)ptr;
4254 		if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) {
4255 			hdr_len = 8 * (dest_hdr->ip6d_len + 1);
4256 			nexthdr = dest_hdr->ip6d_nxt;
4257 			prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt
4258 			    - (uint8_t *)ip6h;
4259 			ptr += hdr_len;
4260 		}
4261 	}
4262 	if (nexthdr == IPPROTO_ROUTING) {
4263 		ip6_rthdr_t	*rthdr;
4264 		uint_t		hdr_len;
4265 
4266 		rthdr = (ip6_rthdr_t *)ptr;
4267 		nexthdr = rthdr->ip6r_nxt;
4268 		prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt
4269 		    - (uint8_t *)ip6h;
4270 		hdr_len = 8 * (rthdr->ip6r_len + 1);
4271 		ptr += hdr_len;
4272 	}
4273 	unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h);
4274 
4275 	/*
4276 	 * Allocate an mblk with enough room for the link-layer
4277 	 * header, the unfragmentable part of the datagram, and the
4278 	 * fragment header.
4279 	 */
4280 	hmp = allocb_tmpl(unfragmentable_len + sizeof (ip6_frag_t) +
4281 	    ipst->ips_ip_wroff_extra, mp);
4282 	if (hmp == NULL) {
4283 		ill_t *ill = ixa->ixa_nce->nce_ill;
4284 
4285 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
4286 		ip_drop_output("ipIfStatsOutDiscards: allocb failure", mp, ill);
4287 		freemsg(mp);
4288 		return (NULL);
4289 	}
4290 	hmp->b_rptr += ipst->ips_ip_wroff_extra;
4291 	hmp->b_wptr = hmp->b_rptr + unfragmentable_len + sizeof (ip6_frag_t);
4292 
4293 	fip6h = (ip6_t *)hmp->b_rptr;
4294 	fraghdr = (ip6_frag_t *)(hmp->b_rptr + unfragmentable_len);
4295 
4296 	bcopy(ip6h, fip6h, unfragmentable_len);
4297 	fip6h->ip6_plen = htons(ntohs(fip6h->ip6_plen) + sizeof (ip6_frag_t));
4298 	hmp->b_rptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT;
4299 
4300 	fraghdr->ip6f_nxt = nexthdr;
4301 	fraghdr->ip6f_reserved = 0;
4302 	fraghdr->ip6f_offlg = 0;
4303 	fraghdr->ip6f_ident = htonl(ident);
4304 
4305 	/* Get the priority marking, if any */
4306 	hmp->b_band = priority;
4307 
4308 	/*
4309 	 * Move read ptr past unfragmentable portion, we don't want this part
4310 	 * of the data in our fragments.
4311 	 */
4312 	mp->b_rptr += unfragmentable_len;
4313 	hmp->b_cont = mp;
4314 	return (hmp);
4315 }
4316 
4317 /*
4318  * Determine if the ill and multicast aspects of that packets
4319  * "matches" the conn.
4320  */
4321 boolean_t
4322 conn_wantpacket_v6(conn_t *connp, ip_recv_attr_t *ira, ip6_t *ip6h)
4323 {
4324 	ill_t		*ill = ira->ira_rill;
4325 	zoneid_t	zoneid = ira->ira_zoneid;
4326 	uint_t		in_ifindex;
4327 	in6_addr_t	*v6dst_ptr = &ip6h->ip6_dst;
4328 	in6_addr_t	*v6src_ptr = &ip6h->ip6_src;
4329 
4330 	/*
4331 	 * conn_incoming_ifindex is set by IPV6_BOUND_IF and as link-local
4332 	 * scopeid. This is used to limit
4333 	 * unicast and multicast reception to conn_incoming_ifindex.
4334 	 * conn_wantpacket_v6 is called both for unicast and
4335 	 * multicast packets.
4336 	 */
4337 	in_ifindex = connp->conn_incoming_ifindex;
4338 
4339 	/* mpathd can bind to the under IPMP interface, which we allow */
4340 	if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) {
4341 		if (!IS_UNDER_IPMP(ill))
4342 			return (B_FALSE);
4343 
4344 		if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill))
4345 			return (B_FALSE);
4346 	}
4347 
4348 	if (!IPCL_ZONE_MATCH(connp, zoneid))
4349 		return (B_FALSE);
4350 
4351 	if (!(ira->ira_flags & IRAF_MULTICAST))
4352 		return (B_TRUE);
4353 
4354 	if (connp->conn_multi_router)
4355 		return (B_TRUE);
4356 
4357 	if (ira->ira_protocol == IPPROTO_RSVP)
4358 		return (B_TRUE);
4359 
4360 	return (conn_hasmembers_ill_withsrc_v6(connp, v6dst_ptr, v6src_ptr,
4361 	    ira->ira_ill));
4362 }
4363 
4364 /*
4365  * pr_addr_dbg function provides the needed buffer space to call
4366  * inet_ntop() function's 3rd argument. This function should be
4367  * used by any kernel routine which wants to save INET6_ADDRSTRLEN
4368  * stack buffer space in it's own stack frame. This function uses
4369  * a buffer from it's own stack and prints the information.
4370  * Example: pr_addr_dbg("func: no route for %s\n ", AF_INET, addr)
4371  *
4372  * Note:    This function can call inet_ntop() once.
4373  */
4374 void
4375 pr_addr_dbg(char *fmt1, int af, const void *addr)
4376 {
4377 	char	buf[INET6_ADDRSTRLEN];
4378 
4379 	if (fmt1 == NULL) {
4380 		ip0dbg(("pr_addr_dbg: Wrong arguments\n"));
4381 		return;
4382 	}
4383 
4384 	/*
4385 	 * This does not compare debug level and just prints
4386 	 * out. Thus it is the responsibility of the caller
4387 	 * to check the appropriate debug-level before calling
4388 	 * this function.
4389 	 */
4390 	if (ip_debug > 0) {
4391 		printf(fmt1, inet_ntop(af, addr, buf, sizeof (buf)));
4392 	}
4393 
4394 
4395 }
4396 
4397 
4398 /*
4399  * Return the length in bytes of the IPv6 headers (base header
4400  * extension headers) that will be needed based on the
4401  * ip_pkt_t structure passed by the caller.
4402  *
4403  * The returned length does not include the length of the upper level
4404  * protocol (ULP) header.
4405  */
4406 int
4407 ip_total_hdrs_len_v6(const ip_pkt_t *ipp)
4408 {
4409 	int len;
4410 
4411 	len = IPV6_HDR_LEN;
4412 
4413 	/*
4414 	 * If there's a security label here, then we ignore any hop-by-hop
4415 	 * options the user may try to set.
4416 	 */
4417 	if (ipp->ipp_fields & IPPF_LABEL_V6) {
4418 		uint_t hopoptslen;
4419 		/*
4420 		 * Note that ipp_label_len_v6 is just the option - not
4421 		 * the hopopts extension header. It also needs to be padded
4422 		 * to a multiple of 8 bytes.
4423 		 */
4424 		ASSERT(ipp->ipp_label_len_v6 != 0);
4425 		hopoptslen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t);
4426 		hopoptslen = (hopoptslen + 7)/8 * 8;
4427 		len += hopoptslen;
4428 	} else if (ipp->ipp_fields & IPPF_HOPOPTS) {
4429 		ASSERT(ipp->ipp_hopoptslen != 0);
4430 		len += ipp->ipp_hopoptslen;
4431 	}
4432 
4433 	/*
4434 	 * En-route destination options
4435 	 * Only do them if there's a routing header as well
4436 	 */
4437 	if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) ==
4438 	    (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) {
4439 		ASSERT(ipp->ipp_rthdrdstoptslen != 0);
4440 		len += ipp->ipp_rthdrdstoptslen;
4441 	}
4442 	if (ipp->ipp_fields & IPPF_RTHDR) {
4443 		ASSERT(ipp->ipp_rthdrlen != 0);
4444 		len += ipp->ipp_rthdrlen;
4445 	}
4446 	if (ipp->ipp_fields & IPPF_DSTOPTS) {
4447 		ASSERT(ipp->ipp_dstoptslen != 0);
4448 		len += ipp->ipp_dstoptslen;
4449 	}
4450 	return (len);
4451 }
4452 
4453 /*
4454  * All-purpose routine to build a header chain of an IPv6 header
4455  * followed by any required extension headers and a proto header.
4456  *
4457  * The caller has to set the source and destination address as well as
4458  * ip6_plen. The caller has to massage any routing header and compensate
4459  * for the ULP pseudo-header checksum due to the source route.
4460  *
4461  * The extension headers will all be fully filled in.
4462  */
4463 void
4464 ip_build_hdrs_v6(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp,
4465     uint8_t protocol, uint32_t flowinfo)
4466 {
4467 	uint8_t *nxthdr_ptr;
4468 	uint8_t *cp;
4469 	ip6_t	*ip6h = (ip6_t *)buf;
4470 
4471 	/* Initialize IPv6 header */
4472 	ip6h->ip6_vcf =
4473 	    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
4474 	    (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
4475 
4476 	if (ipp->ipp_fields & IPPF_TCLASS) {
4477 		/* Overrides the class part of flowinfo */
4478 		ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
4479 		    ipp->ipp_tclass);
4480 	}
4481 
4482 	if (ipp->ipp_fields & IPPF_HOPLIMIT)
4483 		ip6h->ip6_hops = ipp->ipp_hoplimit;
4484 	else
4485 		ip6h->ip6_hops = ipp->ipp_unicast_hops;
4486 
4487 	if ((ipp->ipp_fields & IPPF_ADDR) &&
4488 	    !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4489 		ip6h->ip6_src = ipp->ipp_addr;
4490 
4491 	nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt;
4492 	cp = (uint8_t *)&ip6h[1];
4493 	/*
4494 	 * Here's where we have to start stringing together
4495 	 * any extension headers in the right order:
4496 	 * Hop-by-hop, destination, routing, and final destination opts.
4497 	 */
4498 	/*
4499 	 * If there's a security label here, then we ignore any hop-by-hop
4500 	 * options the user may try to set.
4501 	 */
4502 	if (ipp->ipp_fields & IPPF_LABEL_V6) {
4503 		/*
4504 		 * Hop-by-hop options with the label.
4505 		 * Note that ipp_label_v6 is just the option - not
4506 		 * the hopopts extension header. It also needs to be padded
4507 		 * to a multiple of 8 bytes.
4508 		 */
4509 		ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
4510 		uint_t hopoptslen;
4511 		uint_t padlen;
4512 
4513 		padlen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t);
4514 		hopoptslen = (padlen + 7)/8 * 8;
4515 		padlen = hopoptslen - padlen;
4516 
4517 		*nxthdr_ptr = IPPROTO_HOPOPTS;
4518 		nxthdr_ptr = &hbh->ip6h_nxt;
4519 		hbh->ip6h_len = hopoptslen/8 - 1;
4520 		cp += sizeof (ip6_hbh_t);
4521 		bcopy(ipp->ipp_label_v6, cp, ipp->ipp_label_len_v6);
4522 		cp += ipp->ipp_label_len_v6;
4523 
4524 		ASSERT(padlen <= 7);
4525 		switch (padlen) {
4526 		case 0:
4527 			break;
4528 		case 1:
4529 			cp[0] = IP6OPT_PAD1;
4530 			break;
4531 		default:
4532 			cp[0] = IP6OPT_PADN;
4533 			cp[1] = padlen - 2;
4534 			bzero(&cp[2], padlen - 2);
4535 			break;
4536 		}
4537 		cp += padlen;
4538 	} else if (ipp->ipp_fields & IPPF_HOPOPTS) {
4539 		/* Hop-by-hop options */
4540 		ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
4541 
4542 		*nxthdr_ptr = IPPROTO_HOPOPTS;
4543 		nxthdr_ptr = &hbh->ip6h_nxt;
4544 
4545 		bcopy(ipp->ipp_hopopts, cp, ipp->ipp_hopoptslen);
4546 		cp += ipp->ipp_hopoptslen;
4547 	}
4548 	/*
4549 	 * En-route destination options
4550 	 * Only do them if there's a routing header as well
4551 	 */
4552 	if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) ==
4553 	    (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) {
4554 		ip6_dest_t *dst = (ip6_dest_t *)cp;
4555 
4556 		*nxthdr_ptr = IPPROTO_DSTOPTS;
4557 		nxthdr_ptr = &dst->ip6d_nxt;
4558 
4559 		bcopy(ipp->ipp_rthdrdstopts, cp, ipp->ipp_rthdrdstoptslen);
4560 		cp += ipp->ipp_rthdrdstoptslen;
4561 	}
4562 	/*
4563 	 * Routing header next
4564 	 */
4565 	if (ipp->ipp_fields & IPPF_RTHDR) {
4566 		ip6_rthdr_t *rt = (ip6_rthdr_t *)cp;
4567 
4568 		*nxthdr_ptr = IPPROTO_ROUTING;
4569 		nxthdr_ptr = &rt->ip6r_nxt;
4570 
4571 		bcopy(ipp->ipp_rthdr, cp, ipp->ipp_rthdrlen);
4572 		cp += ipp->ipp_rthdrlen;
4573 	}
4574 	/*
4575 	 * Do ultimate destination options
4576 	 */
4577 	if (ipp->ipp_fields & IPPF_DSTOPTS) {
4578 		ip6_dest_t *dest = (ip6_dest_t *)cp;
4579 
4580 		*nxthdr_ptr = IPPROTO_DSTOPTS;
4581 		nxthdr_ptr = &dest->ip6d_nxt;
4582 
4583 		bcopy(ipp->ipp_dstopts, cp, ipp->ipp_dstoptslen);
4584 		cp += ipp->ipp_dstoptslen;
4585 	}
4586 	/*
4587 	 * Now set the last header pointer to the proto passed in
4588 	 */
4589 	*nxthdr_ptr = protocol;
4590 	ASSERT((int)(cp - buf) == buf_len);
4591 }
4592 
4593 /*
4594  * Return a pointer to the routing header extension header
4595  * in the IPv6 header(s) chain passed in.
4596  * If none found, return NULL
4597  * Assumes that all extension headers are in same mblk as the v6 header
4598  */
4599 ip6_rthdr_t *
4600 ip_find_rthdr_v6(ip6_t *ip6h, uint8_t *endptr)
4601 {
4602 	ip6_dest_t	*desthdr;
4603 	ip6_frag_t	*fraghdr;
4604 	uint_t		hdrlen;
4605 	uint8_t		nexthdr;
4606 	uint8_t		*ptr = (uint8_t *)&ip6h[1];
4607 
4608 	if (ip6h->ip6_nxt == IPPROTO_ROUTING)
4609 		return ((ip6_rthdr_t *)ptr);
4610 
4611 	/*
4612 	 * The routing header will precede all extension headers
4613 	 * other than the hop-by-hop and destination options
4614 	 * extension headers, so if we see anything other than those,
4615 	 * we're done and didn't find it.
4616 	 * We could see a destination options header alone but no
4617 	 * routing header, in which case we'll return NULL as soon as
4618 	 * we see anything after that.
4619 	 * Hop-by-hop and destination option headers are identical,
4620 	 * so we can use either one we want as a template.
4621 	 */
4622 	nexthdr = ip6h->ip6_nxt;
4623 	while (ptr < endptr) {
4624 		/* Is there enough left for len + nexthdr? */
4625 		if (ptr + MIN_EHDR_LEN > endptr)
4626 			return (NULL);
4627 
4628 		switch (nexthdr) {
4629 		case IPPROTO_HOPOPTS:
4630 		case IPPROTO_DSTOPTS:
4631 			/* Assumes the headers are identical for hbh and dst */
4632 			desthdr = (ip6_dest_t *)ptr;
4633 			hdrlen = 8 * (desthdr->ip6d_len + 1);
4634 			nexthdr = desthdr->ip6d_nxt;
4635 			break;
4636 
4637 		case IPPROTO_ROUTING:
4638 			return ((ip6_rthdr_t *)ptr);
4639 
4640 		case IPPROTO_FRAGMENT:
4641 			fraghdr = (ip6_frag_t *)ptr;
4642 			hdrlen = sizeof (ip6_frag_t);
4643 			nexthdr = fraghdr->ip6f_nxt;
4644 			break;
4645 
4646 		default:
4647 			return (NULL);
4648 		}
4649 		ptr += hdrlen;
4650 	}
4651 	return (NULL);
4652 }
4653 
4654 /*
4655  * Called for source-routed packets originating on this node.
4656  * Manipulates the original routing header by moving every entry up
4657  * one slot, placing the first entry in the v6 header's v6_dst field,
4658  * and placing the ultimate destination in the routing header's last
4659  * slot.
4660  *
4661  * Returns the checksum diference between the ultimate destination
4662  * (last hop in the routing header when the packet is sent) and
4663  * the first hop (ip6_dst when the packet is sent)
4664  */
4665 /* ARGSUSED2 */
4666 uint32_t
4667 ip_massage_options_v6(ip6_t *ip6h, ip6_rthdr_t *rth, netstack_t *ns)
4668 {
4669 	uint_t		numaddr;
4670 	uint_t		i;
4671 	in6_addr_t	*addrptr;
4672 	in6_addr_t	tmp;
4673 	ip6_rthdr0_t	*rthdr = (ip6_rthdr0_t *)rth;
4674 	uint32_t	cksm;
4675 	uint32_t	addrsum = 0;
4676 	uint16_t	*ptr;
4677 
4678 	/*
4679 	 * Perform any processing needed for source routing.
4680 	 * We know that all extension headers will be in the same mblk
4681 	 * as the IPv6 header.
4682 	 */
4683 
4684 	/*
4685 	 * If no segments left in header, or the header length field is zero,
4686 	 * don't move hop addresses around;
4687 	 * Checksum difference is zero.
4688 	 */
4689 	if ((rthdr->ip6r0_segleft == 0) || (rthdr->ip6r0_len == 0))
4690 		return (0);
4691 
4692 	ptr = (uint16_t *)&ip6h->ip6_dst;
4693 	cksm = 0;
4694 	for (i = 0; i < (sizeof (in6_addr_t) / sizeof (uint16_t)); i++) {
4695 		cksm += ptr[i];
4696 	}
4697 	cksm = (cksm & 0xFFFF) + (cksm >> 16);
4698 
4699 	/*
4700 	 * Here's where the fun begins - we have to
4701 	 * move all addresses up one spot, take the
4702 	 * first hop and make it our first ip6_dst,
4703 	 * and place the ultimate destination in the
4704 	 * newly-opened last slot.
4705 	 */
4706 	addrptr = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr));
4707 	numaddr = rthdr->ip6r0_len / 2;
4708 	tmp = *addrptr;
4709 	for (i = 0; i < (numaddr - 1); addrptr++, i++) {
4710 		*addrptr = addrptr[1];
4711 	}
4712 	*addrptr = ip6h->ip6_dst;
4713 	ip6h->ip6_dst = tmp;
4714 
4715 	/*
4716 	 * From the checksummed ultimate destination subtract the checksummed
4717 	 * current ip6_dst (the first hop address). Return that number.
4718 	 * (In the v4 case, the second part of this is done in each routine
4719 	 *  that calls ip_massage_options(). We do it all in this one place
4720 	 *  for v6).
4721 	 */
4722 	ptr = (uint16_t *)&ip6h->ip6_dst;
4723 	for (i = 0; i < (sizeof (in6_addr_t) / sizeof (uint16_t)); i++) {
4724 		addrsum += ptr[i];
4725 	}
4726 	cksm -= ((addrsum >> 16) + (addrsum & 0xFFFF));
4727 	if ((int)cksm < 0)
4728 		cksm--;
4729 	cksm = (cksm & 0xFFFF) + (cksm >> 16);
4730 
4731 	return (cksm);
4732 }
4733 
4734 void
4735 *ip6_kstat_init(netstackid_t stackid, ip6_stat_t *ip6_statisticsp)
4736 {
4737 	kstat_t *ksp;
4738 
4739 	ip6_stat_t template = {
4740 		{ "ip6_udp_fannorm",	KSTAT_DATA_UINT64 },
4741 		{ "ip6_udp_fanmb",	KSTAT_DATA_UINT64 },
4742 		{ "ip6_recv_pullup",		KSTAT_DATA_UINT64 },
4743 		{ "ip6_db_ref",			KSTAT_DATA_UINT64 },
4744 		{ "ip6_notaligned",		KSTAT_DATA_UINT64 },
4745 		{ "ip6_multimblk",		KSTAT_DATA_UINT64 },
4746 		{ "ipsec_proto_ahesp",		KSTAT_DATA_UINT64 },
4747 		{ "ip6_out_sw_cksum",			KSTAT_DATA_UINT64 },
4748 		{ "ip6_out_sw_cksum_bytes",		KSTAT_DATA_UINT64 },
4749 		{ "ip6_in_sw_cksum",			KSTAT_DATA_UINT64 },
4750 		{ "ip6_tcp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
4751 		{ "ip6_tcp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
4752 		{ "ip6_tcp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
4753 		{ "ip6_udp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
4754 		{ "ip6_udp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
4755 		{ "ip6_udp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
4756 	};
4757 	ksp = kstat_create_netstack("ip", 0, "ip6stat", "net",
4758 	    KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t),
4759 	    KSTAT_FLAG_VIRTUAL, stackid);
4760 
4761 	if (ksp == NULL)
4762 		return (NULL);
4763 
4764 	bcopy(&template, ip6_statisticsp, sizeof (template));
4765 	ksp->ks_data = (void *)ip6_statisticsp;
4766 	ksp->ks_private = (void *)(uintptr_t)stackid;
4767 
4768 	kstat_install(ksp);
4769 	return (ksp);
4770 }
4771 
4772 void
4773 ip6_kstat_fini(netstackid_t stackid, kstat_t *ksp)
4774 {
4775 	if (ksp != NULL) {
4776 		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
4777 		kstat_delete_netstack(ksp, stackid);
4778 	}
4779 }
4780 
4781 /*
4782  * The following two functions set and get the value for the
4783  * IPV6_SRC_PREFERENCES socket option.
4784  */
4785 int
4786 ip6_set_src_preferences(ip_xmit_attr_t *ixa, uint32_t prefs)
4787 {
4788 	/*
4789 	 * We only support preferences that are covered by
4790 	 * IPV6_PREFER_SRC_MASK.
4791 	 */
4792 	if (prefs & ~IPV6_PREFER_SRC_MASK)
4793 		return (EINVAL);
4794 
4795 	/*
4796 	 * Look for conflicting preferences or default preferences.  If
4797 	 * both bits of a related pair are clear, the application wants the
4798 	 * system's default value for that pair.  Both bits in a pair can't
4799 	 * be set.
4800 	 */
4801 	if ((prefs & IPV6_PREFER_SRC_MIPMASK) == 0) {
4802 		prefs |= IPV6_PREFER_SRC_MIPDEFAULT;
4803 	} else if ((prefs & IPV6_PREFER_SRC_MIPMASK) ==
4804 	    IPV6_PREFER_SRC_MIPMASK) {
4805 		return (EINVAL);
4806 	}
4807 	if ((prefs & IPV6_PREFER_SRC_TMPMASK) == 0) {
4808 		prefs |= IPV6_PREFER_SRC_TMPDEFAULT;
4809 	} else if ((prefs & IPV6_PREFER_SRC_TMPMASK) ==
4810 	    IPV6_PREFER_SRC_TMPMASK) {
4811 		return (EINVAL);
4812 	}
4813 	if ((prefs & IPV6_PREFER_SRC_CGAMASK) == 0) {
4814 		prefs |= IPV6_PREFER_SRC_CGADEFAULT;
4815 	} else if ((prefs & IPV6_PREFER_SRC_CGAMASK) ==
4816 	    IPV6_PREFER_SRC_CGAMASK) {
4817 		return (EINVAL);
4818 	}
4819 
4820 	ixa->ixa_src_preferences = prefs;
4821 	return (0);
4822 }
4823 
4824 size_t
4825 ip6_get_src_preferences(ip_xmit_attr_t *ixa, uint32_t *val)
4826 {
4827 	*val = ixa->ixa_src_preferences;
4828 	return (sizeof (ixa->ixa_src_preferences));
4829 }
4830 
4831 /*
4832  * Get the size of the IP options (including the IP headers size)
4833  * without including the AH header's size. If till_ah is B_FALSE,
4834  * and if AH header is present, dest options beyond AH header will
4835  * also be included in the returned size.
4836  */
4837 int
4838 ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah)
4839 {
4840 	ip6_t *ip6h;
4841 	uint8_t nexthdr;
4842 	uint8_t *whereptr;
4843 	ip6_hbh_t *hbhhdr;
4844 	ip6_dest_t *dsthdr;
4845 	ip6_rthdr_t *rthdr;
4846 	int ehdrlen;
4847 	int size;
4848 	ah_t *ah;
4849 
4850 	ip6h = (ip6_t *)mp->b_rptr;
4851 	size = IPV6_HDR_LEN;
4852 	nexthdr = ip6h->ip6_nxt;
4853 	whereptr = (uint8_t *)&ip6h[1];
4854 	for (;;) {
4855 		/* Assume IP has already stripped it */
4856 		ASSERT(nexthdr != IPPROTO_FRAGMENT);
4857 		switch (nexthdr) {
4858 		case IPPROTO_HOPOPTS:
4859 			hbhhdr = (ip6_hbh_t *)whereptr;
4860 			nexthdr = hbhhdr->ip6h_nxt;
4861 			ehdrlen = 8 * (hbhhdr->ip6h_len + 1);
4862 			break;
4863 		case IPPROTO_DSTOPTS:
4864 			dsthdr = (ip6_dest_t *)whereptr;
4865 			nexthdr = dsthdr->ip6d_nxt;
4866 			ehdrlen = 8 * (dsthdr->ip6d_len + 1);
4867 			break;
4868 		case IPPROTO_ROUTING:
4869 			rthdr = (ip6_rthdr_t *)whereptr;
4870 			nexthdr = rthdr->ip6r_nxt;
4871 			ehdrlen = 8 * (rthdr->ip6r_len + 1);
4872 			break;
4873 		default :
4874 			if (till_ah) {
4875 				ASSERT(nexthdr == IPPROTO_AH);
4876 				return (size);
4877 			}
4878 			/*
4879 			 * If we don't have a AH header to traverse,
4880 			 * return now. This happens normally for
4881 			 * outbound datagrams where we have not inserted
4882 			 * the AH header.
4883 			 */
4884 			if (nexthdr != IPPROTO_AH) {
4885 				return (size);
4886 			}
4887 
4888 			/*
4889 			 * We don't include the AH header's size
4890 			 * to be symmetrical with other cases where
4891 			 * we either don't have a AH header (outbound)
4892 			 * or peek into the AH header yet (inbound and
4893 			 * not pulled up yet).
4894 			 */
4895 			ah = (ah_t *)whereptr;
4896 			nexthdr = ah->ah_nexthdr;
4897 			ehdrlen = (ah->ah_length << 2) + 8;
4898 
4899 			if (nexthdr == IPPROTO_DSTOPTS) {
4900 				if (whereptr + ehdrlen >= mp->b_wptr) {
4901 					/*
4902 					 * The destination options header
4903 					 * is not part of the first mblk.
4904 					 */
4905 					whereptr = mp->b_cont->b_rptr;
4906 				} else {
4907 					whereptr += ehdrlen;
4908 				}
4909 
4910 				dsthdr = (ip6_dest_t *)whereptr;
4911 				ehdrlen = 8 * (dsthdr->ip6d_len + 1);
4912 				size += ehdrlen;
4913 			}
4914 			return (size);
4915 		}
4916 		whereptr += ehdrlen;
4917 		size += ehdrlen;
4918 	}
4919 }
4920 
4921 /*
4922  * Utility routine that checks if `v6srcp' is a valid address on underlying
4923  * interface `ill'.  If `ipifp' is non-NULL, it's set to a held ipif
4924  * associated with `v6srcp' on success.  NOTE: if this is not called from
4925  * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the
4926  * group during or after this lookup.
4927  */
4928 boolean_t
4929 ipif_lookup_testaddr_v6(ill_t *ill, const in6_addr_t *v6srcp, ipif_t **ipifp)
4930 {
4931 	ipif_t *ipif;
4932 
4933 
4934 	ipif = ipif_lookup_addr_exact_v6(v6srcp, ill, ill->ill_ipst);
4935 	if (ipif != NULL) {
4936 		if (ipifp != NULL)
4937 			*ipifp = ipif;
4938 		else
4939 			ipif_refrele(ipif);
4940 		return (B_TRUE);
4941 	}
4942 
4943 	if (ip_debug > 2) {
4944 		pr_addr_dbg("ipif_lookup_testaddr_v6: cannot find ipif for "
4945 		    "src %s\n", AF_INET6, v6srcp);
4946 	}
4947 	return (B_FALSE);
4948 }
4949