xref: /freebsd/sys/netinet6/ip6_output.c (revision d8b878873e7aa8df1972cc6a642804b17eb61087)
1 /*-
2  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the project nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *	$KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $
30  */
31 
32 /*-
33  * Copyright (c) 1982, 1986, 1988, 1990, 1993
34  *	The Regents of the University of California.  All rights reserved.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 4. Neither the name of the University nor the names of its contributors
45  *    may be used to endorse or promote products derived from this software
46  *    without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58  * SUCH DAMAGE.
59  *
60  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
61  */
62 
63 #include <sys/cdefs.h>
64 __FBSDID("$FreeBSD$");
65 
66 #include "opt_inet.h"
67 #include "opt_inet6.h"
68 #include "opt_ipsec.h"
69 #include "opt_sctp.h"
70 
71 #include <sys/param.h>
72 #include <sys/kernel.h>
73 #include <sys/malloc.h>
74 #include <sys/mbuf.h>
75 #include <sys/errno.h>
76 #include <sys/priv.h>
77 #include <sys/proc.h>
78 #include <sys/protosw.h>
79 #include <sys/socket.h>
80 #include <sys/socketvar.h>
81 #include <sys/syslog.h>
82 #include <sys/ucred.h>
83 
84 #include <net/if.h>
85 #include <net/netisr.h>
86 #include <net/route.h>
87 #include <net/pfil.h>
88 #include <net/vnet.h>
89 
90 #include <netinet/in.h>
91 #include <netinet/in_var.h>
92 #include <netinet6/in6_var.h>
93 #include <netinet/ip6.h>
94 #include <netinet/icmp6.h>
95 #include <netinet6/ip6_var.h>
96 #include <netinet/in_pcb.h>
97 #include <netinet/tcp_var.h>
98 #include <netinet6/nd6.h>
99 
100 #ifdef IPSEC
101 #include <netipsec/ipsec.h>
102 #include <netipsec/ipsec6.h>
103 #include <netipsec/key.h>
104 #include <netinet6/ip6_ipsec.h>
105 #endif /* IPSEC */
106 #ifdef SCTP
107 #include <netinet/sctp.h>
108 #include <netinet/sctp_crc32.h>
109 #endif
110 
111 #include <netinet6/ip6protosw.h>
112 #include <netinet6/scope6_var.h>
113 
114 extern int in6_mcast_loop;
115 
/*
 * Working set of mbufs used while ip6_output() assembles a packet: one
 * slot per header that is prepared separately and chained in later.
 * ip6_output() zeroes the whole structure before filling it, and treats
 * a NULL slot as "this header is not present".
 */
116 struct ip6_exthdrs {
117 	struct mbuf *ip6e_ip6;	/* mbuf holding the IPv6 header after ip6_splithdr() */
118 	struct mbuf *ip6e_hbh;	/* Hop-by-Hop options header */
119 	struct mbuf *ip6e_dest1;	/* Destination options, 1st part (built only with a routing header) */
120 	struct mbuf *ip6e_rthdr;	/* Routing header */
121 	struct mbuf *ip6e_dest2;	/* Destination options, 2nd part */
122 };
123 
/*
 * Prototypes for helpers private to this file (all static).
 *
 * All are written as plain ANSI prototypes, with continuation lines
 * indented four spaces.
 *
 * First group: option handling routines; their definitions are not
 * adjacent to these declarations -- presumably the socket/pcb option
 * paths, TODO confirm against the definitions.
 * Second group: helpers used while ip6_output() builds the outgoing
 * packet (extension header copy, header split, jumbo payload option,
 * fragment header insertion, path MTU lookup), plus copypktopts().
 */
124 static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **,
125     struct ucred *, int);
126 static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *,
127     struct socket *, struct sockopt *);
128 static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *);
129 static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *,
130     struct ucred *, int, int, int);
131 
132 static int ip6_copyexthdr(struct mbuf **, caddr_t, int);
133 static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
134     struct ip6_frag **);
135 static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
136 static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
137 static int ip6_getpmtu(struct route_in6 *, struct route_in6 *,
138     struct ifnet *, struct in6_addr *, u_long *, int *);
139 static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
140 
141 
142 /*
143  * Make an extension header from option data.  hp is the source, and
144  * mp is the destination.
145  */
/*
 * Usage notes:
 *  - Does nothing when hp is NULL.
 *  - Otherwise hp is viewed as a struct ip6_ext and (ip6e_len + 1) * 8
 *    bytes are handed to ip6_copyexthdr(), which fills in *mp.
 *  - Not self-contained: the result is stored in the expanding
 *    function's `error' variable, and on a non-zero result control
 *    jumps to that function's `freehdrs' label.
 *  - hp is evaluated more than once, so pass an expression without
 *    side effects.
 */
146 #define MAKE_EXTHDR(hp, mp)						\
147     do {								\
148 	if (hp) {							\
149 		struct ip6_ext *eh = (struct ip6_ext *)(hp);		\
150 		error = ip6_copyexthdr((mp), (caddr_t)(hp),		\
151 		    ((eh)->ip6e_len + 1) << 3);				\
152 		if (error)						\
153 			goto freehdrs;					\
154 	}								\
155     } while (/*CONSTCOND*/ 0)
156 
157 /*
158  * Form a chain of extension headers.
159  * m is the extension header mbuf
160  * mp is the previous mbuf in the chain
161  * p is the next header
162  * i is the type of option.
163  */
/*
 * Usage notes:
 *  - Does nothing when m is NULL.
 *  - Panics if the expanding function's `hdrsplit' variable is zero.
 *  - The byte currently at *p is saved into the first byte of m's data,
 *    *p is overwritten with i, and p is then re-pointed at that first
 *    byte of m, so the next invocation patches this header instead.
 *  - m is linked in right after mp, and mp is advanced to m.
 *  - p and mp are assigned to, so both must be modifiable lvalues;
 *    m, mp and p are each evaluated more than once.
 */
164 #define MAKE_CHAIN(m, mp, p, i)\
165     do {\
166 	if (m) {\
167 		if (!hdrsplit) \
168 			panic("assumption failed: hdr not split"); \
169 		*mtod((m), u_char *) = *(p);\
170 		*(p) = (i);\
171 		p = mtod((m), u_char *);\
172 		(m)->m_next = (mp)->m_next;\
173 		(mp)->m_next = (m);\
174 		(mp) = (m);\
175 	}\
176     } while (/*CONSTCOND*/ 0)
177 
178 /*
179  * IP6 output. The packet in mbuf chain m contains a skeletal IP6
180  * header (with pri, len, nxt, hlim, src, dst).
181  * This function may modify ver and hlim only.
182  * The mbuf chain containing the packet will be freed.
183  * The mbuf opt, if present, will not be freed.
184  *
185  * type of "mtu": rt_rmx.rmx_mtu is u_long, ifnet.ifr_mtu is int, and
186  * nd_ifinfo.linkmtu is u_int32_t.  so we use u_long to hold largest one,
187  * which is rt_rmx.rmx_mtu.
188  *
189  * ifpp - XXX: just for statistics
190  */
191 int
192 ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
193     struct route_in6 *ro, int flags, struct ip6_moptions *im6o,
194     struct ifnet **ifpp, struct inpcb *inp)
195 {
196 	struct ip6_hdr *ip6, *mhip6;
197 	struct ifnet *ifp, *origifp;
198 	struct mbuf *m = m0;
199 	struct mbuf *mprev = NULL;
200 	int hlen, tlen, len, off;
201 	struct route_in6 ip6route;
202 	struct rtentry *rt = NULL;
203 	struct sockaddr_in6 *dst, src_sa, dst_sa;
204 	struct in6_addr odst;
205 	int error = 0;
206 	struct in6_ifaddr *ia = NULL;
207 	u_long mtu;
208 	int alwaysfrag, dontfrag;
209 	u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
210 	struct ip6_exthdrs exthdrs;
211 	struct in6_addr finaldst, src0, dst0;
212 	u_int32_t zone;
213 	struct route_in6 *ro_pmtu = NULL;
214 	int hdrsplit = 0;
215 	int needipsec = 0;
216 #ifdef SCTP
217 	int sw_csum;
218 #endif
219 #ifdef IPSEC
220 	struct ipsec_output_state state;
221 	struct ip6_rthdr *rh = NULL;
222 	int needipsectun = 0;
223 	int segleft_org = 0;
224 	struct secpolicy *sp = NULL;
225 #endif /* IPSEC */
226 
227 	ip6 = mtod(m, struct ip6_hdr *);
228 	if (ip6 == NULL) {
229 		printf ("ip6 is NULL");
230 		goto bad;
231 	}
232 
233 	finaldst = ip6->ip6_dst;
234 
235 	bzero(&exthdrs, sizeof(exthdrs));
236 
237 	if (opt) {
238 		/* Hop-by-Hop options header */
239 		MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh);
240 		/* Destination options header(1st part) */
241 		if (opt->ip6po_rthdr) {
242 			/*
243 			 * Destination options header(1st part)
244 			 * This only makes sense with a routing header.
245 			 * See Section 9.2 of RFC 3542.
246 			 * Disabling this part just for MIP6 convenience is
247 			 * a bad idea.  We need to think carefully about a
248 			 * way to make the advanced API coexist with MIP6
249 			 * options, which might automatically be inserted in
250 			 * the kernel.
251 			 */
252 			MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1);
253 		}
254 		/* Routing header */
255 		MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr);
256 		/* Destination options header(2nd part) */
257 		MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2);
258 	}
259 
260 	/*
261 	 * IPSec checking which handles several cases.
262 	 * FAST IPSEC: We re-injected the packet.
263 	 */
264 #ifdef IPSEC
265 	switch(ip6_ipsec_output(&m, inp, &flags, &error, &ifp, &sp))
266 	{
267 	case 1:                 /* Bad packet */
268 		goto freehdrs;
269 	case -1:                /* Do IPSec */
270 		needipsec = 1;
271 	case 0:                 /* No IPSec */
272 	default:
273 		break;
274 	}
275 #endif /* IPSEC */
276 
277 	/*
278 	 * Calculate the total length of the extension header chain.
279 	 * Keep the length of the unfragmentable part for fragmentation.
280 	 */
281 	optlen = 0;
282 	if (exthdrs.ip6e_hbh)
283 		optlen += exthdrs.ip6e_hbh->m_len;
284 	if (exthdrs.ip6e_dest1)
285 		optlen += exthdrs.ip6e_dest1->m_len;
286 	if (exthdrs.ip6e_rthdr)
287 		optlen += exthdrs.ip6e_rthdr->m_len;
288 	unfragpartlen = optlen + sizeof(struct ip6_hdr);
289 
290 	/* NOTE: we don't add AH/ESP length here. do that later. */
291 	if (exthdrs.ip6e_dest2)
292 		optlen += exthdrs.ip6e_dest2->m_len;
293 
294 	/*
295 	 * If we need IPsec, or there is at least one extension header,
296 	 * separate IP6 header from the payload.
297 	 */
298 	if ((needipsec || optlen) && !hdrsplit) {
299 		if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
300 			m = NULL;
301 			goto freehdrs;
302 		}
303 		m = exthdrs.ip6e_ip6;
304 		hdrsplit++;
305 	}
306 
307 	/* adjust pointer */
308 	ip6 = mtod(m, struct ip6_hdr *);
309 
310 	/* adjust mbuf packet header length */
311 	m->m_pkthdr.len += optlen;
312 	plen = m->m_pkthdr.len - sizeof(*ip6);
313 
314 	/* If this is a jumbo payload, insert a jumbo payload option. */
315 	if (plen > IPV6_MAXPACKET) {
316 		if (!hdrsplit) {
317 			if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
318 				m = NULL;
319 				goto freehdrs;
320 			}
321 			m = exthdrs.ip6e_ip6;
322 			hdrsplit++;
323 		}
324 		/* adjust pointer */
325 		ip6 = mtod(m, struct ip6_hdr *);
326 		if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0)
327 			goto freehdrs;
328 		ip6->ip6_plen = 0;
329 	} else
330 		ip6->ip6_plen = htons(plen);
331 
332 	/*
333 	 * Concatenate headers and fill in next header fields.
334 	 * Here we have, on "m"
335 	 *	IPv6 payload
336 	 * and we insert headers accordingly.  Finally, we should be getting:
337 	 *	IPv6 hbh dest1 rthdr ah* [esp* dest2 payload]
338 	 *
339 	 * during the header composing process, "m" points to IPv6 header.
340 	 * "mprev" points to an extension header prior to esp.
341 	 */
342 	u_char *nexthdrp = &ip6->ip6_nxt;
343 	mprev = m;
344 
345 	/*
346 	 * we treat dest2 specially.  this makes IPsec processing
347 	 * much easier.  the goal here is to make mprev point the
348 	 * mbuf prior to dest2.
349 	 *
350 	 * result: IPv6 dest2 payload
351 	 * m and mprev will point to IPv6 header.
352 	 */
353 	if (exthdrs.ip6e_dest2) {
354 		if (!hdrsplit)
355 			panic("assumption failed: hdr not split");
356 		exthdrs.ip6e_dest2->m_next = m->m_next;
357 		m->m_next = exthdrs.ip6e_dest2;
358 		*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
359 		ip6->ip6_nxt = IPPROTO_DSTOPTS;
360 	}
361 
362 	/*
363 	 * result: IPv6 hbh dest1 rthdr dest2 payload
364 	 * m will point to IPv6 header.  mprev will point to the
365 	 * extension header prior to dest2 (rthdr in the above case).
366 	 */
367 	MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
368 	MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
369 		   IPPROTO_DSTOPTS);
370 	MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
371 		   IPPROTO_ROUTING);
372 
373 #ifdef IPSEC
374 	if (!needipsec)
375 		goto skip_ipsec2;
376 
377 	/*
378 	 * pointers after IPsec headers are not valid any more.
379 	 * other pointers need a great care too.
380 	 * (IPsec routines should not mangle mbufs prior to AH/ESP)
381 	 */
382 	exthdrs.ip6e_dest2 = NULL;
383 
384 	if (exthdrs.ip6e_rthdr) {
385 		rh = mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *);
386 		segleft_org = rh->ip6r_segleft;
387 		rh->ip6r_segleft = 0;
388 	}
389 
390 	bzero(&state, sizeof(state));
391 	state.m = m;
392 	error = ipsec6_output_trans(&state, nexthdrp, mprev, sp, flags,
393 				    &needipsectun);
394 	m = state.m;
395 	if (error == EJUSTRETURN) {
396 		/*
397 		 * We had a SP with a level of 'use' and no SA. We
398 		 * will just continue to process the packet without
399 		 * IPsec processing.
400 		 */
401 		;
402 	} else if (error) {
403 		/* mbuf is already reclaimed in ipsec6_output_trans. */
404 		m = NULL;
405 		switch (error) {
406 		case EHOSTUNREACH:
407 		case ENETUNREACH:
408 		case EMSGSIZE:
409 		case ENOBUFS:
410 		case ENOMEM:
411 			break;
412 		default:
413 			printf("[%s:%d] (ipsec): error code %d\n",
414 			    __func__, __LINE__, error);
415 			/* FALLTHROUGH */
416 		case ENOENT:
417 			/* don't show these error codes to the user */
418 			error = 0;
419 			break;
420 		}
421 		goto bad;
422 	} else if (!needipsectun) {
423 		/*
424 		 * In the FAST IPSec case we have already
425 		 * re-injected the packet and it has been freed
426 		 * by the ipsec_done() function.  So, just clean
427 		 * up after ourselves.
428 		 */
429 		m = NULL;
430 		goto done;
431 	}
432 	if (exthdrs.ip6e_rthdr) {
433 		/* ah6_output doesn't modify mbuf chain */
434 		rh->ip6r_segleft = segleft_org;
435 	}
436 skip_ipsec2:;
437 #endif /* IPSEC */
438 
439 	/*
440 	 * If there is a routing header, discard the packet.
441 	 */
442 	if (exthdrs.ip6e_rthdr) {
443 		 error = EINVAL;
444 		 goto bad;
445 	}
446 
447 	/* Source address validation */
448 	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
449 	    (flags & IPV6_UNSPECSRC) == 0) {
450 		error = EOPNOTSUPP;
451 		V_ip6stat.ip6s_badscope++;
452 		goto bad;
453 	}
454 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
455 		error = EOPNOTSUPP;
456 		V_ip6stat.ip6s_badscope++;
457 		goto bad;
458 	}
459 
460 	V_ip6stat.ip6s_localout++;
461 
462 	/*
463 	 * Route packet.
464 	 */
465 	if (ro == 0) {
466 		ro = &ip6route;
467 		bzero((caddr_t)ro, sizeof(*ro));
468 	}
469 	ro_pmtu = ro;
470 	if (opt && opt->ip6po_rthdr)
471 		ro = &opt->ip6po_route;
472 	dst = (struct sockaddr_in6 *)&ro->ro_dst;
473 
474 again:
475 	/*
476 	 * if specified, try to fill in the traffic class field.
477 	 * do not override if a non-zero value is already set.
478 	 * we check the diffserv field and the ecn field separately.
479 	 */
480 	if (opt && opt->ip6po_tclass >= 0) {
481 		int mask = 0;
482 
483 		if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0)
484 			mask |= 0xfc;
485 		if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0)
486 			mask |= 0x03;
487 		if (mask != 0)
488 			ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
489 	}
490 
491 	/* fill in or override the hop limit field, if necessary. */
492 	if (opt && opt->ip6po_hlim != -1)
493 		ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
494 	else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
495 		if (im6o != NULL)
496 			ip6->ip6_hlim = im6o->im6o_multicast_hlim;
497 		else
498 			ip6->ip6_hlim = V_ip6_defmcasthlim;
499 	}
500 
501 #ifdef IPSEC
502 	/*
503 	 * We may re-inject packets into the stack here.
504 	 */
505 	if (needipsec && needipsectun) {
506 		struct ipsec_output_state state;
507 
508 		/*
509 		 * All the extension headers will become inaccessible
510 		 * (since they can be encrypted).
511 		 * Don't panic, we need no more updates to extension headers
512 		 * on inner IPv6 packet (since they are now encapsulated).
513 		 *
514 		 * IPv6 [ESP|AH] IPv6 [extension headers] payload
515 		 */
516 		bzero(&exthdrs, sizeof(exthdrs));
517 		exthdrs.ip6e_ip6 = m;
518 
519 		bzero(&state, sizeof(state));
520 		state.m = m;
521 		state.ro = (struct route *)ro;
522 		state.dst = (struct sockaddr *)dst;
523 
524 		error = ipsec6_output_tunnel(&state, sp, flags);
525 
526 		m = state.m;
527 		ro = (struct route_in6 *)state.ro;
528 		dst = (struct sockaddr_in6 *)state.dst;
529 		if (error == EJUSTRETURN) {
530 			/*
531 			 * We had a SP with a level of 'use' and no SA. We
532 			 * will just continue to process the packet without
533 			 * IPsec processing.
534 			 */
535 			;
536 		} else if (error) {
537 			/* mbuf is already reclaimed in ipsec6_output_tunnel. */
538 			m0 = m = NULL;
539 			m = NULL;
540 			switch (error) {
541 			case EHOSTUNREACH:
542 			case ENETUNREACH:
543 			case EMSGSIZE:
544 			case ENOBUFS:
545 			case ENOMEM:
546 				break;
547 			default:
548 				printf("[%s:%d] (ipsec): error code %d\n",
549 				    __func__, __LINE__, error);
550 				/* FALLTHROUGH */
551 			case ENOENT:
552 				/* don't show these error codes to the user */
553 				error = 0;
554 				break;
555 			}
556 			goto bad;
557 		} else {
558 			/*
559 			 * In the FAST IPSec case we have already
560 			 * re-injected the packet and it has been freed
561 			 * by the ipsec_done() function.  So, just clean
562 			 * up after ourselves.
563 			 */
564 			m = NULL;
565 			goto done;
566 		}
567 
568 		exthdrs.ip6e_ip6 = m;
569 	}
570 #endif /* IPSEC */
571 
572 	/* adjust pointer */
573 	ip6 = mtod(m, struct ip6_hdr *);
574 
575 	bzero(&dst_sa, sizeof(dst_sa));
576 	dst_sa.sin6_family = AF_INET6;
577 	dst_sa.sin6_len = sizeof(dst_sa);
578 	dst_sa.sin6_addr = ip6->ip6_dst;
579 	if ((error = in6_selectroute(&dst_sa, opt, im6o, ro,
580 	    &ifp, &rt)) != 0) {
581 		switch (error) {
582 		case EHOSTUNREACH:
583 			V_ip6stat.ip6s_noroute++;
584 			break;
585 		case EADDRNOTAVAIL:
586 		default:
587 			break; /* XXX statistics? */
588 		}
589 		if (ifp != NULL)
590 			in6_ifstat_inc(ifp, ifs6_out_discard);
591 		goto bad;
592 	}
593 	if (rt == NULL) {
594 		/*
595 		 * If in6_selectroute() does not return a route entry,
596 		 * dst may not have been updated.
597 		 */
598 		*dst = dst_sa;	/* XXX */
599 	}
600 
601 	/*
602 	 * then rt (for unicast) and ifp must be non-NULL valid values.
603 	 */
604 	if ((flags & IPV6_FORWARDING) == 0) {
605 		/* XXX: the FORWARDING flag can be set for mrouting. */
606 		in6_ifstat_inc(ifp, ifs6_out_request);
607 	}
608 	if (rt != NULL) {
609 		ia = (struct in6_ifaddr *)(rt->rt_ifa);
610 		rt->rt_use++;
611 	}
612 
613 
614 	/*
615 	 * The outgoing interface must be in the zone of source and
616 	 * destination addresses.
617 	 */
618 	origifp = ifp;
619 
620 	src0 = ip6->ip6_src;
621 	if (in6_setscope(&src0, origifp, &zone))
622 		goto badscope;
623 	bzero(&src_sa, sizeof(src_sa));
624 	src_sa.sin6_family = AF_INET6;
625 	src_sa.sin6_len = sizeof(src_sa);
626 	src_sa.sin6_addr = ip6->ip6_src;
627 	if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id)
628 		goto badscope;
629 
630 	dst0 = ip6->ip6_dst;
631 	if (in6_setscope(&dst0, origifp, &zone))
632 		goto badscope;
633 	/* re-initialize to be sure */
634 	bzero(&dst_sa, sizeof(dst_sa));
635 	dst_sa.sin6_family = AF_INET6;
636 	dst_sa.sin6_len = sizeof(dst_sa);
637 	dst_sa.sin6_addr = ip6->ip6_dst;
638 	if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) {
639 		goto badscope;
640 	}
641 
642 	/* We should use ia_ifp to support the case of
643 	 * sending packets to an address of our own.
644 	 */
645 	if (ia != NULL && ia->ia_ifp)
646 		ifp = ia->ia_ifp;
647 
648 	/* scope check is done. */
649 	goto routefound;
650 
651   badscope:
652 	V_ip6stat.ip6s_badscope++;
653 	in6_ifstat_inc(origifp, ifs6_out_discard);
654 	if (error == 0)
655 		error = EHOSTUNREACH; /* XXX */
656 	goto bad;
657 
658   routefound:
659 	if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
660 		if (opt && opt->ip6po_nextroute.ro_rt) {
661 			/*
662 			 * The nexthop is explicitly specified by the
663 			 * application.  We assume the next hop is an IPv6
664 			 * address.
665 			 */
666 			dst = (struct sockaddr_in6 *)opt->ip6po_nexthop;
667 		}
668 		else if ((rt->rt_flags & RTF_GATEWAY))
669 			dst = (struct sockaddr_in6 *)rt->rt_gateway;
670 	}
671 
672 	if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
673 		m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
674 	} else {
675 		m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
676 		in6_ifstat_inc(ifp, ifs6_out_mcast);
677 		/*
678 		 * Confirm that the outgoing interface supports multicast.
679 		 */
680 		if (!(ifp->if_flags & IFF_MULTICAST)) {
681 			V_ip6stat.ip6s_noroute++;
682 			in6_ifstat_inc(ifp, ifs6_out_discard);
683 			error = ENETUNREACH;
684 			goto bad;
685 		}
686 		if ((im6o == NULL && in6_mcast_loop) ||
687 		    (im6o && im6o->im6o_multicast_loop)) {
688 			/*
689 			 * Loop back multicast datagram if not expressly
690 			 * forbidden to do so, even if we have not joined
691 			 * the address; protocols will filter it later,
692 			 * thus deferring a hash lookup and lock acquisition
693 			 * at the expense of an m_copym().
694 			 */
695 			ip6_mloopback(ifp, m, dst);
696 		} else {
697 			/*
698 			 * If we are acting as a multicast router, perform
699 			 * multicast forwarding as if the packet had just
700 			 * arrived on the interface to which we are about
701 			 * to send.  The multicast forwarding function
702 			 * recursively calls this function, using the
703 			 * IPV6_FORWARDING flag to prevent infinite recursion.
704 			 *
705 			 * Multicasts that are looped back by ip6_mloopback(),
706 			 * above, will be forwarded by the ip6_input() routine,
707 			 * if necessary.
708 			 */
709 			if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
710 				/*
711 				 * XXX: ip6_mforward expects that rcvif is NULL
712 				 * when it is called from the originating path.
713 				 * However, it is not always the case, since
714 				 * some versions of MGETHDR() do not
715 				 * initialize the field.
716 				 */
717 				m->m_pkthdr.rcvif = NULL;
718 				if (ip6_mforward(ip6, ifp, m) != 0) {
719 					m_freem(m);
720 					goto done;
721 				}
722 			}
723 		}
724 		/*
725 		 * Multicasts with a hoplimit of zero may be looped back,
726 		 * above, but must not be transmitted on a network.
727 		 * Also, multicasts addressed to the loopback interface
728 		 * are not sent -- the above call to ip6_mloopback() will
729 		 * loop back a copy if this host actually belongs to the
730 		 * destination group on the loopback interface.
731 		 */
732 		if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
733 		    IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
734 			m_freem(m);
735 			goto done;
736 		}
737 	}
738 
739 	/*
740 	 * Fill the outgoing interface to tell the upper layer
741 	 * to increment per-interface statistics.
742 	 */
743 	if (ifpp)
744 		*ifpp = ifp;
745 
746 	/* Determine path MTU. */
747 	if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu,
748 	    &alwaysfrag)) != 0)
749 		goto bad;
750 
751 	/*
752 	 * The caller of this function may specify to use the minimum MTU
753 	 * in some cases.
754 	 * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
755 	 * setting.  The logic is a bit complicated; by default, unicast
756 	 * packets will follow path MTU while multicast packets will be sent at
757 	 * the minimum MTU.  If IP6PO_MINMTU_ALL is specified, all packets
758 	 * including unicast ones will be sent at the minimum MTU.  Multicast
759 	 * packets will always be sent at the minimum MTU unless
760 	 * IP6PO_MINMTU_DISABLE is explicitly specified.
761 	 * See RFC 3542 for more details.
762 	 */
763 	if (mtu > IPV6_MMTU) {
764 		if ((flags & IPV6_MINMTU))
765 			mtu = IPV6_MMTU;
766 		else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
767 			mtu = IPV6_MMTU;
768 		else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
769 			 (opt == NULL ||
770 			  opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
771 			mtu = IPV6_MMTU;
772 		}
773 	}
774 
775 	/*
776 	 * clear embedded scope identifiers if necessary.
777 	 * in6_clearscope will touch the addresses only when necessary.
778 	 */
779 	in6_clearscope(&ip6->ip6_src);
780 	in6_clearscope(&ip6->ip6_dst);
781 
782 	/*
783 	 * If the outgoing packet contains a hop-by-hop options header,
784 	 * it must be examined and processed even by the source node.
785 	 * (RFC 2460, section 4.)
786 	 */
787 	if (exthdrs.ip6e_hbh) {
788 		struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *);
789 		u_int32_t dummy; /* XXX unused */
790 		u_int32_t plen = 0; /* XXX: ip6_process will check the value */
791 
792 #ifdef DIAGNOSTIC
793 		if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len)
794 			panic("ip6e_hbh is not continuous");
795 #endif
796 		/*
797 		 *  XXX: if we have to send an ICMPv6 error to the sender,
798 		 *       we need the M_LOOP flag since icmp6_error() expects
799 		 *       the IPv6 and the hop-by-hop options header are
800 		 *       continuous unless the flag is set.
801 		 */
802 		m->m_flags |= M_LOOP;
803 		m->m_pkthdr.rcvif = ifp;
804 		if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1),
805 		    ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh),
806 		    &dummy, &plen) < 0) {
807 			/* m was already freed at this point */
808 			error = EINVAL;/* better error? */
809 			goto done;
810 		}
811 		m->m_flags &= ~M_LOOP; /* XXX */
812 		m->m_pkthdr.rcvif = NULL;
813 	}
814 
815 	/* Jump over all PFIL processing if hooks are not active. */
816 	if (!PFIL_HOOKED(&V_inet6_pfil_hook))
817 		goto passout;
818 
819 	odst = ip6->ip6_dst;
820 	/* Run through list of hooks for output packets. */
821 	error = pfil_run_hooks(&V_inet6_pfil_hook, &m, ifp, PFIL_OUT, inp);
822 	if (error != 0 || m == NULL)
823 		goto done;
824 	ip6 = mtod(m, struct ip6_hdr *);
825 
826 	/* See if destination IP address was changed by packet filter. */
827 	if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) {
828 		m->m_flags |= M_SKIP_FIREWALL;
829 		/* If destination is now ourself drop to ip6_input(). */
830 		if (in6_localaddr(&ip6->ip6_dst)) {
831 			if (m->m_pkthdr.rcvif == NULL)
832 				m->m_pkthdr.rcvif = V_loif;
833 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
834 				m->m_pkthdr.csum_flags |=
835 				    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
836 				m->m_pkthdr.csum_data = 0xffff;
837 			}
838 			m->m_pkthdr.csum_flags |=
839 			    CSUM_IP_CHECKED | CSUM_IP_VALID;
840 #ifdef SCTP
841 			if (m->m_pkthdr.csum_flags & CSUM_SCTP)
842 				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
843 #endif
844 			error = netisr_queue(NETISR_IPV6, m);
845 			goto done;
846 		} else
847 			goto again;	/* Redo the routing table lookup. */
848 	}
849 
850 	/* XXX: IPFIREWALL_FORWARD */
851 
852 passout:
853 	/*
854 	 * Send the packet to the outgoing interface.
855 	 * If necessary, do IPv6 fragmentation before sending.
856 	 *
857 	 * the logic here is rather complex:
858 	 * 1: normal case (dontfrag == 0, alwaysfrag == 0)
859 	 * 1-a:	send as is if tlen <= path mtu
860 	 * 1-b:	fragment if tlen > path mtu
861 	 *
862 	 * 2: if user asks us not to fragment (dontfrag == 1)
863 	 * 2-a:	send as is if tlen <= interface mtu
864 	 * 2-b:	error if tlen > interface mtu
865 	 *
866 	 * 3: if we always need to attach fragment header (alwaysfrag == 1)
867 	 *	always fragment
868 	 *
869 	 * 4: if dontfrag == 1 && alwaysfrag == 1
870 	 *	error, as we cannot handle this conflicting request
871 	 */
872 #ifdef SCTP
873 	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
874 	if (sw_csum & CSUM_SCTP) {
875 		sctp_delayed_cksum(m, sizeof(struct ip6_hdr));
876 		sw_csum &= ~CSUM_SCTP;
877 	}
878 #endif
879 	tlen = m->m_pkthdr.len;
880 
881 	if (opt && (opt->ip6po_flags & IP6PO_DONTFRAG))
882 		dontfrag = 1;
883 	else
884 		dontfrag = 0;
885 	if (dontfrag && alwaysfrag) {	/* case 4 */
886 		/* conflicting request - can't transmit */
887 		error = EMSGSIZE;
888 		goto bad;
889 	}
890 	if (dontfrag && tlen > IN6_LINKMTU(ifp)) {	/* case 2-b */
891 		/*
892 		 * Even if the DONTFRAG option is specified, we cannot send the
893 		 * packet when the data length is larger than the MTU of the
894 		 * outgoing interface.
895 		 * Notify the error by sending IPV6_PATHMTU ancillary data as
896 		 * well as returning an error code (the latter is not described
897 		 * in the API spec.)
898 		 */
899 		u_int32_t mtu32;
900 		struct ip6ctlparam ip6cp;
901 
902 		mtu32 = (u_int32_t)mtu;
903 		bzero(&ip6cp, sizeof(ip6cp));
904 		ip6cp.ip6c_cmdarg = (void *)&mtu32;
905 		pfctlinput2(PRC_MSGSIZE, (struct sockaddr *)&ro_pmtu->ro_dst,
906 		    (void *)&ip6cp);
907 
908 		error = EMSGSIZE;
909 		goto bad;
910 	}
911 
912 	/*
913 	 * transmit packet without fragmentation
914 	 */
915 	if (dontfrag || (!alwaysfrag && tlen <= mtu)) {	/* case 1-a and 2-a */
916 		struct in6_ifaddr *ia6;
917 
918 		ip6 = mtod(m, struct ip6_hdr *);
919 		ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
920 		if (ia6) {
921 			/* Record statistics for this interface address. */
922 			ia6->ia_ifa.if_opackets++;
923 			ia6->ia_ifa.if_obytes += m->m_pkthdr.len;
924 			ifa_free(&ia6->ia_ifa);
925 		}
926 		error = nd6_output(ifp, origifp, m, dst, ro->ro_rt);
927 		goto done;
928 	}
929 
930 	/*
931 	 * try to fragment the packet.  case 1-b and 3
932 	 */
933 	if (mtu < IPV6_MMTU) {
934 		/* path MTU cannot be less than IPV6_MMTU */
935 		error = EMSGSIZE;
936 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
937 		goto bad;
938 	} else if (ip6->ip6_plen == 0) {
939 		/* jumbo payload cannot be fragmented */
940 		error = EMSGSIZE;
941 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
942 		goto bad;
943 	} else {
944 		struct mbuf **mnext, *m_frgpart;
945 		struct ip6_frag *ip6f;
946 		u_int32_t id = htonl(ip6_randomid());
947 		u_char nextproto;
948 
949 		int qslots = ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len;
950 
951 		/*
952 		 * Too large for the destination or interface;
953 		 * fragment if possible.
954 		 * Must be able to put at least 8 bytes per fragment.
955 		 */
956 		hlen = unfragpartlen;
957 		if (mtu > IPV6_MAXPACKET)
958 			mtu = IPV6_MAXPACKET;
959 
960 		len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7;
961 		if (len < 8) {
962 			error = EMSGSIZE;
963 			in6_ifstat_inc(ifp, ifs6_out_fragfail);
964 			goto bad;
965 		}
966 
967 		/*
968 		 * Verify that we have any chance at all of being able to queue
969 		 *      the packet or packet fragments
970 		 */
971 		if (qslots <= 0 || ((u_int)qslots * (mtu - hlen)
972 		    < tlen  /* - hlen */)) {
973 			error = ENOBUFS;
974 			V_ip6stat.ip6s_odropped++;
975 			goto bad;
976 		}
977 
978 		mnext = &m->m_nextpkt;
979 
980 		/*
981 		 * Change the next header field of the last header in the
982 		 * unfragmentable part.
983 		 */
984 		if (exthdrs.ip6e_rthdr) {
985 			nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
986 			*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
987 		} else if (exthdrs.ip6e_dest1) {
988 			nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
989 			*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
990 		} else if (exthdrs.ip6e_hbh) {
991 			nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
992 			*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
993 		} else {
994 			nextproto = ip6->ip6_nxt;
995 			ip6->ip6_nxt = IPPROTO_FRAGMENT;
996 		}
997 
998 		/*
999 		 * Loop through length of segment after first fragment,
1000 		 * make new header and copy data of each part and link onto
1001 		 * chain.
1002 		 */
1003 		m0 = m;
1004 		for (off = hlen; off < tlen; off += len) {
1005 			MGETHDR(m, M_DONTWAIT, MT_HEADER);
1006 			if (!m) {
1007 				error = ENOBUFS;
1008 				V_ip6stat.ip6s_odropped++;
1009 				goto sendorfree;
1010 			}
1011 			m->m_pkthdr.rcvif = NULL;
1012 			m->m_flags = m0->m_flags & M_COPYFLAGS;
1013 			*mnext = m;
1014 			mnext = &m->m_nextpkt;
1015 			m->m_data += max_linkhdr;
1016 			mhip6 = mtod(m, struct ip6_hdr *);
1017 			*mhip6 = *ip6;
1018 			m->m_len = sizeof(*mhip6);
1019 			error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
1020 			if (error) {
1021 				V_ip6stat.ip6s_odropped++;
1022 				goto sendorfree;
1023 			}
1024 			ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7));
1025 			if (off + len >= tlen)
1026 				len = tlen - off;
1027 			else
1028 				ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
1029 			mhip6->ip6_plen = htons((u_short)(len + hlen +
1030 			    sizeof(*ip6f) - sizeof(struct ip6_hdr)));
1031 			if ((m_frgpart = m_copy(m0, off, len)) == 0) {
1032 				error = ENOBUFS;
1033 				V_ip6stat.ip6s_odropped++;
1034 				goto sendorfree;
1035 			}
1036 			m_cat(m, m_frgpart);
1037 			m->m_pkthdr.len = len + hlen + sizeof(*ip6f);
1038 			m->m_pkthdr.rcvif = NULL;
1039 			ip6f->ip6f_reserved = 0;
1040 			ip6f->ip6f_ident = id;
1041 			ip6f->ip6f_nxt = nextproto;
1042 			V_ip6stat.ip6s_ofragments++;
1043 			in6_ifstat_inc(ifp, ifs6_out_fragcreat);
1044 		}
1045 
1046 		in6_ifstat_inc(ifp, ifs6_out_fragok);
1047 	}
1048 
1049 	/*
1050 	 * Remove leading garbages.
1051 	 */
1052 sendorfree:
1053 	m = m0->m_nextpkt;
1054 	m0->m_nextpkt = 0;
1055 	m_freem(m0);
1056 	for (m0 = m; m; m = m0) {
1057 		m0 = m->m_nextpkt;
1058 		m->m_nextpkt = 0;
1059 		if (error == 0) {
1060 			/* Record statistics for this interface address. */
1061 			if (ia) {
1062 				ia->ia_ifa.if_opackets++;
1063 				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1064 			}
1065 			error = nd6_output(ifp, origifp, m, dst, ro->ro_rt);
1066 		} else
1067 			m_freem(m);
1068 	}
1069 
1070 	if (error == 0)
1071 		V_ip6stat.ip6s_fragmented++;
1072 
1073 done:
1074 	if (ro == &ip6route && ro->ro_rt) { /* brace necessary for RTFREE */
1075 		RTFREE(ro->ro_rt);
1076 	} else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) {
1077 		RTFREE(ro_pmtu->ro_rt);
1078 	}
1079 #ifdef IPSEC
1080 	if (sp != NULL)
1081 		KEY_FREESP(&sp);
1082 #endif
1083 
1084 	return (error);
1085 
1086 freehdrs:
1087 	m_freem(exthdrs.ip6e_hbh);	/* m_freem will check if mbuf is 0 */
1088 	m_freem(exthdrs.ip6e_dest1);
1089 	m_freem(exthdrs.ip6e_rthdr);
1090 	m_freem(exthdrs.ip6e_dest2);
1091 	/* FALLTHROUGH */
1092 bad:
1093 	if (m)
1094 		m_freem(m);
1095 	goto done;
1096 }
1097 
1098 static int
1099 ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen)
1100 {
1101 	struct mbuf *m;
1102 
1103 	if (hlen > MCLBYTES)
1104 		return (ENOBUFS); /* XXX */
1105 
1106 	MGET(m, M_DONTWAIT, MT_DATA);
1107 	if (!m)
1108 		return (ENOBUFS);
1109 
1110 	if (hlen > MLEN) {
1111 		MCLGET(m, M_DONTWAIT);
1112 		if ((m->m_flags & M_EXT) == 0) {
1113 			m_free(m);
1114 			return (ENOBUFS);
1115 		}
1116 	}
1117 	m->m_len = hlen;
1118 	if (hdr)
1119 		bcopy(hdr, mtod(m, caddr_t), hlen);
1120 
1121 	*mp = m;
1122 	return (0);
1123 }
1124 
1125 /*
1126  * Insert jumbo payload option.
1127  */
1128 static int
1129 ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
1130 {
1131 	struct mbuf *mopt;
1132 	u_char *optbuf;
1133 	u_int32_t v;
1134 
1135 #define JUMBOOPTLEN	8	/* length of jumbo payload option and padding */
1136 
1137 	/*
1138 	 * If there is no hop-by-hop options header, allocate new one.
1139 	 * If there is one but it doesn't have enough space to store the
1140 	 * jumbo payload option, allocate a cluster to store the whole options.
1141 	 * Otherwise, use it to store the options.
1142 	 */
1143 	if (exthdrs->ip6e_hbh == 0) {
1144 		MGET(mopt, M_DONTWAIT, MT_DATA);
1145 		if (mopt == 0)
1146 			return (ENOBUFS);
1147 		mopt->m_len = JUMBOOPTLEN;
1148 		optbuf = mtod(mopt, u_char *);
1149 		optbuf[1] = 0;	/* = ((JUMBOOPTLEN) >> 3) - 1 */
1150 		exthdrs->ip6e_hbh = mopt;
1151 	} else {
1152 		struct ip6_hbh *hbh;
1153 
1154 		mopt = exthdrs->ip6e_hbh;
1155 		if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) {
1156 			/*
1157 			 * XXX assumption:
1158 			 * - exthdrs->ip6e_hbh is not referenced from places
1159 			 *   other than exthdrs.
1160 			 * - exthdrs->ip6e_hbh is not an mbuf chain.
1161 			 */
1162 			int oldoptlen = mopt->m_len;
1163 			struct mbuf *n;
1164 
1165 			/*
1166 			 * XXX: give up if the whole (new) hbh header does
1167 			 * not fit even in an mbuf cluster.
1168 			 */
1169 			if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
1170 				return (ENOBUFS);
1171 
1172 			/*
1173 			 * As a consequence, we must always prepare a cluster
1174 			 * at this point.
1175 			 */
1176 			MGET(n, M_DONTWAIT, MT_DATA);
1177 			if (n) {
1178 				MCLGET(n, M_DONTWAIT);
1179 				if ((n->m_flags & M_EXT) == 0) {
1180 					m_freem(n);
1181 					n = NULL;
1182 				}
1183 			}
1184 			if (!n)
1185 				return (ENOBUFS);
1186 			n->m_len = oldoptlen + JUMBOOPTLEN;
1187 			bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t),
1188 			    oldoptlen);
1189 			optbuf = mtod(n, caddr_t) + oldoptlen;
1190 			m_freem(mopt);
1191 			mopt = exthdrs->ip6e_hbh = n;
1192 		} else {
1193 			optbuf = mtod(mopt, u_char *) + mopt->m_len;
1194 			mopt->m_len += JUMBOOPTLEN;
1195 		}
1196 		optbuf[0] = IP6OPT_PADN;
1197 		optbuf[1] = 1;
1198 
1199 		/*
1200 		 * Adjust the header length according to the pad and
1201 		 * the jumbo payload option.
1202 		 */
1203 		hbh = mtod(mopt, struct ip6_hbh *);
1204 		hbh->ip6h_len += (JUMBOOPTLEN >> 3);
1205 	}
1206 
1207 	/* fill in the option. */
1208 	optbuf[2] = IP6OPT_JUMBO;
1209 	optbuf[3] = 4;
1210 	v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
1211 	bcopy(&v, &optbuf[4], sizeof(u_int32_t));
1212 
1213 	/* finally, adjust the packet header length */
1214 	exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
1215 
1216 	return (0);
1217 #undef JUMBOOPTLEN
1218 }
1219 
1220 /*
1221  * Insert fragment header and copy unfragmentable header portions.
1222  */
1223 static int
1224 ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
1225     struct ip6_frag **frghdrp)
1226 {
1227 	struct mbuf *n, *mlast;
1228 
1229 	if (hlen > sizeof(struct ip6_hdr)) {
1230 		n = m_copym(m0, sizeof(struct ip6_hdr),
1231 		    hlen - sizeof(struct ip6_hdr), M_DONTWAIT);
1232 		if (n == 0)
1233 			return (ENOBUFS);
1234 		m->m_next = n;
1235 	} else
1236 		n = m;
1237 
1238 	/* Search for the last mbuf of unfragmentable part. */
1239 	for (mlast = n; mlast->m_next; mlast = mlast->m_next)
1240 		;
1241 
1242 	if ((mlast->m_flags & M_EXT) == 0 &&
1243 	    M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) {
1244 		/* use the trailing space of the last mbuf for the fragment hdr */
1245 		*frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) +
1246 		    mlast->m_len);
1247 		mlast->m_len += sizeof(struct ip6_frag);
1248 		m->m_pkthdr.len += sizeof(struct ip6_frag);
1249 	} else {
1250 		/* allocate a new mbuf for the fragment header */
1251 		struct mbuf *mfrg;
1252 
1253 		MGET(mfrg, M_DONTWAIT, MT_DATA);
1254 		if (mfrg == 0)
1255 			return (ENOBUFS);
1256 		mfrg->m_len = sizeof(struct ip6_frag);
1257 		*frghdrp = mtod(mfrg, struct ip6_frag *);
1258 		mlast->m_next = mfrg;
1259 	}
1260 
1261 	return (0);
1262 }
1263 
1264 static int
1265 ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro,
1266     struct ifnet *ifp, struct in6_addr *dst, u_long *mtup,
1267     int *alwaysfragp)
1268 {
1269 	u_int32_t mtu = 0;
1270 	int alwaysfrag = 0;
1271 	int error = 0;
1272 
1273 	if (ro_pmtu != ro) {
1274 		/* The first hop and the final destination may differ. */
1275 		struct sockaddr_in6 *sa6_dst =
1276 		    (struct sockaddr_in6 *)&ro_pmtu->ro_dst;
1277 		if (ro_pmtu->ro_rt &&
1278 		    ((ro_pmtu->ro_rt->rt_flags & RTF_UP) == 0 ||
1279 		     !IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))) {
1280 			RTFREE(ro_pmtu->ro_rt);
1281 			ro_pmtu->ro_rt = (struct rtentry *)NULL;
1282 		}
1283 		if (ro_pmtu->ro_rt == NULL) {
1284 			bzero(sa6_dst, sizeof(*sa6_dst));
1285 			sa6_dst->sin6_family = AF_INET6;
1286 			sa6_dst->sin6_len = sizeof(struct sockaddr_in6);
1287 			sa6_dst->sin6_addr = *dst;
1288 
1289 			rtalloc((struct route *)ro_pmtu);
1290 		}
1291 	}
1292 	if (ro_pmtu->ro_rt) {
1293 		u_int32_t ifmtu;
1294 		struct in_conninfo inc;
1295 
1296 		bzero(&inc, sizeof(inc));
1297 		inc.inc_flags |= INC_ISIPV6;
1298 		inc.inc6_faddr = *dst;
1299 
1300 		if (ifp == NULL)
1301 			ifp = ro_pmtu->ro_rt->rt_ifp;
1302 		ifmtu = IN6_LINKMTU(ifp);
1303 		mtu = tcp_hc_getmtu(&inc);
1304 		if (mtu)
1305 			mtu = min(mtu, ro_pmtu->ro_rt->rt_rmx.rmx_mtu);
1306 		else
1307 			mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu;
1308 		if (mtu == 0)
1309 			mtu = ifmtu;
1310 		else if (mtu < IPV6_MMTU) {
1311 			/*
1312 			 * RFC2460 section 5, last paragraph:
1313 			 * if we record ICMPv6 too big message with
1314 			 * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU
1315 			 * or smaller, with framgent header attached.
1316 			 * (fragment header is needed regardless from the
1317 			 * packet size, for translators to identify packets)
1318 			 */
1319 			alwaysfrag = 1;
1320 			mtu = IPV6_MMTU;
1321 		} else if (mtu > ifmtu) {
1322 			/*
1323 			 * The MTU on the route is larger than the MTU on
1324 			 * the interface!  This shouldn't happen, unless the
1325 			 * MTU of the interface has been changed after the
1326 			 * interface was brought up.  Change the MTU in the
1327 			 * route to match the interface MTU (as long as the
1328 			 * field isn't locked).
1329 			 */
1330 			mtu = ifmtu;
1331 			ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu;
1332 		}
1333 	} else if (ifp) {
1334 		mtu = IN6_LINKMTU(ifp);
1335 	} else
1336 		error = EHOSTUNREACH; /* XXX */
1337 
1338 	*mtup = mtu;
1339 	if (alwaysfragp)
1340 		*alwaysfragp = alwaysfrag;
1341 	return (error);
1342 }
1343 
1344 /*
1345  * IP6 socket option processing.
1346  */
1347 int
1348 ip6_ctloutput(struct socket *so, struct sockopt *sopt)
1349 {
1350 	int optdatalen, uproto;
1351 	void *optdata;
1352 	struct inpcb *in6p = sotoinpcb(so);
1353 	int error, optval;
1354 	int level, op, optname;
1355 	int optlen;
1356 	struct thread *td;
1357 
1358 	level = sopt->sopt_level;
1359 	op = sopt->sopt_dir;
1360 	optname = sopt->sopt_name;
1361 	optlen = sopt->sopt_valsize;
1362 	td = sopt->sopt_td;
1363 	error = 0;
1364 	optval = 0;
1365 	uproto = (int)so->so_proto->pr_protocol;
1366 
1367 	if (level == IPPROTO_IPV6) {
1368 		switch (op) {
1369 
1370 		case SOPT_SET:
1371 			switch (optname) {
1372 			case IPV6_2292PKTOPTIONS:
1373 #ifdef IPV6_PKTOPTIONS
1374 			case IPV6_PKTOPTIONS:
1375 #endif
1376 			{
1377 				struct mbuf *m;
1378 
1379 				error = soopt_getm(sopt, &m); /* XXX */
1380 				if (error != 0)
1381 					break;
1382 				error = soopt_mcopyin(sopt, m); /* XXX */
1383 				if (error != 0)
1384 					break;
1385 				error = ip6_pcbopts(&in6p->in6p_outputopts,
1386 						    m, so, sopt);
1387 				m_freem(m); /* XXX */
1388 				break;
1389 			}
1390 
1391 			/*
1392 			 * Use of some Hop-by-Hop options or some
1393 			 * Destination options, might require special
1394 			 * privilege.  That is, normal applications
1395 			 * (without special privilege) might be forbidden
1396 			 * from setting certain options in outgoing packets,
1397 			 * and might never see certain options in received
1398 			 * packets. [RFC 2292 Section 6]
1399 			 * KAME specific note:
1400 			 *  KAME prevents non-privileged users from sending or
1401 			 *  receiving ANY hbh/dst options in order to avoid
1402 			 *  overhead of parsing options in the kernel.
1403 			 */
1404 			case IPV6_RECVHOPOPTS:
1405 			case IPV6_RECVDSTOPTS:
1406 			case IPV6_RECVRTHDRDSTOPTS:
1407 				if (td != NULL) {
1408 					error = priv_check(td,
1409 					    PRIV_NETINET_SETHDROPTS);
1410 					if (error)
1411 						break;
1412 				}
1413 				/* FALLTHROUGH */
1414 			case IPV6_UNICAST_HOPS:
1415 			case IPV6_HOPLIMIT:
1416 			case IPV6_FAITH:
1417 
1418 			case IPV6_RECVPKTINFO:
1419 			case IPV6_RECVHOPLIMIT:
1420 			case IPV6_RECVRTHDR:
1421 			case IPV6_RECVPATHMTU:
1422 			case IPV6_RECVTCLASS:
1423 			case IPV6_V6ONLY:
1424 			case IPV6_AUTOFLOWLABEL:
1425 			case IPV6_BINDANY:
1426 				if (optname == IPV6_BINDANY && td != NULL) {
1427 					error = priv_check(td,
1428 					    PRIV_NETINET_BINDANY);
1429 					if (error)
1430 						break;
1431 				}
1432 
1433 				if (optlen != sizeof(int)) {
1434 					error = EINVAL;
1435 					break;
1436 				}
1437 				error = sooptcopyin(sopt, &optval,
1438 					sizeof optval, sizeof optval);
1439 				if (error)
1440 					break;
1441 				switch (optname) {
1442 
1443 				case IPV6_UNICAST_HOPS:
1444 					if (optval < -1 || optval >= 256)
1445 						error = EINVAL;
1446 					else {
1447 						/* -1 = kernel default */
1448 						in6p->in6p_hops = optval;
1449 						if ((in6p->inp_vflag &
1450 						     INP_IPV4) != 0)
1451 							in6p->inp_ip_ttl = optval;
1452 					}
1453 					break;
1454 #define OPTSET(bit) \
1455 do { \
1456 	if (optval) \
1457 		in6p->inp_flags |= (bit); \
1458 	else \
1459 		in6p->inp_flags &= ~(bit); \
1460 } while (/*CONSTCOND*/ 0)
1461 #define OPTSET2292(bit) \
1462 do { \
1463 	in6p->inp_flags |= IN6P_RFC2292; \
1464 	if (optval) \
1465 		in6p->inp_flags |= (bit); \
1466 	else \
1467 		in6p->inp_flags &= ~(bit); \
1468 } while (/*CONSTCOND*/ 0)
1469 #define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0)
1470 
1471 				case IPV6_RECVPKTINFO:
1472 					/* cannot mix with RFC2292 */
1473 					if (OPTBIT(IN6P_RFC2292)) {
1474 						error = EINVAL;
1475 						break;
1476 					}
1477 					OPTSET(IN6P_PKTINFO);
1478 					break;
1479 
1480 				case IPV6_HOPLIMIT:
1481 				{
1482 					struct ip6_pktopts **optp;
1483 
1484 					/* cannot mix with RFC2292 */
1485 					if (OPTBIT(IN6P_RFC2292)) {
1486 						error = EINVAL;
1487 						break;
1488 					}
1489 					optp = &in6p->in6p_outputopts;
1490 					error = ip6_pcbopt(IPV6_HOPLIMIT,
1491 					    (u_char *)&optval, sizeof(optval),
1492 					    optp, (td != NULL) ? td->td_ucred :
1493 					    NULL, uproto);
1494 					break;
1495 				}
1496 
1497 				case IPV6_RECVHOPLIMIT:
1498 					/* cannot mix with RFC2292 */
1499 					if (OPTBIT(IN6P_RFC2292)) {
1500 						error = EINVAL;
1501 						break;
1502 					}
1503 					OPTSET(IN6P_HOPLIMIT);
1504 					break;
1505 
1506 				case IPV6_RECVHOPOPTS:
1507 					/* cannot mix with RFC2292 */
1508 					if (OPTBIT(IN6P_RFC2292)) {
1509 						error = EINVAL;
1510 						break;
1511 					}
1512 					OPTSET(IN6P_HOPOPTS);
1513 					break;
1514 
1515 				case IPV6_RECVDSTOPTS:
1516 					/* cannot mix with RFC2292 */
1517 					if (OPTBIT(IN6P_RFC2292)) {
1518 						error = EINVAL;
1519 						break;
1520 					}
1521 					OPTSET(IN6P_DSTOPTS);
1522 					break;
1523 
1524 				case IPV6_RECVRTHDRDSTOPTS:
1525 					/* cannot mix with RFC2292 */
1526 					if (OPTBIT(IN6P_RFC2292)) {
1527 						error = EINVAL;
1528 						break;
1529 					}
1530 					OPTSET(IN6P_RTHDRDSTOPTS);
1531 					break;
1532 
1533 				case IPV6_RECVRTHDR:
1534 					/* cannot mix with RFC2292 */
1535 					if (OPTBIT(IN6P_RFC2292)) {
1536 						error = EINVAL;
1537 						break;
1538 					}
1539 					OPTSET(IN6P_RTHDR);
1540 					break;
1541 
1542 				case IPV6_FAITH:
1543 					OPTSET(INP_FAITH);
1544 					break;
1545 
1546 				case IPV6_RECVPATHMTU:
1547 					/*
1548 					 * We ignore this option for TCP
1549 					 * sockets.
1550 					 * (RFC3542 leaves this case
1551 					 * unspecified.)
1552 					 */
1553 					if (uproto != IPPROTO_TCP)
1554 						OPTSET(IN6P_MTU);
1555 					break;
1556 
1557 				case IPV6_V6ONLY:
1558 					/*
1559 					 * make setsockopt(IPV6_V6ONLY)
1560 					 * available only prior to bind(2).
1561 					 * see ipng mailing list, Jun 22 2001.
1562 					 */
1563 					if (in6p->inp_lport ||
1564 					    !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) {
1565 						error = EINVAL;
1566 						break;
1567 					}
1568 					OPTSET(IN6P_IPV6_V6ONLY);
1569 					if (optval)
1570 						in6p->inp_vflag &= ~INP_IPV4;
1571 					else
1572 						in6p->inp_vflag |= INP_IPV4;
1573 					break;
1574 				case IPV6_RECVTCLASS:
1575 					/* cannot mix with RFC2292 XXX */
1576 					if (OPTBIT(IN6P_RFC2292)) {
1577 						error = EINVAL;
1578 						break;
1579 					}
1580 					OPTSET(IN6P_TCLASS);
1581 					break;
1582 				case IPV6_AUTOFLOWLABEL:
1583 					OPTSET(IN6P_AUTOFLOWLABEL);
1584 					break;
1585 
1586 				case IPV6_BINDANY:
1587 					OPTSET(INP_BINDANY);
1588 					break;
1589 				}
1590 				break;
1591 
1592 			case IPV6_TCLASS:
1593 			case IPV6_DONTFRAG:
1594 			case IPV6_USE_MIN_MTU:
1595 			case IPV6_PREFER_TEMPADDR:
1596 				if (optlen != sizeof(optval)) {
1597 					error = EINVAL;
1598 					break;
1599 				}
1600 				error = sooptcopyin(sopt, &optval,
1601 					sizeof optval, sizeof optval);
1602 				if (error)
1603 					break;
1604 				{
1605 					struct ip6_pktopts **optp;
1606 					optp = &in6p->in6p_outputopts;
1607 					error = ip6_pcbopt(optname,
1608 					    (u_char *)&optval, sizeof(optval),
1609 					    optp, (td != NULL) ? td->td_ucred :
1610 					    NULL, uproto);
1611 					break;
1612 				}
1613 
1614 			case IPV6_2292PKTINFO:
1615 			case IPV6_2292HOPLIMIT:
1616 			case IPV6_2292HOPOPTS:
1617 			case IPV6_2292DSTOPTS:
1618 			case IPV6_2292RTHDR:
1619 				/* RFC 2292 */
1620 				if (optlen != sizeof(int)) {
1621 					error = EINVAL;
1622 					break;
1623 				}
1624 				error = sooptcopyin(sopt, &optval,
1625 					sizeof optval, sizeof optval);
1626 				if (error)
1627 					break;
1628 				switch (optname) {
1629 				case IPV6_2292PKTINFO:
1630 					OPTSET2292(IN6P_PKTINFO);
1631 					break;
1632 				case IPV6_2292HOPLIMIT:
1633 					OPTSET2292(IN6P_HOPLIMIT);
1634 					break;
1635 				case IPV6_2292HOPOPTS:
1636 					/*
1637 					 * Check super-user privilege.
1638 					 * See comments for IPV6_RECVHOPOPTS.
1639 					 */
1640 					if (td != NULL) {
1641 						error = priv_check(td,
1642 						    PRIV_NETINET_SETHDROPTS);
1643 						if (error)
1644 							return (error);
1645 					}
1646 					OPTSET2292(IN6P_HOPOPTS);
1647 					break;
1648 				case IPV6_2292DSTOPTS:
1649 					if (td != NULL) {
1650 						error = priv_check(td,
1651 						    PRIV_NETINET_SETHDROPTS);
1652 						if (error)
1653 							return (error);
1654 					}
1655 					OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
1656 					break;
1657 				case IPV6_2292RTHDR:
1658 					OPTSET2292(IN6P_RTHDR);
1659 					break;
1660 				}
1661 				break;
1662 			case IPV6_PKTINFO:
1663 			case IPV6_HOPOPTS:
1664 			case IPV6_RTHDR:
1665 			case IPV6_DSTOPTS:
1666 			case IPV6_RTHDRDSTOPTS:
1667 			case IPV6_NEXTHOP:
1668 			{
1669 				/* new advanced API (RFC3542) */
1670 				u_char *optbuf;
1671 				u_char optbuf_storage[MCLBYTES];
1672 				int optlen;
1673 				struct ip6_pktopts **optp;
1674 
1675 				/* cannot mix with RFC2292 */
1676 				if (OPTBIT(IN6P_RFC2292)) {
1677 					error = EINVAL;
1678 					break;
1679 				}
1680 
1681 				/*
1682 				 * We only ensure valsize is not too large
1683 				 * here.  Further validation will be done
1684 				 * later.
1685 				 */
1686 				error = sooptcopyin(sopt, optbuf_storage,
1687 				    sizeof(optbuf_storage), 0);
1688 				if (error)
1689 					break;
1690 				optlen = sopt->sopt_valsize;
1691 				optbuf = optbuf_storage;
1692 				optp = &in6p->in6p_outputopts;
1693 				error = ip6_pcbopt(optname, optbuf, optlen,
1694 				    optp, (td != NULL) ? td->td_ucred : NULL,
1695 				    uproto);
1696 				break;
1697 			}
1698 #undef OPTSET
1699 
1700 			case IPV6_MULTICAST_IF:
1701 			case IPV6_MULTICAST_HOPS:
1702 			case IPV6_MULTICAST_LOOP:
1703 			case IPV6_JOIN_GROUP:
1704 			case IPV6_LEAVE_GROUP:
1705 			case IPV6_MSFILTER:
1706 			case MCAST_BLOCK_SOURCE:
1707 			case MCAST_UNBLOCK_SOURCE:
1708 			case MCAST_JOIN_GROUP:
1709 			case MCAST_LEAVE_GROUP:
1710 			case MCAST_JOIN_SOURCE_GROUP:
1711 			case MCAST_LEAVE_SOURCE_GROUP:
1712 				error = ip6_setmoptions(in6p, sopt);
1713 				break;
1714 
1715 			case IPV6_PORTRANGE:
1716 				error = sooptcopyin(sopt, &optval,
1717 				    sizeof optval, sizeof optval);
1718 				if (error)
1719 					break;
1720 
1721 				switch (optval) {
1722 				case IPV6_PORTRANGE_DEFAULT:
1723 					in6p->inp_flags &= ~(INP_LOWPORT);
1724 					in6p->inp_flags &= ~(INP_HIGHPORT);
1725 					break;
1726 
1727 				case IPV6_PORTRANGE_HIGH:
1728 					in6p->inp_flags &= ~(INP_LOWPORT);
1729 					in6p->inp_flags |= INP_HIGHPORT;
1730 					break;
1731 
1732 				case IPV6_PORTRANGE_LOW:
1733 					in6p->inp_flags &= ~(INP_HIGHPORT);
1734 					in6p->inp_flags |= INP_LOWPORT;
1735 					break;
1736 
1737 				default:
1738 					error = EINVAL;
1739 					break;
1740 				}
1741 				break;
1742 
1743 #ifdef IPSEC
1744 			case IPV6_IPSEC_POLICY:
1745 			{
1746 				caddr_t req;
1747 				struct mbuf *m;
1748 
1749 				if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
1750 					break;
1751 				if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
1752 					break;
1753 				req = mtod(m, caddr_t);
1754 				error = ipsec_set_policy(in6p, optname, req,
1755 				    m->m_len, (sopt->sopt_td != NULL) ?
1756 				    sopt->sopt_td->td_ucred : NULL);
1757 				m_freem(m);
1758 				break;
1759 			}
1760 #endif /* IPSEC */
1761 
1762 			default:
1763 				error = ENOPROTOOPT;
1764 				break;
1765 			}
1766 			break;
1767 
1768 		case SOPT_GET:
1769 			switch (optname) {
1770 
1771 			case IPV6_2292PKTOPTIONS:
1772 #ifdef IPV6_PKTOPTIONS
1773 			case IPV6_PKTOPTIONS:
1774 #endif
1775 				/*
1776 				 * RFC3542 (effectively) deprecated the
1777 				 * semantics of the 2292-style pktoptions.
1778 				 * Since it was not reliable in nature (i.e.,
1779 				 * applications had to expect the lack of some
1780 				 * information after all), it would make sense
1781 				 * to simplify this part by always returning
1782 				 * empty data.
1783 				 */
1784 				sopt->sopt_valsize = 0;
1785 				break;
1786 
1787 			case IPV6_RECVHOPOPTS:
1788 			case IPV6_RECVDSTOPTS:
1789 			case IPV6_RECVRTHDRDSTOPTS:
1790 			case IPV6_UNICAST_HOPS:
1791 			case IPV6_RECVPKTINFO:
1792 			case IPV6_RECVHOPLIMIT:
1793 			case IPV6_RECVRTHDR:
1794 			case IPV6_RECVPATHMTU:
1795 
1796 			case IPV6_FAITH:
1797 			case IPV6_V6ONLY:
1798 			case IPV6_PORTRANGE:
1799 			case IPV6_RECVTCLASS:
1800 			case IPV6_AUTOFLOWLABEL:
1801 				switch (optname) {
1802 
1803 				case IPV6_RECVHOPOPTS:
1804 					optval = OPTBIT(IN6P_HOPOPTS);
1805 					break;
1806 
1807 				case IPV6_RECVDSTOPTS:
1808 					optval = OPTBIT(IN6P_DSTOPTS);
1809 					break;
1810 
1811 				case IPV6_RECVRTHDRDSTOPTS:
1812 					optval = OPTBIT(IN6P_RTHDRDSTOPTS);
1813 					break;
1814 
1815 				case IPV6_UNICAST_HOPS:
1816 					optval = in6p->in6p_hops;
1817 					break;
1818 
1819 				case IPV6_RECVPKTINFO:
1820 					optval = OPTBIT(IN6P_PKTINFO);
1821 					break;
1822 
1823 				case IPV6_RECVHOPLIMIT:
1824 					optval = OPTBIT(IN6P_HOPLIMIT);
1825 					break;
1826 
1827 				case IPV6_RECVRTHDR:
1828 					optval = OPTBIT(IN6P_RTHDR);
1829 					break;
1830 
1831 				case IPV6_RECVPATHMTU:
1832 					optval = OPTBIT(IN6P_MTU);
1833 					break;
1834 
1835 				case IPV6_FAITH:
1836 					optval = OPTBIT(INP_FAITH);
1837 					break;
1838 
1839 				case IPV6_V6ONLY:
1840 					optval = OPTBIT(IN6P_IPV6_V6ONLY);
1841 					break;
1842 
1843 				case IPV6_PORTRANGE:
1844 				    {
1845 					int flags;
1846 					flags = in6p->inp_flags;
1847 					if (flags & INP_HIGHPORT)
1848 						optval = IPV6_PORTRANGE_HIGH;
1849 					else if (flags & INP_LOWPORT)
1850 						optval = IPV6_PORTRANGE_LOW;
1851 					else
1852 						optval = 0;
1853 					break;
1854 				    }
1855 				case IPV6_RECVTCLASS:
1856 					optval = OPTBIT(IN6P_TCLASS);
1857 					break;
1858 
1859 				case IPV6_AUTOFLOWLABEL:
1860 					optval = OPTBIT(IN6P_AUTOFLOWLABEL);
1861 					break;
1862 
1863 				case IPV6_BINDANY:
1864 					optval = OPTBIT(INP_BINDANY);
1865 					break;
1866 				}
1867 				if (error)
1868 					break;
1869 				error = sooptcopyout(sopt, &optval,
1870 					sizeof optval);
1871 				break;
1872 
1873 			case IPV6_PATHMTU:
1874 			{
1875 				u_long pmtu = 0;
1876 				struct ip6_mtuinfo mtuinfo;
1877 				struct route_in6 sro;
1878 
1879 				bzero(&sro, sizeof(sro));
1880 
1881 				if (!(so->so_state & SS_ISCONNECTED))
1882 					return (ENOTCONN);
1883 				/*
1884 				 * XXX: we dot not consider the case of source
1885 				 * routing, or optional information to specify
1886 				 * the outgoing interface.
1887 				 */
1888 				error = ip6_getpmtu(&sro, NULL, NULL,
1889 				    &in6p->in6p_faddr, &pmtu, NULL);
1890 				if (sro.ro_rt)
1891 					RTFREE(sro.ro_rt);
1892 				if (error)
1893 					break;
1894 				if (pmtu > IPV6_MAXPACKET)
1895 					pmtu = IPV6_MAXPACKET;
1896 
1897 				bzero(&mtuinfo, sizeof(mtuinfo));
1898 				mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
1899 				optdata = (void *)&mtuinfo;
1900 				optdatalen = sizeof(mtuinfo);
1901 				error = sooptcopyout(sopt, optdata,
1902 				    optdatalen);
1903 				break;
1904 			}
1905 
1906 			case IPV6_2292PKTINFO:
1907 			case IPV6_2292HOPLIMIT:
1908 			case IPV6_2292HOPOPTS:
1909 			case IPV6_2292RTHDR:
1910 			case IPV6_2292DSTOPTS:
1911 				switch (optname) {
1912 				case IPV6_2292PKTINFO:
1913 					optval = OPTBIT(IN6P_PKTINFO);
1914 					break;
1915 				case IPV6_2292HOPLIMIT:
1916 					optval = OPTBIT(IN6P_HOPLIMIT);
1917 					break;
1918 				case IPV6_2292HOPOPTS:
1919 					optval = OPTBIT(IN6P_HOPOPTS);
1920 					break;
1921 				case IPV6_2292RTHDR:
1922 					optval = OPTBIT(IN6P_RTHDR);
1923 					break;
1924 				case IPV6_2292DSTOPTS:
1925 					optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
1926 					break;
1927 				}
1928 				error = sooptcopyout(sopt, &optval,
1929 				    sizeof optval);
1930 				break;
1931 			case IPV6_PKTINFO:
1932 			case IPV6_HOPOPTS:
1933 			case IPV6_RTHDR:
1934 			case IPV6_DSTOPTS:
1935 			case IPV6_RTHDRDSTOPTS:
1936 			case IPV6_NEXTHOP:
1937 			case IPV6_TCLASS:
1938 			case IPV6_DONTFRAG:
1939 			case IPV6_USE_MIN_MTU:
1940 			case IPV6_PREFER_TEMPADDR:
1941 				error = ip6_getpcbopt(in6p->in6p_outputopts,
1942 				    optname, sopt);
1943 				break;
1944 
1945 			case IPV6_MULTICAST_IF:
1946 			case IPV6_MULTICAST_HOPS:
1947 			case IPV6_MULTICAST_LOOP:
1948 			case IPV6_MSFILTER:
1949 				error = ip6_getmoptions(in6p, sopt);
1950 				break;
1951 
1952 #ifdef IPSEC
1953 			case IPV6_IPSEC_POLICY:
1954 			  {
1955 				caddr_t req = NULL;
1956 				size_t len = 0;
1957 				struct mbuf *m = NULL;
1958 				struct mbuf **mp = &m;
1959 				size_t ovalsize = sopt->sopt_valsize;
1960 				caddr_t oval = (caddr_t)sopt->sopt_val;
1961 
1962 				error = soopt_getm(sopt, &m); /* XXX */
1963 				if (error != 0)
1964 					break;
1965 				error = soopt_mcopyin(sopt, m); /* XXX */
1966 				if (error != 0)
1967 					break;
1968 				sopt->sopt_valsize = ovalsize;
1969 				sopt->sopt_val = oval;
1970 				if (m) {
1971 					req = mtod(m, caddr_t);
1972 					len = m->m_len;
1973 				}
1974 				error = ipsec_get_policy(in6p, req, len, mp);
1975 				if (error == 0)
1976 					error = soopt_mcopyout(sopt, m); /* XXX */
1977 				if (error == 0 && m)
1978 					m_freem(m);
1979 				break;
1980 			  }
1981 #endif /* IPSEC */
1982 
1983 			default:
1984 				error = ENOPROTOOPT;
1985 				break;
1986 			}
1987 			break;
1988 		}
1989 	} else {		/* level != IPPROTO_IPV6 */
1990 		error = EINVAL;
1991 	}
1992 	return (error);
1993 }
1994 
1995 int
1996 ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt)
1997 {
1998 	int error = 0, optval, optlen;
1999 	const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
2000 	struct inpcb *in6p = sotoinpcb(so);
2001 	int level, op, optname;
2002 
2003 	level = sopt->sopt_level;
2004 	op = sopt->sopt_dir;
2005 	optname = sopt->sopt_name;
2006 	optlen = sopt->sopt_valsize;
2007 
2008 	if (level != IPPROTO_IPV6) {
2009 		return (EINVAL);
2010 	}
2011 
2012 	switch (optname) {
2013 	case IPV6_CHECKSUM:
2014 		/*
2015 		 * For ICMPv6 sockets, no modification allowed for checksum
2016 		 * offset, permit "no change" values to help existing apps.
2017 		 *
2018 		 * RFC3542 says: "An attempt to set IPV6_CHECKSUM
2019 		 * for an ICMPv6 socket will fail."
2020 		 * The current behavior does not meet RFC3542.
2021 		 */
2022 		switch (op) {
2023 		case SOPT_SET:
2024 			if (optlen != sizeof(int)) {
2025 				error = EINVAL;
2026 				break;
2027 			}
2028 			error = sooptcopyin(sopt, &optval, sizeof(optval),
2029 					    sizeof(optval));
2030 			if (error)
2031 				break;
2032 			if ((optval % 2) != 0) {
2033 				/* the API assumes even offset values */
2034 				error = EINVAL;
2035 			} else if (so->so_proto->pr_protocol ==
2036 			    IPPROTO_ICMPV6) {
2037 				if (optval != icmp6off)
2038 					error = EINVAL;
2039 			} else
2040 				in6p->in6p_cksum = optval;
2041 			break;
2042 
2043 		case SOPT_GET:
2044 			if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
2045 				optval = icmp6off;
2046 			else
2047 				optval = in6p->in6p_cksum;
2048 
2049 			error = sooptcopyout(sopt, &optval, sizeof(optval));
2050 			break;
2051 
2052 		default:
2053 			error = EINVAL;
2054 			break;
2055 		}
2056 		break;
2057 
2058 	default:
2059 		error = ENOPROTOOPT;
2060 		break;
2061 	}
2062 
2063 	return (error);
2064 }
2065 
2066 /*
2067  * Set up IP6 options in pcb for insertion in output packets or
2068  * specifying behavior of outgoing packets.
2069  */
2070 static int
2071 ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m,
2072     struct socket *so, struct sockopt *sopt)
2073 {
2074 	struct ip6_pktopts *opt = *pktopt;
2075 	int error = 0;
2076 	struct thread *td = sopt->sopt_td;
2077 
2078 	/* turn off any old options. */
2079 	if (opt) {
2080 #ifdef DIAGNOSTIC
2081 		if (opt->ip6po_pktinfo || opt->ip6po_nexthop ||
2082 		    opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 ||
2083 		    opt->ip6po_rhinfo.ip6po_rhi_rthdr)
2084 			printf("ip6_pcbopts: all specified options are cleared.\n");
2085 #endif
2086 		ip6_clearpktopts(opt, -1);
2087 	} else
2088 		opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK);
2089 	*pktopt = NULL;
2090 
2091 	if (!m || m->m_len == 0) {
2092 		/*
2093 		 * Only turning off any previous options, regardless of
2094 		 * whether the opt is just created or given.
2095 		 */
2096 		free(opt, M_IP6OPT);
2097 		return (0);
2098 	}
2099 
2100 	/*  set options specified by user. */
2101 	if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ?
2102 	    td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) {
2103 		ip6_clearpktopts(opt, -1); /* XXX: discard all options */
2104 		free(opt, M_IP6OPT);
2105 		return (error);
2106 	}
2107 	*pktopt = opt;
2108 	return (0);
2109 }
2110 
2111 /*
2112  * initialize ip6_pktopts.  beware that there are non-zero default values in
2113  * the struct.
2114  */
2115 void
2116 ip6_initpktopts(struct ip6_pktopts *opt)
2117 {
2118 
2119 	bzero(opt, sizeof(*opt));
2120 	opt->ip6po_hlim = -1;	/* -1 means default hop limit */
2121 	opt->ip6po_tclass = -1;	/* -1 means default traffic class */
2122 	opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
2123 	opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
2124 }
2125 
2126 static int
2127 ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
2128     struct ucred *cred, int uproto)
2129 {
2130 	struct ip6_pktopts *opt;
2131 
2132 	if (*pktopt == NULL) {
2133 		*pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT,
2134 		    M_WAITOK);
2135 		ip6_initpktopts(*pktopt);
2136 	}
2137 	opt = *pktopt;
2138 
2139 	return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto));
2140 }
2141 
2142 static int
2143 ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt)
2144 {
2145 	void *optdata = NULL;
2146 	int optdatalen = 0;
2147 	struct ip6_ext *ip6e;
2148 	int error = 0;
2149 	struct in6_pktinfo null_pktinfo;
2150 	int deftclass = 0, on;
2151 	int defminmtu = IP6PO_MINMTU_MCASTONLY;
2152 	int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
2153 
2154 	switch (optname) {
2155 	case IPV6_PKTINFO:
2156 		if (pktopt && pktopt->ip6po_pktinfo)
2157 			optdata = (void *)pktopt->ip6po_pktinfo;
2158 		else {
2159 			/* XXX: we don't have to do this every time... */
2160 			bzero(&null_pktinfo, sizeof(null_pktinfo));
2161 			optdata = (void *)&null_pktinfo;
2162 		}
2163 		optdatalen = sizeof(struct in6_pktinfo);
2164 		break;
2165 	case IPV6_TCLASS:
2166 		if (pktopt && pktopt->ip6po_tclass >= 0)
2167 			optdata = (void *)&pktopt->ip6po_tclass;
2168 		else
2169 			optdata = (void *)&deftclass;
2170 		optdatalen = sizeof(int);
2171 		break;
2172 	case IPV6_HOPOPTS:
2173 		if (pktopt && pktopt->ip6po_hbh) {
2174 			optdata = (void *)pktopt->ip6po_hbh;
2175 			ip6e = (struct ip6_ext *)pktopt->ip6po_hbh;
2176 			optdatalen = (ip6e->ip6e_len + 1) << 3;
2177 		}
2178 		break;
2179 	case IPV6_RTHDR:
2180 		if (pktopt && pktopt->ip6po_rthdr) {
2181 			optdata = (void *)pktopt->ip6po_rthdr;
2182 			ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr;
2183 			optdatalen = (ip6e->ip6e_len + 1) << 3;
2184 		}
2185 		break;
2186 	case IPV6_RTHDRDSTOPTS:
2187 		if (pktopt && pktopt->ip6po_dest1) {
2188 			optdata = (void *)pktopt->ip6po_dest1;
2189 			ip6e = (struct ip6_ext *)pktopt->ip6po_dest1;
2190 			optdatalen = (ip6e->ip6e_len + 1) << 3;
2191 		}
2192 		break;
2193 	case IPV6_DSTOPTS:
2194 		if (pktopt && pktopt->ip6po_dest2) {
2195 			optdata = (void *)pktopt->ip6po_dest2;
2196 			ip6e = (struct ip6_ext *)pktopt->ip6po_dest2;
2197 			optdatalen = (ip6e->ip6e_len + 1) << 3;
2198 		}
2199 		break;
2200 	case IPV6_NEXTHOP:
2201 		if (pktopt && pktopt->ip6po_nexthop) {
2202 			optdata = (void *)pktopt->ip6po_nexthop;
2203 			optdatalen = pktopt->ip6po_nexthop->sa_len;
2204 		}
2205 		break;
2206 	case IPV6_USE_MIN_MTU:
2207 		if (pktopt)
2208 			optdata = (void *)&pktopt->ip6po_minmtu;
2209 		else
2210 			optdata = (void *)&defminmtu;
2211 		optdatalen = sizeof(int);
2212 		break;
2213 	case IPV6_DONTFRAG:
2214 		if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
2215 			on = 1;
2216 		else
2217 			on = 0;
2218 		optdata = (void *)&on;
2219 		optdatalen = sizeof(on);
2220 		break;
2221 	case IPV6_PREFER_TEMPADDR:
2222 		if (pktopt)
2223 			optdata = (void *)&pktopt->ip6po_prefer_tempaddr;
2224 		else
2225 			optdata = (void *)&defpreftemp;
2226 		optdatalen = sizeof(int);
2227 		break;
2228 	default:		/* should not happen */
2229 #ifdef DIAGNOSTIC
2230 		panic("ip6_getpcbopt: unexpected option\n");
2231 #endif
2232 		return (ENOPROTOOPT);
2233 	}
2234 
2235 	error = sooptcopyout(sopt, optdata, optdatalen);
2236 
2237 	return (error);
2238 }
2239 
2240 void
2241 ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
2242 {
2243 	if (pktopt == NULL)
2244 		return;
2245 
2246 	if (optname == -1 || optname == IPV6_PKTINFO) {
2247 		if (pktopt->ip6po_pktinfo)
2248 			free(pktopt->ip6po_pktinfo, M_IP6OPT);
2249 		pktopt->ip6po_pktinfo = NULL;
2250 	}
2251 	if (optname == -1 || optname == IPV6_HOPLIMIT)
2252 		pktopt->ip6po_hlim = -1;
2253 	if (optname == -1 || optname == IPV6_TCLASS)
2254 		pktopt->ip6po_tclass = -1;
2255 	if (optname == -1 || optname == IPV6_NEXTHOP) {
2256 		if (pktopt->ip6po_nextroute.ro_rt) {
2257 			RTFREE(pktopt->ip6po_nextroute.ro_rt);
2258 			pktopt->ip6po_nextroute.ro_rt = NULL;
2259 		}
2260 		if (pktopt->ip6po_nexthop)
2261 			free(pktopt->ip6po_nexthop, M_IP6OPT);
2262 		pktopt->ip6po_nexthop = NULL;
2263 	}
2264 	if (optname == -1 || optname == IPV6_HOPOPTS) {
2265 		if (pktopt->ip6po_hbh)
2266 			free(pktopt->ip6po_hbh, M_IP6OPT);
2267 		pktopt->ip6po_hbh = NULL;
2268 	}
2269 	if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) {
2270 		if (pktopt->ip6po_dest1)
2271 			free(pktopt->ip6po_dest1, M_IP6OPT);
2272 		pktopt->ip6po_dest1 = NULL;
2273 	}
2274 	if (optname == -1 || optname == IPV6_RTHDR) {
2275 		if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
2276 			free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
2277 		pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
2278 		if (pktopt->ip6po_route.ro_rt) {
2279 			RTFREE(pktopt->ip6po_route.ro_rt);
2280 			pktopt->ip6po_route.ro_rt = NULL;
2281 		}
2282 	}
2283 	if (optname == -1 || optname == IPV6_DSTOPTS) {
2284 		if (pktopt->ip6po_dest2)
2285 			free(pktopt->ip6po_dest2, M_IP6OPT);
2286 		pktopt->ip6po_dest2 = NULL;
2287 	}
2288 }
2289 
2290 #define PKTOPT_EXTHDRCPY(type) \
2291 do {\
2292 	if (src->type) {\
2293 		int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
2294 		dst->type = malloc(hlen, M_IP6OPT, canwait);\
2295 		if (dst->type == NULL && canwait == M_NOWAIT)\
2296 			goto bad;\
2297 		bcopy(src->type, dst->type, hlen);\
2298 	}\
2299 } while (/*CONSTCOND*/ 0)
2300 
2301 static int
2302 copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
2303 {
2304 	if (dst == NULL || src == NULL)  {
2305 		printf("ip6_clearpktopts: invalid argument\n");
2306 		return (EINVAL);
2307 	}
2308 
2309 	dst->ip6po_hlim = src->ip6po_hlim;
2310 	dst->ip6po_tclass = src->ip6po_tclass;
2311 	dst->ip6po_flags = src->ip6po_flags;
2312 	if (src->ip6po_pktinfo) {
2313 		dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
2314 		    M_IP6OPT, canwait);
2315 		if (dst->ip6po_pktinfo == NULL)
2316 			goto bad;
2317 		*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
2318 	}
2319 	if (src->ip6po_nexthop) {
2320 		dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
2321 		    M_IP6OPT, canwait);
2322 		if (dst->ip6po_nexthop == NULL)
2323 			goto bad;
2324 		bcopy(src->ip6po_nexthop, dst->ip6po_nexthop,
2325 		    src->ip6po_nexthop->sa_len);
2326 	}
2327 	PKTOPT_EXTHDRCPY(ip6po_hbh);
2328 	PKTOPT_EXTHDRCPY(ip6po_dest1);
2329 	PKTOPT_EXTHDRCPY(ip6po_dest2);
2330 	PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */
2331 	return (0);
2332 
2333   bad:
2334 	ip6_clearpktopts(dst, -1);
2335 	return (ENOBUFS);
2336 }
2337 #undef PKTOPT_EXTHDRCPY
2338 
2339 struct ip6_pktopts *
2340 ip6_copypktopts(struct ip6_pktopts *src, int canwait)
2341 {
2342 	int error;
2343 	struct ip6_pktopts *dst;
2344 
2345 	dst = malloc(sizeof(*dst), M_IP6OPT, canwait);
2346 	if (dst == NULL)
2347 		return (NULL);
2348 	ip6_initpktopts(dst);
2349 
2350 	if ((error = copypktopts(dst, src, canwait)) != 0) {
2351 		free(dst, M_IP6OPT);
2352 		return (NULL);
2353 	}
2354 
2355 	return (dst);
2356 }
2357 
2358 void
2359 ip6_freepcbopts(struct ip6_pktopts *pktopt)
2360 {
2361 	if (pktopt == NULL)
2362 		return;
2363 
2364 	ip6_clearpktopts(pktopt, -1);
2365 
2366 	free(pktopt, M_IP6OPT);
2367 }
2368 
2369 /*
2370  * Set IPv6 outgoing packet options based on advanced API.
2371  */
2372 int
2373 ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
2374     struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto)
2375 {
2376 	struct cmsghdr *cm = 0;
2377 
2378 	if (control == NULL || opt == NULL)
2379 		return (EINVAL);
2380 
2381 	ip6_initpktopts(opt);
2382 	if (stickyopt) {
2383 		int error;
2384 
2385 		/*
2386 		 * If stickyopt is provided, make a local copy of the options
2387 		 * for this particular packet, then override them by ancillary
2388 		 * objects.
2389 		 * XXX: copypktopts() does not copy the cached route to a next
2390 		 * hop (if any).  This is not very good in terms of efficiency,
2391 		 * but we can allow this since this option should be rarely
2392 		 * used.
2393 		 */
2394 		if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
2395 			return (error);
2396 	}
2397 
2398 	/*
2399 	 * XXX: Currently, we assume all the optional information is stored
2400 	 * in a single mbuf.
2401 	 */
2402 	if (control->m_next)
2403 		return (EINVAL);
2404 
2405 	for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len),
2406 	    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
2407 		int error;
2408 
2409 		if (control->m_len < CMSG_LEN(0))
2410 			return (EINVAL);
2411 
2412 		cm = mtod(control, struct cmsghdr *);
2413 		if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len)
2414 			return (EINVAL);
2415 		if (cm->cmsg_level != IPPROTO_IPV6)
2416 			continue;
2417 
2418 		error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
2419 		    cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
2420 		if (error)
2421 			return (error);
2422 	}
2423 
2424 	return (0);
2425 }
2426 
2427 /*
2428  * Set a particular packet option, as a sticky option or an ancillary data
2429  * item.  "len" can be 0 only when it's a sticky option.
2430  * We have 4 cases of combination of "sticky" and "cmsg":
2431  * "sticky=0, cmsg=0": impossible
2432  * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
2433  * "sticky=1, cmsg=0": RFC3542 socket option
2434  * "sticky=1, cmsg=1": RFC2292 socket option
2435  */
2436 static int
2437 ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
2438     struct ucred *cred, int sticky, int cmsg, int uproto)
2439 {
2440 	int minmtupolicy, preftemp;
2441 	int error;
2442 
2443 	if (!sticky && !cmsg) {
2444 #ifdef DIAGNOSTIC
2445 		printf("ip6_setpktopt: impossible case\n");
2446 #endif
2447 		return (EINVAL);
2448 	}
2449 
2450 	/*
2451 	 * IPV6_2292xxx is for backward compatibility to RFC2292, and should
2452 	 * not be specified in the context of RFC3542.  Conversely,
2453 	 * RFC3542 types should not be specified in the context of RFC2292.
2454 	 */
2455 	if (!cmsg) {
2456 		switch (optname) {
2457 		case IPV6_2292PKTINFO:
2458 		case IPV6_2292HOPLIMIT:
2459 		case IPV6_2292NEXTHOP:
2460 		case IPV6_2292HOPOPTS:
2461 		case IPV6_2292DSTOPTS:
2462 		case IPV6_2292RTHDR:
2463 		case IPV6_2292PKTOPTIONS:
2464 			return (ENOPROTOOPT);
2465 		}
2466 	}
2467 	if (sticky && cmsg) {
2468 		switch (optname) {
2469 		case IPV6_PKTINFO:
2470 		case IPV6_HOPLIMIT:
2471 		case IPV6_NEXTHOP:
2472 		case IPV6_HOPOPTS:
2473 		case IPV6_DSTOPTS:
2474 		case IPV6_RTHDRDSTOPTS:
2475 		case IPV6_RTHDR:
2476 		case IPV6_USE_MIN_MTU:
2477 		case IPV6_DONTFRAG:
2478 		case IPV6_TCLASS:
2479 		case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */
2480 			return (ENOPROTOOPT);
2481 		}
2482 	}
2483 
2484 	switch (optname) {
2485 	case IPV6_2292PKTINFO:
2486 	case IPV6_PKTINFO:
2487 	{
2488 		struct ifnet *ifp = NULL;
2489 		struct in6_pktinfo *pktinfo;
2490 
2491 		if (len != sizeof(struct in6_pktinfo))
2492 			return (EINVAL);
2493 
2494 		pktinfo = (struct in6_pktinfo *)buf;
2495 
2496 		/*
2497 		 * An application can clear any sticky IPV6_PKTINFO option by
2498 		 * doing a "regular" setsockopt with ipi6_addr being
2499 		 * in6addr_any and ipi6_ifindex being zero.
2500 		 * [RFC 3542, Section 6]
2501 		 */
2502 		if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
2503 		    pktinfo->ipi6_ifindex == 0 &&
2504 		    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
2505 			ip6_clearpktopts(opt, optname);
2506 			break;
2507 		}
2508 
2509 		if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
2510 		    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
2511 			return (EINVAL);
2512 		}
2513 
2514 		/* validate the interface index if specified. */
2515 		if (pktinfo->ipi6_ifindex > V_if_index ||
2516 		    pktinfo->ipi6_ifindex < 0) {
2517 			 return (ENXIO);
2518 		}
2519 		if (pktinfo->ipi6_ifindex) {
2520 			ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
2521 			if (ifp == NULL)
2522 				return (ENXIO);
2523 		}
2524 
2525 		/*
2526 		 * We store the address anyway, and let in6_selectsrc()
2527 		 * validate the specified address.  This is because ipi6_addr
2528 		 * may not have enough information about its scope zone, and
2529 		 * we may need additional information (such as outgoing
2530 		 * interface or the scope zone of a destination address) to
2531 		 * disambiguate the scope.
2532 		 * XXX: the delay of the validation may confuse the
2533 		 * application when it is used as a sticky option.
2534 		 */
2535 		if (opt->ip6po_pktinfo == NULL) {
2536 			opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
2537 			    M_IP6OPT, M_NOWAIT);
2538 			if (opt->ip6po_pktinfo == NULL)
2539 				return (ENOBUFS);
2540 		}
2541 		bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
2542 		break;
2543 	}
2544 
2545 	case IPV6_2292HOPLIMIT:
2546 	case IPV6_HOPLIMIT:
2547 	{
2548 		int *hlimp;
2549 
2550 		/*
2551 		 * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
2552 		 * to simplify the ordering among hoplimit options.
2553 		 */
2554 		if (optname == IPV6_HOPLIMIT && sticky)
2555 			return (ENOPROTOOPT);
2556 
2557 		if (len != sizeof(int))
2558 			return (EINVAL);
2559 		hlimp = (int *)buf;
2560 		if (*hlimp < -1 || *hlimp > 255)
2561 			return (EINVAL);
2562 
2563 		opt->ip6po_hlim = *hlimp;
2564 		break;
2565 	}
2566 
2567 	case IPV6_TCLASS:
2568 	{
2569 		int tclass;
2570 
2571 		if (len != sizeof(int))
2572 			return (EINVAL);
2573 		tclass = *(int *)buf;
2574 		if (tclass < -1 || tclass > 255)
2575 			return (EINVAL);
2576 
2577 		opt->ip6po_tclass = tclass;
2578 		break;
2579 	}
2580 
2581 	case IPV6_2292NEXTHOP:
2582 	case IPV6_NEXTHOP:
2583 		if (cred != NULL) {
2584 			error = priv_check_cred(cred,
2585 			    PRIV_NETINET_SETHDROPTS, 0);
2586 			if (error)
2587 				return (error);
2588 		}
2589 
2590 		if (len == 0) {	/* just remove the option */
2591 			ip6_clearpktopts(opt, IPV6_NEXTHOP);
2592 			break;
2593 		}
2594 
2595 		/* check if cmsg_len is large enough for sa_len */
2596 		if (len < sizeof(struct sockaddr) || len < *buf)
2597 			return (EINVAL);
2598 
2599 		switch (((struct sockaddr *)buf)->sa_family) {
2600 		case AF_INET6:
2601 		{
2602 			struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
2603 			int error;
2604 
2605 			if (sa6->sin6_len != sizeof(struct sockaddr_in6))
2606 				return (EINVAL);
2607 
2608 			if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
2609 			    IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
2610 				return (EINVAL);
2611 			}
2612 			if ((error = sa6_embedscope(sa6, V_ip6_use_defzone))
2613 			    != 0) {
2614 				return (error);
2615 			}
2616 			break;
2617 		}
2618 		case AF_LINK:	/* should eventually be supported */
2619 		default:
2620 			return (EAFNOSUPPORT);
2621 		}
2622 
2623 		/* turn off the previous option, then set the new option. */
2624 		ip6_clearpktopts(opt, IPV6_NEXTHOP);
2625 		opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
2626 		if (opt->ip6po_nexthop == NULL)
2627 			return (ENOBUFS);
2628 		bcopy(buf, opt->ip6po_nexthop, *buf);
2629 		break;
2630 
2631 	case IPV6_2292HOPOPTS:
2632 	case IPV6_HOPOPTS:
2633 	{
2634 		struct ip6_hbh *hbh;
2635 		int hbhlen;
2636 
2637 		/*
2638 		 * XXX: We don't allow a non-privileged user to set ANY HbH
2639 		 * options, since per-option restriction has too much
2640 		 * overhead.
2641 		 */
2642 		if (cred != NULL) {
2643 			error = priv_check_cred(cred,
2644 			    PRIV_NETINET_SETHDROPTS, 0);
2645 			if (error)
2646 				return (error);
2647 		}
2648 
2649 		if (len == 0) {
2650 			ip6_clearpktopts(opt, IPV6_HOPOPTS);
2651 			break;	/* just remove the option */
2652 		}
2653 
2654 		/* message length validation */
2655 		if (len < sizeof(struct ip6_hbh))
2656 			return (EINVAL);
2657 		hbh = (struct ip6_hbh *)buf;
2658 		hbhlen = (hbh->ip6h_len + 1) << 3;
2659 		if (len != hbhlen)
2660 			return (EINVAL);
2661 
2662 		/* turn off the previous option, then set the new option. */
2663 		ip6_clearpktopts(opt, IPV6_HOPOPTS);
2664 		opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
2665 		if (opt->ip6po_hbh == NULL)
2666 			return (ENOBUFS);
2667 		bcopy(hbh, opt->ip6po_hbh, hbhlen);
2668 
2669 		break;
2670 	}
2671 
2672 	case IPV6_2292DSTOPTS:
2673 	case IPV6_DSTOPTS:
2674 	case IPV6_RTHDRDSTOPTS:
2675 	{
2676 		struct ip6_dest *dest, **newdest = NULL;
2677 		int destlen;
2678 
2679 		if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */
2680 			error = priv_check_cred(cred,
2681 			    PRIV_NETINET_SETHDROPTS, 0);
2682 			if (error)
2683 				return (error);
2684 		}
2685 
2686 		if (len == 0) {
2687 			ip6_clearpktopts(opt, optname);
2688 			break;	/* just remove the option */
2689 		}
2690 
2691 		/* message length validation */
2692 		if (len < sizeof(struct ip6_dest))
2693 			return (EINVAL);
2694 		dest = (struct ip6_dest *)buf;
2695 		destlen = (dest->ip6d_len + 1) << 3;
2696 		if (len != destlen)
2697 			return (EINVAL);
2698 
2699 		/*
2700 		 * Determine the position that the destination options header
2701 		 * should be inserted; before or after the routing header.
2702 		 */
2703 		switch (optname) {
2704 		case IPV6_2292DSTOPTS:
2705 			/*
2706 			 * The old advacned API is ambiguous on this point.
2707 			 * Our approach is to determine the position based
2708 			 * according to the existence of a routing header.
2709 			 * Note, however, that this depends on the order of the
2710 			 * extension headers in the ancillary data; the 1st
2711 			 * part of the destination options header must appear
2712 			 * before the routing header in the ancillary data,
2713 			 * too.
2714 			 * RFC3542 solved the ambiguity by introducing
2715 			 * separate ancillary data or option types.
2716 			 */
2717 			if (opt->ip6po_rthdr == NULL)
2718 				newdest = &opt->ip6po_dest1;
2719 			else
2720 				newdest = &opt->ip6po_dest2;
2721 			break;
2722 		case IPV6_RTHDRDSTOPTS:
2723 			newdest = &opt->ip6po_dest1;
2724 			break;
2725 		case IPV6_DSTOPTS:
2726 			newdest = &opt->ip6po_dest2;
2727 			break;
2728 		}
2729 
2730 		/* turn off the previous option, then set the new option. */
2731 		ip6_clearpktopts(opt, optname);
2732 		*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
2733 		if (*newdest == NULL)
2734 			return (ENOBUFS);
2735 		bcopy(dest, *newdest, destlen);
2736 
2737 		break;
2738 	}
2739 
2740 	case IPV6_2292RTHDR:
2741 	case IPV6_RTHDR:
2742 	{
2743 		struct ip6_rthdr *rth;
2744 		int rthlen;
2745 
2746 		if (len == 0) {
2747 			ip6_clearpktopts(opt, IPV6_RTHDR);
2748 			break;	/* just remove the option */
2749 		}
2750 
2751 		/* message length validation */
2752 		if (len < sizeof(struct ip6_rthdr))
2753 			return (EINVAL);
2754 		rth = (struct ip6_rthdr *)buf;
2755 		rthlen = (rth->ip6r_len + 1) << 3;
2756 		if (len != rthlen)
2757 			return (EINVAL);
2758 
2759 		switch (rth->ip6r_type) {
2760 		case IPV6_RTHDR_TYPE_0:
2761 			if (rth->ip6r_len == 0)	/* must contain one addr */
2762 				return (EINVAL);
2763 			if (rth->ip6r_len % 2) /* length must be even */
2764 				return (EINVAL);
2765 			if (rth->ip6r_len / 2 != rth->ip6r_segleft)
2766 				return (EINVAL);
2767 			break;
2768 		default:
2769 			return (EINVAL);	/* not supported */
2770 		}
2771 
2772 		/* turn off the previous option */
2773 		ip6_clearpktopts(opt, IPV6_RTHDR);
2774 		opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
2775 		if (opt->ip6po_rthdr == NULL)
2776 			return (ENOBUFS);
2777 		bcopy(rth, opt->ip6po_rthdr, rthlen);
2778 
2779 		break;
2780 	}
2781 
2782 	case IPV6_USE_MIN_MTU:
2783 		if (len != sizeof(int))
2784 			return (EINVAL);
2785 		minmtupolicy = *(int *)buf;
2786 		if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
2787 		    minmtupolicy != IP6PO_MINMTU_DISABLE &&
2788 		    minmtupolicy != IP6PO_MINMTU_ALL) {
2789 			return (EINVAL);
2790 		}
2791 		opt->ip6po_minmtu = minmtupolicy;
2792 		break;
2793 
2794 	case IPV6_DONTFRAG:
2795 		if (len != sizeof(int))
2796 			return (EINVAL);
2797 
2798 		if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
2799 			/*
2800 			 * we ignore this option for TCP sockets.
2801 			 * (RFC3542 leaves this case unspecified.)
2802 			 */
2803 			opt->ip6po_flags &= ~IP6PO_DONTFRAG;
2804 		} else
2805 			opt->ip6po_flags |= IP6PO_DONTFRAG;
2806 		break;
2807 
2808 	case IPV6_PREFER_TEMPADDR:
2809 		if (len != sizeof(int))
2810 			return (EINVAL);
2811 		preftemp = *(int *)buf;
2812 		if (preftemp != IP6PO_TEMPADDR_SYSTEM &&
2813 		    preftemp != IP6PO_TEMPADDR_NOTPREFER &&
2814 		    preftemp != IP6PO_TEMPADDR_PREFER) {
2815 			return (EINVAL);
2816 		}
2817 		opt->ip6po_prefer_tempaddr = preftemp;
2818 		break;
2819 
2820 	default:
2821 		return (ENOPROTOOPT);
2822 	} /* end of switch */
2823 
2824 	return (0);
2825 }
2826 
2827 /*
2828  * Routine called from ip6_output() to loop back a copy of an IP6 multicast
2829  * packet to the input queue of a specified interface.  Note that this
2830  * calls the output routine of the loopback "driver", but with an interface
2831  * pointer that might NOT be &loif -- easier than replicating that code here.
2832  */
2833 void
2834 ip6_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in6 *dst)
2835 {
2836 	struct mbuf *copym;
2837 	struct ip6_hdr *ip6;
2838 
2839 	copym = m_copy(m, 0, M_COPYALL);
2840 	if (copym == NULL)
2841 		return;
2842 
2843 	/*
2844 	 * Make sure to deep-copy IPv6 header portion in case the data
2845 	 * is in an mbuf cluster, so that we can safely override the IPv6
2846 	 * header portion later.
2847 	 */
2848 	if ((copym->m_flags & M_EXT) != 0 ||
2849 	    copym->m_len < sizeof(struct ip6_hdr)) {
2850 		copym = m_pullup(copym, sizeof(struct ip6_hdr));
2851 		if (copym == NULL)
2852 			return;
2853 	}
2854 
2855 #ifdef DIAGNOSTIC
2856 	if (copym->m_len < sizeof(*ip6)) {
2857 		m_freem(copym);
2858 		return;
2859 	}
2860 #endif
2861 
2862 	ip6 = mtod(copym, struct ip6_hdr *);
2863 	/*
2864 	 * clear embedded scope identifiers if necessary.
2865 	 * in6_clearscope will touch the addresses only when necessary.
2866 	 */
2867 	in6_clearscope(&ip6->ip6_src);
2868 	in6_clearscope(&ip6->ip6_dst);
2869 
2870 	(void)if_simloop(ifp, copym, dst->sin6_family, 0);
2871 }
2872 
2873 /*
2874  * Chop IPv6 header off from the payload.
2875  */
2876 static int
2877 ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
2878 {
2879 	struct mbuf *mh;
2880 	struct ip6_hdr *ip6;
2881 
2882 	ip6 = mtod(m, struct ip6_hdr *);
2883 	if (m->m_len > sizeof(*ip6)) {
2884 		MGETHDR(mh, M_DONTWAIT, MT_HEADER);
2885 		if (mh == 0) {
2886 			m_freem(m);
2887 			return ENOBUFS;
2888 		}
2889 		M_MOVE_PKTHDR(mh, m);
2890 		MH_ALIGN(mh, sizeof(*ip6));
2891 		m->m_len -= sizeof(*ip6);
2892 		m->m_data += sizeof(*ip6);
2893 		mh->m_next = m;
2894 		m = mh;
2895 		m->m_len = sizeof(*ip6);
2896 		bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6));
2897 	}
2898 	exthdrs->ip6e_ip6 = m;
2899 	return 0;
2900 }
2901 
2902 /*
2903  * Compute IPv6 extension header length.
2904  */
2905 int
2906 ip6_optlen(struct inpcb *in6p)
2907 {
2908 	int len;
2909 
2910 	if (!in6p->in6p_outputopts)
2911 		return 0;
2912 
2913 	len = 0;
2914 #define elen(x) \
2915     (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
2916 
2917 	len += elen(in6p->in6p_outputopts->ip6po_hbh);
2918 	if (in6p->in6p_outputopts->ip6po_rthdr)
2919 		/* dest1 is valid with rthdr only */
2920 		len += elen(in6p->in6p_outputopts->ip6po_dest1);
2921 	len += elen(in6p->in6p_outputopts->ip6po_rthdr);
2922 	len += elen(in6p->in6p_outputopts->ip6po_dest2);
2923 	return len;
2924 #undef elen
2925 }
2926