xref: /freebsd/sys/netinet/ip_output.c (revision c4f02a891fe62fe1277c89859922804ea2c27bcd)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
34  * $FreeBSD$
35  */
36 
37 #include "opt_ipfw.h"
38 #include "opt_ipdn.h"
39 #include "opt_ipdivert.h"
40 #include "opt_ipfilter.h"
41 #include "opt_ipsec.h"
42 #include "opt_mac.h"
43 #include "opt_pfil_hooks.h"
44 #include "opt_random_ip_id.h"
45 #include "opt_mbuf_stress_test.h"
46 
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/kernel.h>
50 #include <sys/mac.h>
51 #include <sys/malloc.h>
52 #include <sys/mbuf.h>
53 #include <sys/protosw.h>
54 #include <sys/socket.h>
55 #include <sys/socketvar.h>
56 #include <sys/sysctl.h>
57 
58 #include <net/if.h>
59 #include <net/route.h>
60 
61 #include <netinet/in.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/ip.h>
64 #include <netinet/in_pcb.h>
65 #include <netinet/in_var.h>
66 #include <netinet/ip_var.h>
67 
68 #ifdef PFIL_HOOKS
69 #include <net/pfil.h>
70 #endif
71 
72 #include <machine/in_cksum.h>
73 
74 static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
75 
76 #ifdef IPSEC
77 #include <netinet6/ipsec.h>
78 #include <netkey/key.h>
79 #ifdef IPSEC_DEBUG
80 #include <netkey/key_debug.h>
81 #else
82 #define	KEYDEBUG(lev,arg)
83 #endif
84 #endif /*IPSEC*/
85 
86 #ifdef FAST_IPSEC
87 #include <netipsec/ipsec.h>
88 #include <netipsec/xform.h>
89 #include <netipsec/key.h>
90 #endif /*FAST_IPSEC*/
91 
92 #include <netinet/ip_fw.h>
93 #include <netinet/ip_dummynet.h>
94 
/*
 * print_ip(x, a, y): debugging helper that printf()s the string x, then
 * the IPv4 address held in a.s_addr as a dotted quad (the value is passed
 * through ntohl() and the octets are extracted most-significant first),
 * then the string y.
 *
 * NOTE(review): the expansion already ends in ';', the arguments are not
 * parenthesized, and `a' is evaluated four times -- pass a plain lvalue
 * without side effects.  The only use visible in this portion of the file
 * sits inside an "#if 0" block.
 */
95 #define print_ip(x, a, y)	 printf("%s %d.%d.%d.%d%s",\
96 				x, (ntohl(a.s_addr)>>24)&0xFF,\
97 				  (ntohl(a.s_addr)>>16)&0xFF,\
98 				  (ntohl(a.s_addr)>>8)&0xFF,\
99 				  (ntohl(a.s_addr))&0xFF, y);
100 
/*
 * Running IP identification counter.  When the kernel is built without
 * RANDOM_IP_ID, ip_output() stores the current value in the header's ip_id
 * field and post-increments it for packets sent without IP_FORWARDING or
 * IP_RAWOUTPUT that do not have IP_DF set; the value is used as-is, with
 * no byte-order conversion.
 */
101 u_short ip_id;
102 
/*
 * Stress-test knob, compiled in only when MBUF_STRESS_TEST is defined.
 * When it is non-zero, ip_output()'s direct-send path (packet fits the
 * interface MTU, or the interface handles fragmentation) passes any packet
 * whose m_pkthdr.len exceeds this value to m_fragment() before calling the
 * interface output routine.  It is initialized to 0 (disabled) and
 * registered read-write (CTLFLAG_RW) via SYSCTL_INT under _net_inet_ip.
 */
103 #ifdef MBUF_STRESS_TEST
104 int mbuf_frag_size = 0;
105 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
106 	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
107 #endif
108 
/*
 * Forward declarations of file-local (static) helpers.
 *
 * Of these, ip_output() as visible here calls:
 *   ip_insertoptions(m, opt, &len) - returns the mbuf to continue with; a
 *       non-zero len written back through the int pointer becomes the new
 *       header length.
 *   ip_mloopback(ifp, m, dst, hlen) - invoked to loop back a copy of an
 *       outgoing multicast packet.
 * The remaining helpers are neither defined nor called in this portion of
 * the file.
 */
109 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
110 static struct ifnet *ip_multicast_if(struct in_addr *, int *);
111 static void	ip_mloopback
112 	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
113 static int	ip_getmoptions
114 	(struct sockopt *, struct ip_moptions *);
115 static int	ip_pcbopts(int, struct mbuf **, struct mbuf *);
116 static int	ip_setmoptions
117 	(struct sockopt *, struct ip_moptions **);
118 
/*
 * Declared without `static', so it has external linkage.  Its definition
 * and callers are not part of this portion of the file.
 */
119 int	ip_optcopy(struct ip *, struct ip *);
120 
121 
/*
 * Array of struct protosw entries; only declared here (extern), and not
 * referenced in this portion of the file.
 * NOTE(review): presumably defined in another source file -- confirm.
 */
122 extern	struct protosw inetsw[];
123 
124 /*
125  * IP output.  The packet in mbuf chain m contains a skeletal IP
126  * header (with len, off, ttl, proto, tos, src, dst).
127  * The mbuf chain containing the packet will be freed.
128  * The mbuf opt, if present, will not be freed.
129  */
130 int
131 ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro,
132 	int flags, struct ip_moptions *imo, struct inpcb *inp)
133 {
134 	struct ip *ip;
135 	struct ifnet *ifp = NULL;	/* keep compiler happy */
136 	struct mbuf *m;
137 	int hlen = sizeof (struct ip);
138 	int len, off, error = 0;
139 	struct sockaddr_in *dst = NULL;	/* keep compiler happy */
140 	struct in_ifaddr *ia = NULL;
141 	int isbroadcast, sw_csum;
142 	struct in_addr pkt_dst;
143 #ifdef IPSEC
144 	struct route iproute;
145 	struct secpolicy *sp = NULL;
146 #endif
147 #ifdef FAST_IPSEC
148 	struct route iproute;
149 	struct m_tag *mtag;
150 	struct secpolicy *sp = NULL;
151 	struct tdb_ident *tdbi;
152 	int s;
153 #endif /* FAST_IPSEC */
154 	struct ip_fw_args args;
155 	int src_was_INADDR_ANY = 0;	/* as the name says... */
156 
157 	args.eh = NULL;
158 	args.rule = NULL;
159 	args.next_hop = NULL;
160 	args.divert_rule = 0;			/* divert cookie */
161 
162 	/* Grab info from MT_TAG mbufs prepended to the chain. */
163 	for (; m0 && m0->m_type == MT_TAG; m0 = m0->m_next) {
164 		switch(m0->_m_tag_id) {
165 		default:
166 			printf("ip_output: unrecognised MT_TAG tag %d\n",
167 			    m0->_m_tag_id);
168 			break;
169 
170 		case PACKET_TAG_DUMMYNET:
171 			/*
172 			 * the packet was already tagged, so part of the
173 			 * processing was already done, and we need to go down.
174 			 * Get parameters from the header.
175 			 */
176 			args.rule = ((struct dn_pkt *)m0)->rule;
177 			opt = NULL ;
178 			ro = & ( ((struct dn_pkt *)m0)->ro ) ;
179 			imo = NULL ;
180 			dst = ((struct dn_pkt *)m0)->dn_dst ;
181 			ifp = ((struct dn_pkt *)m0)->ifp ;
182 			flags = ((struct dn_pkt *)m0)->flags ;
183 			break;
184 
185 		case PACKET_TAG_DIVERT:
186 			args.divert_rule = (intptr_t)m0->m_data & 0xffff;
187 			break;
188 
189 		case PACKET_TAG_IPFORWARD:
190 			args.next_hop = (struct sockaddr_in *)m0->m_data;
191 			break;
192 		}
193 	}
194 	m = m0;
195 
196 	M_ASSERTPKTHDR(m);
197 #ifndef FAST_IPSEC
198 	KASSERT(ro != NULL, ("ip_output: no route, proto %d",
199 	    mtod(m, struct ip *)->ip_p));
200 #endif
201 
202 	if (args.rule != NULL) {	/* dummynet already saw us */
203 		ip = mtod(m, struct ip *);
204 		hlen = ip->ip_hl << 2 ;
205 		if (ro->ro_rt)
206 			ia = ifatoia(ro->ro_rt->rt_ifa);
207 		goto sendit;
208 	}
209 
210 	if (opt) {
211 		len = 0;
212 		m = ip_insertoptions(m, opt, &len);
213 		if (len != 0)
214 			hlen = len;
215 	}
216 	ip = mtod(m, struct ip *);
217 	pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst;
218 
219 	/*
220 	 * Fill in IP header.  If we are not allowing fragmentation,
221 	 * then the ip_id field is meaningless, so send it as zero
222 	 * to reduce information leakage.  Otherwise, if we are not
223 	 * randomizing ip_id, then don't bother to convert it to network
224 	 * byte order -- it's just a nonce.  Note that a 16-bit counter
225 	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
226 	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
227 	 * for Counting NATted Hosts", Proc. IMW'02, available at
228 	 * <http://www.research.att.com/~smb/papers/fnat.pdf>.
229 	 */
230 	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
231 		ip->ip_v = IPVERSION;
232 		ip->ip_hl = hlen >> 2;
233 		if ((ip->ip_off & IP_DF) == 0) {
234 			ip->ip_off = 0;
235 #ifdef RANDOM_IP_ID
236 			ip->ip_id = ip_randomid();
237 #else
238 			ip->ip_id = ip_id++;
239 #endif
240 		} else {
241 			ip->ip_off = IP_DF;
242 			ip->ip_id = 0;
243 		}
244 		ipstat.ips_localout++;
245 	} else {
246 		hlen = ip->ip_hl << 2;
247 	}
248 
249 #ifdef FAST_IPSEC
250 	if (ro == NULL) {
251 		ro = &iproute;
252 		bzero(ro, sizeof (*ro));
253 	}
254 #endif /* FAST_IPSEC */
255 	dst = (struct sockaddr_in *)&ro->ro_dst;
256 	/*
257 	 * If there is a cached route,
258 	 * check that it is to the same destination
259 	 * and is still up.  If not, free it and try again.
260 	 * The address family should also be checked in case of sharing the
261 	 * cache with IPv6.
262 	 */
263 	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
264 			  dst->sin_family != AF_INET ||
265 			  dst->sin_addr.s_addr != pkt_dst.s_addr)) {
266 		RTFREE(ro->ro_rt);
267 		ro->ro_rt = (struct rtentry *)0;
268 	}
269 	if (ro->ro_rt == 0) {
270 		bzero(dst, sizeof(*dst));
271 		dst->sin_family = AF_INET;
272 		dst->sin_len = sizeof(*dst);
273 		dst->sin_addr = pkt_dst;
274 	}
275 	/*
276 	 * If routing to interface only,
277 	 * short circuit routing lookup.
278 	 */
279 	if (flags & IP_ROUTETOIF) {
280 		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 &&
281 		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
282 			ipstat.ips_noroute++;
283 			error = ENETUNREACH;
284 			goto bad;
285 		}
286 		ifp = ia->ia_ifp;
287 		ip->ip_ttl = 1;
288 		isbroadcast = in_broadcast(dst->sin_addr, ifp);
289 	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
290 	    imo != NULL && imo->imo_multicast_ifp != NULL) {
291 		/*
292 		 * Bypass the normal routing lookup for multicast
293 		 * packets if the interface is specified.
294 		 */
295 		ifp = imo->imo_multicast_ifp;
296 		IFP_TO_IA(ifp, ia);
297 		isbroadcast = 0;	/* fool gcc */
298 	} else {
299 		/*
300 		 * If this is the case, we probably don't want to allocate
301 		 * a protocol-cloned route since we didn't get one from the
302 		 * ULP.  This lets TCP do its thing, while not burdening
303 		 * forwarding or ICMP with the overhead of cloning a route.
304 		 * Of course, we still want to do any cloning requested by
305 		 * the link layer, as this is probably required in all cases
306 		 * for correct operation (as it is for ARP).
307 		 */
308 		if (ro->ro_rt == 0)
309 			rtalloc_ign(ro, RTF_PRCLONING);
310 		if (ro->ro_rt == 0) {
311 			ipstat.ips_noroute++;
312 			error = EHOSTUNREACH;
313 			goto bad;
314 		}
315 		ia = ifatoia(ro->ro_rt->rt_ifa);
316 		ifp = ro->ro_rt->rt_ifp;
317 		ro->ro_rt->rt_use++;
318 		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
319 			dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
320 		if (ro->ro_rt->rt_flags & RTF_HOST)
321 			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
322 		else
323 			isbroadcast = in_broadcast(dst->sin_addr, ifp);
324 	}
325 	if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
326 		struct in_multi *inm;
327 
328 		m->m_flags |= M_MCAST;
329 		/*
330 		 * IP destination address is multicast.  Make sure "dst"
331 		 * still points to the address in "ro".  (It may have been
332 		 * changed to point to a gateway address, above.)
333 		 */
334 		dst = (struct sockaddr_in *)&ro->ro_dst;
335 		/*
336 		 * See if the caller provided any multicast options
337 		 */
338 		if (imo != NULL) {
339 			ip->ip_ttl = imo->imo_multicast_ttl;
340 			if (imo->imo_multicast_vif != -1)
341 				ip->ip_src.s_addr =
342 				    ip_mcast_src ?
343 				    ip_mcast_src(imo->imo_multicast_vif) :
344 				    INADDR_ANY;
345 		} else
346 			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
347 		/*
348 		 * Confirm that the outgoing interface supports multicast.
349 		 */
350 		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
351 			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
352 				ipstat.ips_noroute++;
353 				error = ENETUNREACH;
354 				goto bad;
355 			}
356 		}
357 		/*
358 		 * If source address not specified yet, use address
359 		 * of outgoing interface.
360 		 */
361 		if (ip->ip_src.s_addr == INADDR_ANY) {
362 			/* Interface may have no addresses. */
363 			if (ia != NULL)
364 				ip->ip_src = IA_SIN(ia)->sin_addr;
365 		}
366 
367 		if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
368 			/*
369 			 * XXX
370 			 * delayed checksums are not currently
371 			 * compatible with IP multicast routing
372 			 */
373 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
374 				in_delayed_cksum(m);
375 				m->m_pkthdr.csum_flags &=
376 					~CSUM_DELAY_DATA;
377 			}
378 		}
379 		IN_LOOKUP_MULTI(pkt_dst, ifp, inm);
380 		if (inm != NULL &&
381 		   (imo == NULL || imo->imo_multicast_loop)) {
382 			/*
383 			 * If we belong to the destination multicast group
384 			 * on the outgoing interface, and the caller did not
385 			 * forbid loopback, loop back a copy.
386 			 */
387 			ip_mloopback(ifp, m, dst, hlen);
388 		}
389 		else {
390 			/*
391 			 * If we are acting as a multicast router, perform
392 			 * multicast forwarding as if the packet had just
393 			 * arrived on the interface to which we are about
394 			 * to send.  The multicast forwarding function
395 			 * recursively calls this function, using the
396 			 * IP_FORWARDING flag to prevent infinite recursion.
397 			 *
398 			 * Multicasts that are looped back by ip_mloopback(),
399 			 * above, will be forwarded by the ip_input() routine,
400 			 * if necessary.
401 			 */
402 			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
403 				/*
404 				 * If rsvp daemon is not running, do not
405 				 * set ip_moptions. This ensures that the packet
406 				 * is multicast and not just sent down one link
407 				 * as prescribed by rsvpd.
408 				 */
409 				if (!rsvp_on)
410 					imo = NULL;
411 				if (ip_mforward &&
412 				    ip_mforward(ip, ifp, m, imo) != 0) {
413 					m_freem(m);
414 					goto done;
415 				}
416 			}
417 		}
418 
419 		/*
420 		 * Multicasts with a time-to-live of zero may be looped-
421 		 * back, above, but must not be transmitted on a network.
422 		 * Also, multicasts addressed to the loopback interface
423 		 * are not sent -- the above call to ip_mloopback() will
424 		 * loop back a copy if this host actually belongs to the
425 		 * destination group on the loopback interface.
426 		 */
427 		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
428 			m_freem(m);
429 			goto done;
430 		}
431 
432 		goto sendit;
433 	}
434 #ifndef notdef
435 	/*
436 	 * If the source address is not specified yet, use the address
437 	 * of the outgoing interface. In that case, note that we did so, so
438 	 * that if the firewall changes the next-hop, causing the output
439 	 * interface to change, we can fix that.
440 	 */
441 	if (ip->ip_src.s_addr == INADDR_ANY) {
442 		/* Interface may have no addresses. */
443 		if (ia != NULL) {
444 			ip->ip_src = IA_SIN(ia)->sin_addr;
445 			src_was_INADDR_ANY = 1;
446 		}
447 	}
448 #endif /* notdef */
449 	/*
450 	 * Verify that we have any chance at all of being able to queue
451 	 *      the packet or packet fragments
452 	 */
453 	if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
454 		ifp->if_snd.ifq_maxlen) {
455 			error = ENOBUFS;
456 			ipstat.ips_odropped++;
457 			goto bad;
458 	}
459 
460 	/*
461 	 * Look for broadcast address and
462 	 * verify user is allowed to send
463 	 * such a packet.
464 	 */
465 	if (isbroadcast) {
466 		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
467 			error = EADDRNOTAVAIL;
468 			goto bad;
469 		}
470 		if ((flags & IP_ALLOWBROADCAST) == 0) {
471 			error = EACCES;
472 			goto bad;
473 		}
474 		/* don't allow broadcast messages to be fragmented */
475 		if (ip->ip_len > ifp->if_mtu) {
476 			error = EMSGSIZE;
477 			goto bad;
478 		}
479 		if (flags & IP_SENDONES)
480 			ip->ip_dst.s_addr = INADDR_BROADCAST;
481 		m->m_flags |= M_BCAST;
482 	} else {
483 		m->m_flags &= ~M_BCAST;
484 	}
485 
486 sendit:
487 #ifdef IPSEC
488 	/* get SP for this packet */
489 	if (inp == NULL)
490 		sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error);
491 	else
492 		sp = ipsec4_getpolicybypcb(m, IPSEC_DIR_OUTBOUND, inp, &error);
493 
494 	if (sp == NULL) {
495 		ipsecstat.out_inval++;
496 		goto bad;
497 	}
498 
499 	error = 0;
500 
501 	/* check policy */
502 	switch (sp->policy) {
503 	case IPSEC_POLICY_DISCARD:
504 		/*
505 		 * This packet is just discarded.
506 		 */
507 		ipsecstat.out_polvio++;
508 		goto bad;
509 
510 	case IPSEC_POLICY_BYPASS:
511 	case IPSEC_POLICY_NONE:
512 		/* no need to do IPsec. */
513 		goto skip_ipsec;
514 
515 	case IPSEC_POLICY_IPSEC:
516 		if (sp->req == NULL) {
517 			/* acquire a policy */
518 			error = key_spdacquire(sp);
519 			goto bad;
520 		}
521 		break;
522 
523 	case IPSEC_POLICY_ENTRUST:
524 	default:
525 		printf("ip_output: Invalid policy found. %d\n", sp->policy);
526 	}
527     {
528 	struct ipsec_output_state state;
529 	bzero(&state, sizeof(state));
530 	state.m = m;
531 	if (flags & IP_ROUTETOIF) {
532 		state.ro = &iproute;
533 		bzero(&iproute, sizeof(iproute));
534 	} else
535 		state.ro = ro;
536 	state.dst = (struct sockaddr *)dst;
537 
538 	ip->ip_sum = 0;
539 
540 	/*
541 	 * XXX
542 	 * delayed checksums are not currently compatible with IPsec
543 	 */
544 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
545 		in_delayed_cksum(m);
546 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
547 	}
548 
549 	ip->ip_len = htons(ip->ip_len);
550 	ip->ip_off = htons(ip->ip_off);
551 
552 	error = ipsec4_output(&state, sp, flags);
553 
554 	m = state.m;
555 	if (flags & IP_ROUTETOIF) {
556 		/*
557 		 * if we have tunnel mode SA, we may need to ignore
558 		 * IP_ROUTETOIF.
559 		 */
560 		if (state.ro != &iproute || state.ro->ro_rt != NULL) {
561 			flags &= ~IP_ROUTETOIF;
562 			ro = state.ro;
563 		}
564 	} else
565 		ro = state.ro;
566 	dst = (struct sockaddr_in *)state.dst;
567 	if (error) {
568 		/* mbuf is already reclaimed in ipsec4_output. */
569 		m0 = NULL;
570 		switch (error) {
571 		case EHOSTUNREACH:
572 		case ENETUNREACH:
573 		case EMSGSIZE:
574 		case ENOBUFS:
575 		case ENOMEM:
576 			break;
577 		default:
578 			printf("ip4_output (ipsec): error code %d\n", error);
579 			/*fall through*/
580 		case ENOENT:
581 			/* don't show these error codes to the user */
582 			error = 0;
583 			break;
584 		}
585 		goto bad;
586 	}
587     }
588 
589 	/* be sure to update variables that are affected by ipsec4_output() */
590 	ip = mtod(m, struct ip *);
591 	hlen = ip->ip_hl << 2;
592 	if (ro->ro_rt == NULL) {
593 		if ((flags & IP_ROUTETOIF) == 0) {
594 			printf("ip_output: "
595 				"can't update route after IPsec processing\n");
596 			error = EHOSTUNREACH;	/*XXX*/
597 			goto bad;
598 		}
599 	} else {
600 		ia = ifatoia(ro->ro_rt->rt_ifa);
601 		ifp = ro->ro_rt->rt_ifp;
602 	}
603 
604 	/* make it flipped, again. */
605 	ip->ip_len = ntohs(ip->ip_len);
606 	ip->ip_off = ntohs(ip->ip_off);
607 skip_ipsec:
608 #endif /*IPSEC*/
609 #ifdef FAST_IPSEC
610 	/*
611 	 * Check the security policy (SP) for the packet and, if
612 	 * required, do IPsec-related processing.  There are two
613 	 * cases here; the first time a packet is sent through
614 	 * it will be untagged and handled by ipsec4_checkpolicy.
615 	 * If the packet is resubmitted to ip_output (e.g. after
616 	 * AH, ESP, etc. processing), there will be a tag to bypass
617 	 * the lookup and related policy checking.
618 	 */
619 	mtag = m_tag_find(m, PACKET_TAG_IPSEC_PENDING_TDB, NULL);
620 	s = splnet();
621 	if (mtag != NULL) {
622 		tdbi = (struct tdb_ident *)(mtag + 1);
623 		sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND);
624 		if (sp == NULL)
625 			error = -EINVAL;	/* force silent drop */
626 		m_tag_delete(m, mtag);
627 	} else {
628 		sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags,
629 					&error, inp);
630 	}
631 	/*
632 	 * There are four return cases:
633 	 *    sp != NULL	 	    apply IPsec policy
634 	 *    sp == NULL, error == 0	    no IPsec handling needed
635 	 *    sp == NULL, error == -EINVAL  discard packet w/o error
636 	 *    sp == NULL, error != 0	    discard packet, report error
637 	 */
638 	if (sp != NULL) {
639 		/* Loop detection, check if ipsec processing already done */
640 		KASSERT(sp->req != NULL, ("ip_output: no ipsec request"));
641 		for (mtag = m_tag_first(m); mtag != NULL;
642 		     mtag = m_tag_next(m, mtag)) {
643 			if (mtag->m_tag_cookie != MTAG_ABI_COMPAT)
644 				continue;
645 			if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE &&
646 			    mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED)
647 				continue;
648 			/*
649 			 * Check if policy has an SA associated with it.
650 			 * This can happen when an SP has yet to acquire
651 			 * an SA; e.g. on first reference.  If it occurs,
652 			 * then we let ipsec4_process_packet do its thing.
653 			 */
654 			if (sp->req->sav == NULL)
655 				break;
656 			tdbi = (struct tdb_ident *)(mtag + 1);
657 			if (tdbi->spi == sp->req->sav->spi &&
658 			    tdbi->proto == sp->req->sav->sah->saidx.proto &&
659 			    bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst,
660 				 sizeof (union sockaddr_union)) == 0) {
661 				/*
662 				 * No IPsec processing is needed, free
663 				 * reference to SP.
664 				 *
665 				 * NB: null pointer to avoid free at
666 				 *     done: below.
667 				 */
668 				KEY_FREESP(&sp), sp = NULL;
669 				splx(s);
670 				goto spd_done;
671 			}
672 		}
673 
674 		/*
675 		 * Do delayed checksums now because we send before
676 		 * this is done in the normal processing path.
677 		 */
678 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
679 			in_delayed_cksum(m);
680 			m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
681 		}
682 
683 		ip->ip_len = htons(ip->ip_len);
684 		ip->ip_off = htons(ip->ip_off);
685 
686 		/* NB: callee frees mbuf */
687 		error = ipsec4_process_packet(m, sp->req, flags, 0);
688 		/*
689 		 * Preserve KAME behaviour: ENOENT can be returned
690 		 * when an SA acquire is in progress.  Don't propagate
691 		 * this to user-level; it confuses applications.
692 		 *
693 		 * XXX this will go away when the SADB is redone.
694 		 */
695 		if (error == ENOENT)
696 			error = 0;
697 		splx(s);
698 		goto done;
699 	} else {
700 		splx(s);
701 
702 		if (error != 0) {
703 			/*
704 			 * Hack: -EINVAL is used to signal that a packet
705 			 * should be silently discarded.  This is typically
706 			 * because we asked key management for an SA and
707 			 * it was delayed (e.g. kicked up to IKE).
708 			 */
709 			if (error == -EINVAL)
710 				error = 0;
711 			goto bad;
712 		} else {
713 			/* No IPsec processing for this packet. */
714 		}
715 #ifdef notyet
716 		/*
717 		 * If deferred crypto processing is needed, check that
718 		 * the interface supports it.
719 		 */
720 		mtag = m_tag_find(m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL);
721 		if (mtag != NULL && (ifp->if_capenable & IFCAP_IPSEC) == 0) {
722 			/* notify IPsec to do its own crypto */
723 			ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1));
724 			error = EHOSTUNREACH;
725 			goto bad;
726 		}
727 #endif
728 	}
729 spd_done:
730 #endif /* FAST_IPSEC */
731 
732 	/*
733 	 * IpHack's section.
734 	 * - Xlate: translate packet's addr/port (NAT).
735 	 * - Firewall: deny/allow/etc.
736 	 * - Wrap: fake packet's addr/port <unimpl.>
737 	 * - Encapsulate: put it in another IP and send out. <unimp.>
738 	 */
739 #ifdef PFIL_HOOKS
740 	/*
741 	 * Run through list of hooks for output packets.
742 	 */
743 	error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT);
744 	if (error != 0 || m == NULL)
745 		goto done;
746 	ip = mtod(m, struct ip *);
747 #endif /* PFIL_HOOKS */
748 
749 	/*
750 	 * Check with the firewall...
751 	 * but not if we are already being fwd'd from a firewall.
752 	 */
753 	if (fw_enable && IPFW_LOADED && !args.next_hop) {
754 		struct sockaddr_in *old = dst;
755 
756 		args.m = m;
757 		args.next_hop = dst;
758 		args.oif = ifp;
759 		off = ip_fw_chk_ptr(&args);
760 		m = args.m;
761 		dst = args.next_hop;
762 
763                 /*
764 		 * On return we must do the following:
765 		 * m == NULL	-> drop the pkt (old interface, deprecated)
766 		 * (off & IP_FW_PORT_DENY_FLAG)	-> drop the pkt (new interface)
767 		 * 1<=off<= 0xffff		-> DIVERT
768 		 * (off & IP_FW_PORT_DYNT_FLAG)	-> send to a DUMMYNET pipe
769 		 * (off & IP_FW_PORT_TEE_FLAG)	-> TEE the packet
770 		 * dst != old			-> IPFIREWALL_FORWARD
771 		 * off==0, dst==old		-> accept
772 		 * If some of the above modules are not compiled in, then
773 		 * we shouldn't have to check the corresponding condition
774 		 * (because the ipfw control socket should not accept
775 		 * unsupported rules), but better play safe and drop
776 		 * packets in case of doubt.
777 		 */
778 		if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) {
779 			if (m)
780 				m_freem(m);
781 			error = EACCES;
782 			goto done;
783 		}
784 		ip = mtod(m, struct ip *);
785 		if (off == 0 && dst == old)		/* common case */
786 			goto pass;
787                 if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) {
788 			/*
789 			 * pass the pkt to dummynet. Need to include
790 			 * pipe number, m, ifp, ro, dst because these are
791 			 * not recomputed in the next pass.
792 			 * All other parameters have been already used and
793 			 * so they are not needed anymore.
794 			 * XXX note: if the ifp or ro entry are deleted
795 			 * while a pkt is in dummynet, we are in trouble!
796 			 */
797 			args.ro = ro;
798 			args.dst = dst;
799 			args.flags = flags;
800 
801 			error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT,
802 				&args);
803 			goto done;
804 		}
805 #ifdef IPDIVERT
806 		if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) {
807 			struct mbuf *clone = NULL;
808 
809 			/* Clone packet if we're doing a 'tee' */
810 			if ((off & IP_FW_PORT_TEE_FLAG) != 0)
811 				clone = m_dup(m, M_DONTWAIT);
812 
813 			/*
814 			 * XXX
815 			 * delayed checksums are not currently compatible
816 			 * with divert sockets.
817 			 */
818 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
819 				in_delayed_cksum(m);
820 				m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
821 			}
822 
823 			/* Restore packet header fields to original values */
824 			ip->ip_len = htons(ip->ip_len);
825 			ip->ip_off = htons(ip->ip_off);
826 
827 			/* Deliver packet to divert input routine */
828 			divert_packet(m, 0, off & 0xffff, args.divert_rule);
829 
830 			/* If 'tee', continue with original packet */
831 			if (clone != NULL) {
832 				m = clone;
833 				ip = mtod(m, struct ip *);
834 				goto pass;
835 			}
836 			goto done;
837 		}
838 #endif
839 
840 		/* IPFIREWALL_FORWARD */
841 		/*
842 		 * Check dst to make sure it is directly reachable on the
843 		 * interface we previously thought it was.
844 		 * If it isn't (which may be likely in some situations) we have
845 		 * to re-route it (ie, find a route for the next-hop and the
846 		 * associated interface) and set them here. This is nested
847 		 * forwarding which in most cases is undesirable, except where
848 		 * such control is nigh impossible. So we do it here.
849 		 * And I'm babbling.
850 		 */
851 		if (off == 0 && old != dst) { /* FORWARD, dst has changed */
852 #if 0
853 			/*
854 			 * XXX To improve readability, this block should be
855 			 * changed into a function call as below:
856 			 */
857 			error = ip_ipforward(&m, &dst, &ifp);
858 			if (error)
859 				goto bad;
860 			if (m == NULL) /* ip_input consumed the mbuf */
861 				goto done;
862 #else
863 			struct in_ifaddr *ia;
864 
865 			/*
866 			 * XXX sro_fwd below is static, and a pointer
867 			 * to it gets passed to routines downstream.
868 			 * This could have surprisingly bad results in
869 			 * practice, because its content is overwritten
870 			 * by subsequent packets.
871 			 */
872 			/* There must be a better way to do this next line... */
873 			static struct route sro_fwd;
874 			struct route *ro_fwd = &sro_fwd;
875 
876 #if 0
877 			print_ip("IPFIREWALL_FORWARD: New dst ip: ",
878 			    dst->sin_addr, "\n");
879 #endif
880 
881 			/*
882 			 * We need to figure out if we have been forwarded
883 			 * to a local socket. If so, then we should somehow
884 			 * "loop back" to ip_input, and get directed to the
885 			 * PCB as if we had received this packet. This is
886 			 * because it may be difficult to identify the packets
887 			 * you want to forward until they are being output
888 			 * and have selected an interface. (e.g. locally
889 			 * initiated packets) If we used the loopback interface,
890 			 * we would not be able to control what happens
891 			 * as the packet runs through ip_input() as
892 			 * it is done through an ISR.
893 			 */
894 			LIST_FOREACH(ia,
895 			    INADDR_HASH(dst->sin_addr.s_addr), ia_hash) {
896 				/*
897 				 * If the addr to forward to is one
898 				 * of ours, we pretend to
899 				 * be the destination for this packet.
900 				 */
901 				if (IA_SIN(ia)->sin_addr.s_addr ==
902 						 dst->sin_addr.s_addr)
903 					break;
904 			}
905 			if (ia) {	/* tell ip_input "dont filter" */
906 				struct m_hdr tag;
907 
908 				tag.mh_type = MT_TAG;
909 				tag.mh_flags = PACKET_TAG_IPFORWARD;
910 				tag.mh_data = (caddr_t)args.next_hop;
911 				tag.mh_next = m;
912 
913 				if (m->m_pkthdr.rcvif == NULL)
914 					m->m_pkthdr.rcvif = ifunit("lo0");
915 				if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
916 					m->m_pkthdr.csum_flags |=
917 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
918 					m0->m_pkthdr.csum_data = 0xffff;
919 				}
920 				m->m_pkthdr.csum_flags |=
921 				    CSUM_IP_CHECKED | CSUM_IP_VALID;
922 				ip->ip_len = htons(ip->ip_len);
923 				ip->ip_off = htons(ip->ip_off);
924 				ip_input((struct mbuf *)&tag);
925 				goto done;
926 			}
927 			/* Some of the logic for this was
928 			 * nicked from above.
929 			 *
930 			 * This rewrites the cached route in a local PCB.
931 			 * Is this what we want to do?
932 			 */
933 			bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
934 
935 			ro_fwd->ro_rt = 0;
936 			rtalloc_ign(ro_fwd, RTF_PRCLONING);
937 
938 			if (ro_fwd->ro_rt == 0) {
939 				ipstat.ips_noroute++;
940 				error = EHOSTUNREACH;
941 				goto bad;
942 			}
943 
944 			ia = ifatoia(ro_fwd->ro_rt->rt_ifa);
945 			ifp = ro_fwd->ro_rt->rt_ifp;
946 			ro_fwd->ro_rt->rt_use++;
947 			if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
948 				dst = (struct sockaddr_in *)
949 					ro_fwd->ro_rt->rt_gateway;
950 			if (ro_fwd->ro_rt->rt_flags & RTF_HOST)
951 				isbroadcast =
952 				    (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
953 			else
954 				isbroadcast = in_broadcast(dst->sin_addr, ifp);
955 			if (ro->ro_rt)
956 				RTFREE(ro->ro_rt);
957 			ro->ro_rt = ro_fwd->ro_rt;
958 			dst = (struct sockaddr_in *)&ro_fwd->ro_dst;
959 
960 #endif	/* ... block to be put into a function */
961 			/*
962 			 * If we added a default src ip earlier,
963 			 * which would have been gotten from the-then
964 			 * interface, do it again, from the new one.
965 			 */
966 			if (src_was_INADDR_ANY)
967 				ip->ip_src = IA_SIN(ia)->sin_addr;
968 			goto pass ;
969 		}
970 
971                 /*
972                  * if we get here, none of the above matches, and
973                  * we have to drop the pkt
974                  */
975 		m_freem(m);
976                 error = EACCES; /* not sure this is the right error msg */
977                 goto done;
978 	}
979 
980 pass:
981 	/* 127/8 must not appear on wire - RFC1122. */
982 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
983 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
984 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
985 			ipstat.ips_badaddr++;
986 			error = EADDRNOTAVAIL;
987 			goto bad;
988 		}
989 	}
990 
991 	m->m_pkthdr.csum_flags |= CSUM_IP;
992 	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
993 	if (sw_csum & CSUM_DELAY_DATA) {
994 		in_delayed_cksum(m);
995 		sw_csum &= ~CSUM_DELAY_DATA;
996 	}
997 	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
998 
999 	/*
1000 	 * If small enough for interface, or the interface will take
1001 	 * care of the fragmentation for us, can just send directly.
1002 	 */
1003 	if (ip->ip_len <= ifp->if_mtu || ifp->if_hwassist & CSUM_FRAGMENT) {
1004 		ip->ip_len = htons(ip->ip_len);
1005 		ip->ip_off = htons(ip->ip_off);
1006 		ip->ip_sum = 0;
1007 		if (sw_csum & CSUM_DELAY_IP)
1008 			ip->ip_sum = in_cksum(m, hlen);
1009 
1010 		/* Record statistics for this interface address. */
1011 		if (!(flags & IP_FORWARDING) && ia) {
1012 			ia->ia_ifa.if_opackets++;
1013 			ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1014 		}
1015 
1016 #ifdef IPSEC
1017 		/* clean ipsec history once it goes out of the node */
1018 		ipsec_delaux(m);
1019 #endif
1020 
1021 #ifdef MBUF_STRESS_TEST
1022 		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
1023 			m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
1024 #endif
1025 		error = (*ifp->if_output)(ifp, m,
1026 				(struct sockaddr *)dst, ro->ro_rt);
1027 		goto done;
1028 	}
1029 
1030 	if (ip->ip_off & IP_DF) {
1031 		error = EMSGSIZE;
1032 		/*
1033 		 * This case can happen if the user changed the MTU
1034 		 * of an interface after enabling IP on it.  Because
1035 		 * most netifs don't keep track of routes pointing to
1036 		 * them, there is no way for one to update all its
1037 		 * routes when the MTU is changed.
1038 		 */
1039 		if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
1040 		    !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
1041 		    (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
1042 			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
1043 		}
1044 		ipstat.ips_cantfrag++;
1045 		goto bad;
1046 	}
1047 
1048 	/*
1049 	 * Too large for interface; fragment if possible. If successful,
1050 	 * on return, m will point to a list of packets to be sent.
1051 	 */
1052 	error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum);
1053 	if (error)
1054 		goto bad;
1055 	for (; m; m = m0) {
1056 		m0 = m->m_nextpkt;
1057 		m->m_nextpkt = 0;
1058 #ifdef IPSEC
1059 		/* clean ipsec history once it goes out of the node */
1060 		ipsec_delaux(m);
1061 #endif
1062 		if (error == 0) {
1063 			/* Record statistics for this interface address. */
1064 			if (ia != NULL) {
1065 				ia->ia_ifa.if_opackets++;
1066 				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1067 			}
1068 
1069 			error = (*ifp->if_output)(ifp, m,
1070 			    (struct sockaddr *)dst, ro->ro_rt);
1071 		} else
1072 			m_freem(m);
1073 	}
1074 
1075 	if (error == 0)
1076 		ipstat.ips_fragmented++;
1077 
1078 done:
1079 #ifdef IPSEC
1080 	if (ro == &iproute && ro->ro_rt) {
1081 		RTFREE(ro->ro_rt);
1082 		ro->ro_rt = NULL;
1083 	}
1084 	if (sp != NULL) {
1085 		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
1086 			printf("DP ip_output call free SP:%p\n", sp));
1087 		key_freesp(sp);
1088 	}
1089 #endif
1090 #ifdef FAST_IPSEC
1091 	if (ro == &iproute && ro->ro_rt) {
1092 		RTFREE(ro->ro_rt);
1093 		ro->ro_rt = NULL;
1094 	}
1095 	if (sp != NULL)
1096 		KEY_FREESP(&sp);
1097 #endif
1098 	return (error);
1099 bad:
1100 	m_freem(m);
1101 	goto done;
1102 }
1103 
1104 /*
1105  * Create a chain of fragments which fit the given mtu. m_frag points to the
1106  * mbuf to be fragmented; on return it points to the chain with the fragments.
1107  * Return 0 if no error. If error, m_frag may contain a partially built
1108  * chain of fragments that should be freed by the caller.
1109  *
1110  * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
1111  * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
1112  */
1113 int
1114 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
1115 	    u_long if_hwassist_flags, int sw_csum)
1116 {
1117 	int error = 0;
1118 	int hlen = ip->ip_hl << 2;
1119 	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
1120 	int off;
1121 	struct mbuf *m0 = *m_frag;	/* the original packet		*/
1122 	int firstlen;
1123 	struct mbuf **mnext;
1124 	int nfrags;
1125 
1126 	if (ip->ip_off & IP_DF) {	/* Fragmentation not allowed */
1127 		ipstat.ips_cantfrag++;
1128 		return EMSGSIZE;
1129 	}
1130 
1131 	/*
1132 	 * Must be able to put at least 8 bytes per fragment.
1133 	 */
1134 	if (len < 8)
1135 		return EMSGSIZE;
1136 
1137 	/*
1138 	 * If the interface will not calculate checksums on
1139 	 * fragmented packets, then do it here.
1140 	 */
1141 	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
1142 	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
1143 		in_delayed_cksum(m0);
1144 		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1145 	}
1146 
1147 	if (len > PAGE_SIZE) {
1148 		/*
1149 		 * Fragment large datagrams such that each segment
1150 		 * contains a multiple of PAGE_SIZE amount of data,
1151 		 * plus headers. This enables a receiver to perform
1152 		 * page-flipping zero-copy optimizations.
1153 		 *
1154 		 * XXX When does this help given that sender and receiver
1155 		 * could have different page sizes, and also mtu could
1156 		 * be less than the receiver's page size ?
1157 		 */
1158 		int newlen;
1159 		struct mbuf *m;
1160 
1161 		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
1162 			off += m->m_len;
1163 
1164 		/*
1165 		 * firstlen (off - hlen) must be aligned on an
1166 		 * 8-byte boundary
1167 		 */
1168 		if (off < hlen)
1169 			goto smart_frag_failure;
1170 		off = ((off - hlen) & ~7) + hlen;
1171 		newlen = (~PAGE_MASK) & mtu;
1172 		if ((newlen + sizeof (struct ip)) > mtu) {
1173 			/* we failed, go back the default */
1174 smart_frag_failure:
1175 			newlen = len;
1176 			off = hlen + len;
1177 		}
1178 		len = newlen;
1179 
1180 	} else {
1181 		off = hlen + len;
1182 	}
1183 
1184 	firstlen = off - hlen;
1185 	mnext = &m0->m_nextpkt;		/* pointer to next packet */
1186 
1187 	/*
1188 	 * Loop through length of segment after first fragment,
1189 	 * make new header and copy data of each part and link onto chain.
1190 	 * Here, m0 is the original packet, m is the fragment being created.
1191 	 * The fragments are linked off the m_nextpkt of the original
1192 	 * packet, which after processing serves as the first fragment.
1193 	 */
1194 	for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
1195 		struct ip *mhip;	/* ip header on the fragment */
1196 		struct mbuf *m;
1197 		int mhlen = sizeof (struct ip);
1198 
1199 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
1200 		if (m == 0) {
1201 			error = ENOBUFS;
1202 			ipstat.ips_odropped++;
1203 			goto done;
1204 		}
1205 		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
1206 		/*
1207 		 * In the first mbuf, leave room for the link header, then
1208 		 * copy the original IP header including options. The payload
1209 		 * goes into an additional mbuf chain returned by m_copy().
1210 		 */
1211 		m->m_data += max_linkhdr;
1212 		mhip = mtod(m, struct ip *);
1213 		*mhip = *ip;
1214 		if (hlen > sizeof (struct ip)) {
1215 			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
1216 			mhip->ip_v = IPVERSION;
1217 			mhip->ip_hl = mhlen >> 2;
1218 		}
1219 		m->m_len = mhlen;
1220 		/* XXX do we need to add ip->ip_off below ? */
1221 		mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
1222 		if (off + len >= ip->ip_len) {	/* last fragment */
1223 			len = ip->ip_len - off;
1224 			m->m_flags |= M_LASTFRAG;
1225 		} else
1226 			mhip->ip_off |= IP_MF;
1227 		mhip->ip_len = htons((u_short)(len + mhlen));
1228 		m->m_next = m_copy(m0, off, len);
1229 		if (m->m_next == 0) {		/* copy failed */
1230 			m_free(m);
1231 			error = ENOBUFS;	/* ??? */
1232 			ipstat.ips_odropped++;
1233 			goto done;
1234 		}
1235 		m->m_pkthdr.len = mhlen + len;
1236 		m->m_pkthdr.rcvif = (struct ifnet *)0;
1237 #ifdef MAC
1238 		mac_create_fragment(m0, m);
1239 #endif
1240 		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
1241 		mhip->ip_off = htons(mhip->ip_off);
1242 		mhip->ip_sum = 0;
1243 		if (sw_csum & CSUM_DELAY_IP)
1244 			mhip->ip_sum = in_cksum(m, mhlen);
1245 		*mnext = m;
1246 		mnext = &m->m_nextpkt;
1247 	}
1248 	ipstat.ips_ofragments += nfrags;
1249 
1250 	/* set first marker for fragment chain */
1251 	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
1252 	m0->m_pkthdr.csum_data = nfrags;
1253 
1254 	/*
1255 	 * Update first fragment by trimming what's been copied out
1256 	 * and updating header.
1257 	 */
1258 	m_adj(m0, hlen + firstlen - ip->ip_len);
1259 	m0->m_pkthdr.len = hlen + firstlen;
1260 	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
1261 	ip->ip_off |= IP_MF;
1262 	ip->ip_off = htons(ip->ip_off);
1263 	ip->ip_sum = 0;
1264 	if (sw_csum & CSUM_DELAY_IP)
1265 		ip->ip_sum = in_cksum(m0, hlen);
1266 
1267 done:
1268 	*m_frag = m0;
1269 	return error;
1270 }
1271 
1272 void
1273 in_delayed_cksum(struct mbuf *m)
1274 {
1275 	struct ip *ip;
1276 	u_short csum, offset;
1277 
1278 	ip = mtod(m, struct ip *);
1279 	offset = ip->ip_hl << 2 ;
1280 	csum = in_cksum_skip(m, ip->ip_len, offset);
1281 	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
1282 		csum = 0xffff;
1283 	offset += m->m_pkthdr.csum_data;	/* checksum offset */
1284 
1285 	if (offset + sizeof(u_short) > m->m_len) {
1286 		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
1287 		    m->m_len, offset, ip->ip_p);
1288 		/*
1289 		 * XXX
1290 		 * this shouldn't happen, but if it does, the
1291 		 * correct behavior may be to insert the checksum
1292 		 * in the existing chain instead of rearranging it.
1293 		 */
1294 		m = m_pullup(m, offset + sizeof(u_short));
1295 	}
1296 	*(u_short *)(m->m_data + offset) = csum;
1297 }
1298 
1299 /*
1300  * Insert IP options into preformed packet.
1301  * Adjust IP destination as required for IP source routing,
1302  * as indicated by a non-zero in_addr at the start of the options.
1303  *
1304  * XXX This routine assumes that the packet has no options in place.
1305  */
1306 static struct mbuf *
1307 ip_insertoptions(m, opt, phlen)
1308 	register struct mbuf *m;
1309 	struct mbuf *opt;
1310 	int *phlen;
1311 {
1312 	register struct ipoption *p = mtod(opt, struct ipoption *);
1313 	struct mbuf *n;
1314 	register struct ip *ip = mtod(m, struct ip *);
1315 	unsigned optlen;
1316 
1317 	optlen = opt->m_len - sizeof(p->ipopt_dst);
1318 	if (optlen + ip->ip_len > IP_MAXPACKET) {
1319 		*phlen = 0;
1320 		return (m);		/* XXX should fail */
1321 	}
1322 	if (p->ipopt_dst.s_addr)
1323 		ip->ip_dst = p->ipopt_dst;
1324 	if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
1325 		MGETHDR(n, M_DONTWAIT, MT_HEADER);
1326 		if (n == 0) {
1327 			*phlen = 0;
1328 			return (m);
1329 		}
1330 		n->m_pkthdr.rcvif = (struct ifnet *)0;
1331 #ifdef MAC
1332 		mac_create_mbuf_from_mbuf(m, n);
1333 #endif
1334 		n->m_pkthdr.len = m->m_pkthdr.len + optlen;
1335 		m->m_len -= sizeof(struct ip);
1336 		m->m_data += sizeof(struct ip);
1337 		n->m_next = m;
1338 		m = n;
1339 		m->m_len = optlen + sizeof(struct ip);
1340 		m->m_data += max_linkhdr;
1341 		bcopy(ip, mtod(m, void *), sizeof(struct ip));
1342 	} else {
1343 		m->m_data -= optlen;
1344 		m->m_len += optlen;
1345 		m->m_pkthdr.len += optlen;
1346 		bcopy(ip, mtod(m, void *), sizeof(struct ip));
1347 	}
1348 	ip = mtod(m, struct ip *);
1349 	bcopy(p->ipopt_list, ip + 1, optlen);
1350 	*phlen = sizeof(struct ip) + optlen;
1351 	ip->ip_v = IPVERSION;
1352 	ip->ip_hl = *phlen >> 2;
1353 	ip->ip_len += optlen;
1354 	return (m);
1355 }
1356 
1357 /*
1358  * Copy options from ip to jp,
1359  * omitting those not copied during fragmentation.
1360  */
1361 int
1362 ip_optcopy(ip, jp)
1363 	struct ip *ip, *jp;
1364 {
1365 	register u_char *cp, *dp;
1366 	int opt, optlen, cnt;
1367 
1368 	cp = (u_char *)(ip + 1);
1369 	dp = (u_char *)(jp + 1);
1370 	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
1371 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1372 		opt = cp[0];
1373 		if (opt == IPOPT_EOL)
1374 			break;
1375 		if (opt == IPOPT_NOP) {
1376 			/* Preserve for IP mcast tunnel's LSRR alignment. */
1377 			*dp++ = IPOPT_NOP;
1378 			optlen = 1;
1379 			continue;
1380 		}
1381 
1382 		KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp),
1383 		    ("ip_optcopy: malformed ipv4 option"));
1384 		optlen = cp[IPOPT_OLEN];
1385 		KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt,
1386 		    ("ip_optcopy: malformed ipv4 option"));
1387 
1388 		/* bogus lengths should have been caught by ip_dooptions */
1389 		if (optlen > cnt)
1390 			optlen = cnt;
1391 		if (IPOPT_COPIED(opt)) {
1392 			bcopy(cp, dp, optlen);
1393 			dp += optlen;
1394 		}
1395 	}
1396 	for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
1397 		*dp++ = IPOPT_EOL;
1398 	return (optlen);
1399 }
1400 
1401 /*
1402  * IP socket option processing.
1403  */
1404 int
1405 ip_ctloutput(so, sopt)
1406 	struct socket *so;
1407 	struct sockopt *sopt;
1408 {
1409 	struct	inpcb *inp = sotoinpcb(so);
1410 	int	error, optval;
1411 
1412 	error = optval = 0;
1413 	if (sopt->sopt_level != IPPROTO_IP) {
1414 		return (EINVAL);
1415 	}
1416 
1417 	switch (sopt->sopt_dir) {
1418 	case SOPT_SET:
1419 		switch (sopt->sopt_name) {
1420 		case IP_OPTIONS:
1421 #ifdef notyet
1422 		case IP_RETOPTS:
1423 #endif
1424 		{
1425 			struct mbuf *m;
1426 			if (sopt->sopt_valsize > MLEN) {
1427 				error = EMSGSIZE;
1428 				break;
1429 			}
1430 			MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_HEADER);
1431 			if (m == 0) {
1432 				error = ENOBUFS;
1433 				break;
1434 			}
1435 			m->m_len = sopt->sopt_valsize;
1436 			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
1437 					    m->m_len);
1438 
1439 			return (ip_pcbopts(sopt->sopt_name, &inp->inp_options,
1440 					   m));
1441 		}
1442 
1443 		case IP_TOS:
1444 		case IP_TTL:
1445 		case IP_RECVOPTS:
1446 		case IP_RECVRETOPTS:
1447 		case IP_RECVDSTADDR:
1448 		case IP_RECVTTL:
1449 		case IP_RECVIF:
1450 		case IP_FAITH:
1451 		case IP_ONESBCAST:
1452 			error = sooptcopyin(sopt, &optval, sizeof optval,
1453 					    sizeof optval);
1454 			if (error)
1455 				break;
1456 
1457 			switch (sopt->sopt_name) {
1458 			case IP_TOS:
1459 				inp->inp_ip_tos = optval;
1460 				break;
1461 
1462 			case IP_TTL:
1463 				inp->inp_ip_ttl = optval;
1464 				break;
1465 #define	OPTSET(bit) \
1466 	if (optval) \
1467 		inp->inp_flags |= bit; \
1468 	else \
1469 		inp->inp_flags &= ~bit;
1470 
1471 			case IP_RECVOPTS:
1472 				OPTSET(INP_RECVOPTS);
1473 				break;
1474 
1475 			case IP_RECVRETOPTS:
1476 				OPTSET(INP_RECVRETOPTS);
1477 				break;
1478 
1479 			case IP_RECVDSTADDR:
1480 				OPTSET(INP_RECVDSTADDR);
1481 				break;
1482 
1483 			case IP_RECVTTL:
1484 				OPTSET(INP_RECVTTL);
1485 				break;
1486 
1487 			case IP_RECVIF:
1488 				OPTSET(INP_RECVIF);
1489 				break;
1490 
1491 			case IP_FAITH:
1492 				OPTSET(INP_FAITH);
1493 				break;
1494 
1495 			case IP_ONESBCAST:
1496 				OPTSET(INP_ONESBCAST);
1497 				break;
1498 			}
1499 			break;
1500 #undef OPTSET
1501 
1502 		case IP_MULTICAST_IF:
1503 		case IP_MULTICAST_VIF:
1504 		case IP_MULTICAST_TTL:
1505 		case IP_MULTICAST_LOOP:
1506 		case IP_ADD_MEMBERSHIP:
1507 		case IP_DROP_MEMBERSHIP:
1508 			error = ip_setmoptions(sopt, &inp->inp_moptions);
1509 			break;
1510 
1511 		case IP_PORTRANGE:
1512 			error = sooptcopyin(sopt, &optval, sizeof optval,
1513 					    sizeof optval);
1514 			if (error)
1515 				break;
1516 
1517 			switch (optval) {
1518 			case IP_PORTRANGE_DEFAULT:
1519 				inp->inp_flags &= ~(INP_LOWPORT);
1520 				inp->inp_flags &= ~(INP_HIGHPORT);
1521 				break;
1522 
1523 			case IP_PORTRANGE_HIGH:
1524 				inp->inp_flags &= ~(INP_LOWPORT);
1525 				inp->inp_flags |= INP_HIGHPORT;
1526 				break;
1527 
1528 			case IP_PORTRANGE_LOW:
1529 				inp->inp_flags &= ~(INP_HIGHPORT);
1530 				inp->inp_flags |= INP_LOWPORT;
1531 				break;
1532 
1533 			default:
1534 				error = EINVAL;
1535 				break;
1536 			}
1537 			break;
1538 
1539 #if defined(IPSEC) || defined(FAST_IPSEC)
1540 		case IP_IPSEC_POLICY:
1541 		{
1542 			caddr_t req;
1543 			size_t len = 0;
1544 			int priv;
1545 			struct mbuf *m;
1546 			int optname;
1547 
1548 			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
1549 				break;
1550 			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
1551 				break;
1552 			priv = (sopt->sopt_td != NULL &&
1553 				suser(sopt->sopt_td) != 0) ? 0 : 1;
1554 			req = mtod(m, caddr_t);
1555 			len = m->m_len;
1556 			optname = sopt->sopt_name;
1557 			error = ipsec4_set_policy(inp, optname, req, len, priv);
1558 			m_freem(m);
1559 			break;
1560 		}
1561 #endif /*IPSEC*/
1562 
1563 		default:
1564 			error = ENOPROTOOPT;
1565 			break;
1566 		}
1567 		break;
1568 
1569 	case SOPT_GET:
1570 		switch (sopt->sopt_name) {
1571 		case IP_OPTIONS:
1572 		case IP_RETOPTS:
1573 			if (inp->inp_options)
1574 				error = sooptcopyout(sopt,
1575 						     mtod(inp->inp_options,
1576 							  char *),
1577 						     inp->inp_options->m_len);
1578 			else
1579 				sopt->sopt_valsize = 0;
1580 			break;
1581 
1582 		case IP_TOS:
1583 		case IP_TTL:
1584 		case IP_RECVOPTS:
1585 		case IP_RECVRETOPTS:
1586 		case IP_RECVDSTADDR:
1587 		case IP_RECVTTL:
1588 		case IP_RECVIF:
1589 		case IP_PORTRANGE:
1590 		case IP_FAITH:
1591 		case IP_ONESBCAST:
1592 			switch (sopt->sopt_name) {
1593 
1594 			case IP_TOS:
1595 				optval = inp->inp_ip_tos;
1596 				break;
1597 
1598 			case IP_TTL:
1599 				optval = inp->inp_ip_ttl;
1600 				break;
1601 
1602 #define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1603 
1604 			case IP_RECVOPTS:
1605 				optval = OPTBIT(INP_RECVOPTS);
1606 				break;
1607 
1608 			case IP_RECVRETOPTS:
1609 				optval = OPTBIT(INP_RECVRETOPTS);
1610 				break;
1611 
1612 			case IP_RECVDSTADDR:
1613 				optval = OPTBIT(INP_RECVDSTADDR);
1614 				break;
1615 
1616 			case IP_RECVTTL:
1617 				optval = OPTBIT(INP_RECVTTL);
1618 				break;
1619 
1620 			case IP_RECVIF:
1621 				optval = OPTBIT(INP_RECVIF);
1622 				break;
1623 
1624 			case IP_PORTRANGE:
1625 				if (inp->inp_flags & INP_HIGHPORT)
1626 					optval = IP_PORTRANGE_HIGH;
1627 				else if (inp->inp_flags & INP_LOWPORT)
1628 					optval = IP_PORTRANGE_LOW;
1629 				else
1630 					optval = 0;
1631 				break;
1632 
1633 			case IP_FAITH:
1634 				optval = OPTBIT(INP_FAITH);
1635 				break;
1636 
1637 			case IP_ONESBCAST:
1638 				optval = OPTBIT(INP_ONESBCAST);
1639 				break;
1640 			}
1641 			error = sooptcopyout(sopt, &optval, sizeof optval);
1642 			break;
1643 
1644 		case IP_MULTICAST_IF:
1645 		case IP_MULTICAST_VIF:
1646 		case IP_MULTICAST_TTL:
1647 		case IP_MULTICAST_LOOP:
1648 		case IP_ADD_MEMBERSHIP:
1649 		case IP_DROP_MEMBERSHIP:
1650 			error = ip_getmoptions(sopt, inp->inp_moptions);
1651 			break;
1652 
1653 #if defined(IPSEC) || defined(FAST_IPSEC)
1654 		case IP_IPSEC_POLICY:
1655 		{
1656 			struct mbuf *m = NULL;
1657 			caddr_t req = NULL;
1658 			size_t len = 0;
1659 
1660 			if (m != 0) {
1661 				req = mtod(m, caddr_t);
1662 				len = m->m_len;
1663 			}
1664 			error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
1665 			if (error == 0)
1666 				error = soopt_mcopyout(sopt, m); /* XXX */
1667 			if (error == 0)
1668 				m_freem(m);
1669 			break;
1670 		}
1671 #endif /*IPSEC*/
1672 
1673 		default:
1674 			error = ENOPROTOOPT;
1675 			break;
1676 		}
1677 		break;
1678 	}
1679 	return (error);
1680 }
1681 
1682 /*
1683  * Set up IP options in pcb for insertion in output packets.
1684  * Store in mbuf with pointer in pcbopt, adding pseudo-option
1685  * with destination address if source routed.
1686  */
1687 static int
1688 ip_pcbopts(optname, pcbopt, m)
1689 	int optname;
1690 	struct mbuf **pcbopt;
1691 	register struct mbuf *m;
1692 {
1693 	register int cnt, optlen;
1694 	register u_char *cp;
1695 	u_char opt;
1696 
1697 	/* turn off any old options */
1698 	if (*pcbopt)
1699 		(void)m_free(*pcbopt);
1700 	*pcbopt = 0;
1701 	if (m == (struct mbuf *)0 || m->m_len == 0) {
1702 		/*
1703 		 * Only turning off any previous options.
1704 		 */
1705 		if (m)
1706 			(void)m_free(m);
1707 		return (0);
1708 	}
1709 
1710 	if (m->m_len % sizeof(int32_t))
1711 		goto bad;
1712 	/*
1713 	 * IP first-hop destination address will be stored before
1714 	 * actual options; move other options back
1715 	 * and clear it when none present.
1716 	 */
1717 	if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
1718 		goto bad;
1719 	cnt = m->m_len;
1720 	m->m_len += sizeof(struct in_addr);
1721 	cp = mtod(m, u_char *) + sizeof(struct in_addr);
1722 	bcopy(mtod(m, void *), cp, (unsigned)cnt);
1723 	bzero(mtod(m, void *), sizeof(struct in_addr));
1724 
1725 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1726 		opt = cp[IPOPT_OPTVAL];
1727 		if (opt == IPOPT_EOL)
1728 			break;
1729 		if (opt == IPOPT_NOP)
1730 			optlen = 1;
1731 		else {
1732 			if (cnt < IPOPT_OLEN + sizeof(*cp))
1733 				goto bad;
1734 			optlen = cp[IPOPT_OLEN];
1735 			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
1736 				goto bad;
1737 		}
1738 		switch (opt) {
1739 
1740 		default:
1741 			break;
1742 
1743 		case IPOPT_LSRR:
1744 		case IPOPT_SSRR:
1745 			/*
1746 			 * user process specifies route as:
1747 			 *	->A->B->C->D
1748 			 * D must be our final destination (but we can't
1749 			 * check that since we may not have connected yet).
1750 			 * A is first hop destination, which doesn't appear in
1751 			 * actual IP option, but is stored before the options.
1752 			 */
1753 			if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
1754 				goto bad;
1755 			m->m_len -= sizeof(struct in_addr);
1756 			cnt -= sizeof(struct in_addr);
1757 			optlen -= sizeof(struct in_addr);
1758 			cp[IPOPT_OLEN] = optlen;
1759 			/*
1760 			 * Move first hop before start of options.
1761 			 */
1762 			bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
1763 			    sizeof(struct in_addr));
1764 			/*
1765 			 * Then copy rest of options back
1766 			 * to close up the deleted entry.
1767 			 */
1768 			bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)),
1769 			    &cp[IPOPT_OFFSET+1],
1770 			    (unsigned)cnt + sizeof(struct in_addr));
1771 			break;
1772 		}
1773 	}
1774 	if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
1775 		goto bad;
1776 	*pcbopt = m;
1777 	return (0);
1778 
1779 bad:
1780 	(void)m_free(m);
1781 	return (EINVAL);
1782 }
1783 
1784 /*
1785  * XXX
1786  * The whole multicast option thing needs to be re-thought.
1787  * Several of these options are equally applicable to non-multicast
1788  * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
1789  * standard option (IP_TTL).
1790  */
1791 
1792 /*
1793  * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1794  */
1795 static struct ifnet *
1796 ip_multicast_if(a, ifindexp)
1797 	struct in_addr *a;
1798 	int *ifindexp;
1799 {
1800 	int ifindex;
1801 	struct ifnet *ifp;
1802 
1803 	if (ifindexp)
1804 		*ifindexp = 0;
1805 	if (ntohl(a->s_addr) >> 24 == 0) {
1806 		ifindex = ntohl(a->s_addr) & 0xffffff;
1807 		if (ifindex < 0 || if_index < ifindex)
1808 			return NULL;
1809 		ifp = ifnet_byindex(ifindex);
1810 		if (ifindexp)
1811 			*ifindexp = ifindex;
1812 	} else {
1813 		INADDR_TO_IFP(*a, ifp);
1814 	}
1815 	return ifp;
1816 }
1817 
1818 /*
1819  * Set the IP multicast options in response to user setsockopt().
1820  */
1821 static int
1822 ip_setmoptions(sopt, imop)
1823 	struct sockopt *sopt;
1824 	struct ip_moptions **imop;
1825 {
1826 	int error = 0;
1827 	int i;
1828 	struct in_addr addr;
1829 	struct ip_mreq mreq;
1830 	struct ifnet *ifp;
1831 	struct ip_moptions *imo = *imop;
1832 	struct route ro;
1833 	struct sockaddr_in *dst;
1834 	int ifindex;
1835 	int s;
1836 
1837 	if (imo == NULL) {
1838 		/*
1839 		 * No multicast option buffer attached to the pcb;
1840 		 * allocate one and initialize to default values.
1841 		 */
1842 		imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS,
1843 		    M_WAITOK);
1844 
1845 		if (imo == NULL)
1846 			return (ENOBUFS);
1847 		*imop = imo;
1848 		imo->imo_multicast_ifp = NULL;
1849 		imo->imo_multicast_addr.s_addr = INADDR_ANY;
1850 		imo->imo_multicast_vif = -1;
1851 		imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1852 		imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1853 		imo->imo_num_memberships = 0;
1854 	}
1855 
1856 	switch (sopt->sopt_name) {
1857 	/* store an index number for the vif you wanna use in the send */
1858 	case IP_MULTICAST_VIF:
1859 		if (legal_vif_num == 0) {
1860 			error = EOPNOTSUPP;
1861 			break;
1862 		}
1863 		error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
1864 		if (error)
1865 			break;
1866 		if (!legal_vif_num(i) && (i != -1)) {
1867 			error = EINVAL;
1868 			break;
1869 		}
1870 		imo->imo_multicast_vif = i;
1871 		break;
1872 
1873 	case IP_MULTICAST_IF:
1874 		/*
1875 		 * Select the interface for outgoing multicast packets.
1876 		 */
1877 		error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
1878 		if (error)
1879 			break;
1880 		/*
1881 		 * INADDR_ANY is used to remove a previous selection.
1882 		 * When no interface is selected, a default one is
1883 		 * chosen every time a multicast packet is sent.
1884 		 */
1885 		if (addr.s_addr == INADDR_ANY) {
1886 			imo->imo_multicast_ifp = NULL;
1887 			break;
1888 		}
1889 		/*
1890 		 * The selected interface is identified by its local
1891 		 * IP address.  Find the interface and confirm that
1892 		 * it supports multicasting.
1893 		 */
1894 		s = splimp();
1895 		ifp = ip_multicast_if(&addr, &ifindex);
1896 		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1897 			splx(s);
1898 			error = EADDRNOTAVAIL;
1899 			break;
1900 		}
1901 		imo->imo_multicast_ifp = ifp;
1902 		if (ifindex)
1903 			imo->imo_multicast_addr = addr;
1904 		else
1905 			imo->imo_multicast_addr.s_addr = INADDR_ANY;
1906 		splx(s);
1907 		break;
1908 
1909 	case IP_MULTICAST_TTL:
1910 		/*
1911 		 * Set the IP time-to-live for outgoing multicast packets.
1912 		 * The original multicast API required a char argument,
1913 		 * which is inconsistent with the rest of the socket API.
1914 		 * We allow either a char or an int.
1915 		 */
1916 		if (sopt->sopt_valsize == 1) {
1917 			u_char ttl;
1918 			error = sooptcopyin(sopt, &ttl, 1, 1);
1919 			if (error)
1920 				break;
1921 			imo->imo_multicast_ttl = ttl;
1922 		} else {
1923 			u_int ttl;
1924 			error = sooptcopyin(sopt, &ttl, sizeof ttl,
1925 					    sizeof ttl);
1926 			if (error)
1927 				break;
1928 			if (ttl > 255)
1929 				error = EINVAL;
1930 			else
1931 				imo->imo_multicast_ttl = ttl;
1932 		}
1933 		break;
1934 
1935 	case IP_MULTICAST_LOOP:
1936 		/*
1937 		 * Set the loopback flag for outgoing multicast packets.
1938 		 * Must be zero or one.  The original multicast API required a
1939 		 * char argument, which is inconsistent with the rest
1940 		 * of the socket API.  We allow either a char or an int.
1941 		 */
1942 		if (sopt->sopt_valsize == 1) {
1943 			u_char loop;
1944 			error = sooptcopyin(sopt, &loop, 1, 1);
1945 			if (error)
1946 				break;
1947 			imo->imo_multicast_loop = !!loop;
1948 		} else {
1949 			u_int loop;
1950 			error = sooptcopyin(sopt, &loop, sizeof loop,
1951 					    sizeof loop);
1952 			if (error)
1953 				break;
1954 			imo->imo_multicast_loop = !!loop;
1955 		}
1956 		break;
1957 
1958 	case IP_ADD_MEMBERSHIP:
1959 		/*
1960 		 * Add a multicast group membership.
1961 		 * Group must be a valid IP multicast address.
1962 		 */
1963 		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1964 		if (error)
1965 			break;
1966 
1967 		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1968 			error = EINVAL;
1969 			break;
1970 		}
1971 		s = splimp();
1972 		/*
1973 		 * If no interface address was provided, use the interface of
1974 		 * the route to the given multicast address.
1975 		 */
1976 		if (mreq.imr_interface.s_addr == INADDR_ANY) {
1977 			bzero((caddr_t)&ro, sizeof(ro));
1978 			dst = (struct sockaddr_in *)&ro.ro_dst;
1979 			dst->sin_len = sizeof(*dst);
1980 			dst->sin_family = AF_INET;
1981 			dst->sin_addr = mreq.imr_multiaddr;
1982 			rtalloc(&ro);
1983 			if (ro.ro_rt == NULL) {
1984 				error = EADDRNOTAVAIL;
1985 				splx(s);
1986 				break;
1987 			}
1988 			ifp = ro.ro_rt->rt_ifp;
1989 			rtfree(ro.ro_rt);
1990 		}
1991 		else {
1992 			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1993 		}
1994 
1995 		/*
1996 		 * See if we found an interface, and confirm that it
1997 		 * supports multicast.
1998 		 */
1999 		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
2000 			error = EADDRNOTAVAIL;
2001 			splx(s);
2002 			break;
2003 		}
2004 		/*
2005 		 * See if the membership already exists or if all the
2006 		 * membership slots are full.
2007 		 */
2008 		for (i = 0; i < imo->imo_num_memberships; ++i) {
2009 			if (imo->imo_membership[i]->inm_ifp == ifp &&
2010 			    imo->imo_membership[i]->inm_addr.s_addr
2011 						== mreq.imr_multiaddr.s_addr)
2012 				break;
2013 		}
2014 		if (i < imo->imo_num_memberships) {
2015 			error = EADDRINUSE;
2016 			splx(s);
2017 			break;
2018 		}
2019 		if (i == IP_MAX_MEMBERSHIPS) {
2020 			error = ETOOMANYREFS;
2021 			splx(s);
2022 			break;
2023 		}
2024 		/*
2025 		 * Everything looks good; add a new record to the multicast
2026 		 * address list for the given interface.
2027 		 */
2028 		if ((imo->imo_membership[i] =
2029 		    in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) {
2030 			error = ENOBUFS;
2031 			splx(s);
2032 			break;
2033 		}
2034 		++imo->imo_num_memberships;
2035 		splx(s);
2036 		break;
2037 
2038 	case IP_DROP_MEMBERSHIP:
2039 		/*
2040 		 * Drop a multicast group membership.
2041 		 * Group must be a valid IP multicast address.
2042 		 */
2043 		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
2044 		if (error)
2045 			break;
2046 
2047 		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
2048 			error = EINVAL;
2049 			break;
2050 		}
2051 
2052 		s = splimp();
2053 		/*
2054 		 * If an interface address was specified, get a pointer
2055 		 * to its ifnet structure.
2056 		 */
2057 		if (mreq.imr_interface.s_addr == INADDR_ANY)
2058 			ifp = NULL;
2059 		else {
2060 			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
2061 			if (ifp == NULL) {
2062 				error = EADDRNOTAVAIL;
2063 				splx(s);
2064 				break;
2065 			}
2066 		}
2067 		/*
2068 		 * Find the membership in the membership array.
2069 		 */
2070 		for (i = 0; i < imo->imo_num_memberships; ++i) {
2071 			if ((ifp == NULL ||
2072 			     imo->imo_membership[i]->inm_ifp == ifp) &&
2073 			     imo->imo_membership[i]->inm_addr.s_addr ==
2074 			     mreq.imr_multiaddr.s_addr)
2075 				break;
2076 		}
2077 		if (i == imo->imo_num_memberships) {
2078 			error = EADDRNOTAVAIL;
2079 			splx(s);
2080 			break;
2081 		}
2082 		/*
2083 		 * Give up the multicast address record to which the
2084 		 * membership points.
2085 		 */
2086 		in_delmulti(imo->imo_membership[i]);
2087 		/*
2088 		 * Remove the gap in the membership array.
2089 		 */
2090 		for (++i; i < imo->imo_num_memberships; ++i)
2091 			imo->imo_membership[i-1] = imo->imo_membership[i];
2092 		--imo->imo_num_memberships;
2093 		splx(s);
2094 		break;
2095 
2096 	default:
2097 		error = EOPNOTSUPP;
2098 		break;
2099 	}
2100 
2101 	/*
2102 	 * If all options have default values, no need to keep the mbuf.
2103 	 */
2104 	if (imo->imo_multicast_ifp == NULL &&
2105 	    imo->imo_multicast_vif == -1 &&
2106 	    imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
2107 	    imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
2108 	    imo->imo_num_memberships == 0) {
2109 		free(*imop, M_IPMOPTS);
2110 		*imop = NULL;
2111 	}
2112 
2113 	return (error);
2114 }
2115 
2116 /*
2117  * Return the IP multicast options in response to user getsockopt().
2118  */
2119 static int
2120 ip_getmoptions(sopt, imo)
2121 	struct sockopt *sopt;
2122 	register struct ip_moptions *imo;
2123 {
2124 	struct in_addr addr;
2125 	struct in_ifaddr *ia;
2126 	int error, optval;
2127 	u_char coptval;
2128 
2129 	error = 0;
2130 	switch (sopt->sopt_name) {
2131 	case IP_MULTICAST_VIF:
2132 		if (imo != NULL)
2133 			optval = imo->imo_multicast_vif;
2134 		else
2135 			optval = -1;
2136 		error = sooptcopyout(sopt, &optval, sizeof optval);
2137 		break;
2138 
2139 	case IP_MULTICAST_IF:
2140 		if (imo == NULL || imo->imo_multicast_ifp == NULL)
2141 			addr.s_addr = INADDR_ANY;
2142 		else if (imo->imo_multicast_addr.s_addr) {
2143 			/* return the value user has set */
2144 			addr = imo->imo_multicast_addr;
2145 		} else {
2146 			IFP_TO_IA(imo->imo_multicast_ifp, ia);
2147 			addr.s_addr = (ia == NULL) ? INADDR_ANY
2148 				: IA_SIN(ia)->sin_addr.s_addr;
2149 		}
2150 		error = sooptcopyout(sopt, &addr, sizeof addr);
2151 		break;
2152 
2153 	case IP_MULTICAST_TTL:
2154 		if (imo == 0)
2155 			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
2156 		else
2157 			optval = coptval = imo->imo_multicast_ttl;
2158 		if (sopt->sopt_valsize == 1)
2159 			error = sooptcopyout(sopt, &coptval, 1);
2160 		else
2161 			error = sooptcopyout(sopt, &optval, sizeof optval);
2162 		break;
2163 
2164 	case IP_MULTICAST_LOOP:
2165 		if (imo == 0)
2166 			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
2167 		else
2168 			optval = coptval = imo->imo_multicast_loop;
2169 		if (sopt->sopt_valsize == 1)
2170 			error = sooptcopyout(sopt, &coptval, 1);
2171 		else
2172 			error = sooptcopyout(sopt, &optval, sizeof optval);
2173 		break;
2174 
2175 	default:
2176 		error = ENOPROTOOPT;
2177 		break;
2178 	}
2179 	return (error);
2180 }
2181 
2182 /*
2183  * Discard the IP multicast options.
2184  */
2185 void
2186 ip_freemoptions(imo)
2187 	register struct ip_moptions *imo;
2188 {
2189 	register int i;
2190 
2191 	if (imo != NULL) {
2192 		for (i = 0; i < imo->imo_num_memberships; ++i)
2193 			in_delmulti(imo->imo_membership[i]);
2194 		free(imo, M_IPMOPTS);
2195 	}
2196 }
2197 
2198 /*
2199  * Routine called from ip_output() to loop back a copy of an IP multicast
2200  * packet to the input queue of a specified interface.  Note that this
2201  * calls the output routine of the loopback "driver", but with an interface
2202  * pointer that might NOT be a loopback interface -- evil, but easier than
2203  * replicating that code here.
2204  */
2205 static void
2206 ip_mloopback(ifp, m, dst, hlen)
2207 	struct ifnet *ifp;
2208 	register struct mbuf *m;
2209 	register struct sockaddr_in *dst;
2210 	int hlen;
2211 {
2212 	register struct ip *ip;
2213 	struct mbuf *copym;
2214 
2215 	copym = m_copy(m, 0, M_COPYALL);
2216 	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
2217 		copym = m_pullup(copym, hlen);
2218 	if (copym != NULL) {
2219 		/*
2220 		 * We don't bother to fragment if the IP length is greater
2221 		 * than the interface's MTU.  Can this possibly matter?
2222 		 */
2223 		ip = mtod(copym, struct ip *);
2224 		ip->ip_len = htons(ip->ip_len);
2225 		ip->ip_off = htons(ip->ip_off);
2226 		ip->ip_sum = 0;
2227 		ip->ip_sum = in_cksum(copym, hlen);
2228 		/*
2229 		 * NB:
2230 		 * It's not clear whether there are any lingering
2231 		 * reentrancy problems in other areas which might
2232 		 * be exposed by using ip_input directly (in
2233 		 * particular, everything which modifies the packet
2234 		 * in-place).  Yet another option is using the
2235 		 * protosw directly to deliver the looped back
2236 		 * packet.  For the moment, we'll err on the side
2237 		 * of safety by using if_simloop().
2238 		 */
2239 #if 1 /* XXX */
2240 		if (dst->sin_family != AF_INET) {
2241 			printf("ip_mloopback: bad address family %d\n",
2242 						dst->sin_family);
2243 			dst->sin_family = AF_INET;
2244 		}
2245 #endif
2246 
2247 #ifdef notdef
2248 		copym->m_pkthdr.rcvif = ifp;
2249 		ip_input(copym);
2250 #else
2251 		/* if the checksum hasn't been computed, mark it as valid */
2252 		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
2253 			copym->m_pkthdr.csum_flags |=
2254 			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
2255 			copym->m_pkthdr.csum_data = 0xffff;
2256 		}
2257 		if_simloop(ifp, copym, dst->sin_family, 0);
2258 #endif
2259 	}
2260 }
2261