xref: /freebsd/sys/netinet/ip_output.c (revision 0f8f86b71f022b803e99151c19db81b280f245dc)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
34  * $FreeBSD$
35  */
36 
37 #include "opt_ipfw.h"
38 #include "opt_ipdn.h"
39 #include "opt_ipdivert.h"
40 #include "opt_ipfilter.h"
41 #include "opt_ipsec.h"
42 #include "opt_mac.h"
43 #include "opt_pfil_hooks.h"
44 #include "opt_random_ip_id.h"
45 #include "opt_mbuf_stress_test.h"
46 
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/kernel.h>
50 #include <sys/mac.h>
51 #include <sys/malloc.h>
52 #include <sys/mbuf.h>
53 #include <sys/protosw.h>
54 #include <sys/socket.h>
55 #include <sys/socketvar.h>
56 #include <sys/sysctl.h>
57 
58 #include <net/if.h>
59 #include <net/route.h>
60 
61 #include <netinet/in.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/ip.h>
64 #include <netinet/in_pcb.h>
65 #include <netinet/in_var.h>
66 #include <netinet/ip_var.h>
67 
68 #ifdef PFIL_HOOKS
69 #include <net/pfil.h>
70 #endif
71 
72 #include <machine/in_cksum.h>
73 
74 static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
75 
76 #ifdef IPSEC
77 #include <netinet6/ipsec.h>
78 #include <netkey/key.h>
79 #ifdef IPSEC_DEBUG
80 #include <netkey/key_debug.h>
81 #else
82 #define	KEYDEBUG(lev,arg)
83 #endif
84 #endif /*IPSEC*/
85 
86 #ifdef FAST_IPSEC
87 #include <netipsec/ipsec.h>
88 #include <netipsec/xform.h>
89 #include <netipsec/key.h>
90 #endif /*FAST_IPSEC*/
91 
92 #include <netinet/ip_fw.h>
93 #include <netinet/ip_divert.h>
94 #include <netinet/ip_dummynet.h>
95 
/*
 * Debugging helper: print the string `x', a space, the IPv4 address in
 * `a' as a dotted quad, and then the string `y'.
 *
 * `a' must be an expression with an s_addr member (e.g. a struct
 * in_addr); s_addr is passed through ntohl() before the four octets are
 * extracted, most significant first.
 *
 * The expansion is a single printf() call expression with no trailing
 * semicolon, so the macro behaves like a function call: it yields
 * printf()'s return value and is safe as the body of an unbraced
 * if/else.  Every argument is parenthesized; note that `a' is still
 * evaluated four times, so it must not have side effects.  The octets
 * are cast to int to match the %d conversions.
 */
#define print_ip(x, a, y)						\
	printf("%s %d.%d.%d.%d%s", (x),					\
	    (int)((ntohl((a).s_addr) >> 24) & 0xFF),			\
	    (int)((ntohl((a).s_addr) >> 16) & 0xFF),			\
	    (int)((ntohl((a).s_addr) >> 8) & 0xFF),			\
	    (int)(ntohl((a).s_addr) & 0xFF), (y))
101 
/*
 * Counter used by ip_output() to fill in the IP header identification
 * field when RANDOM_IP_ID is not defined: its current value is stored
 * (via htons()) and then post-incremented for each packet that is sent
 * without IP_FORWARDING or IP_RAWOUTPUT set.
 */
102 u_short ip_id;
103 
104 #ifdef MBUF_STRESS_TEST
/*
 * Stress-test knob, exported read/write as a sysctl under _net_inet_ip.
 * When non-zero, ip_output() passes any packet on its direct-send path
 * whose m_pkthdr.len exceeds this value through m_fragment() with this
 * size before calling the interface output routine.  The default of 0
 * leaves packets untouched.
 */
105 int mbuf_frag_size = 0;
106 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
107 	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
108 #endif
109 
/*
 * Forward declarations of the static (file-local) helpers.
 */
/*
 * Called by ip_output() when an options mbuf is supplied; returns the
 * mbuf to continue with and reports a length through the int pointer
 * (a non-zero value becomes the new header length in ip_output()).
 */
110 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
111 static struct ifnet *ip_multicast_if(struct in_addr *, int *);
/*
 * Called by ip_output() to loop back a copy of an outgoing multicast
 * packet when the host is a member of the destination group on the
 * outgoing interface and the caller did not forbid loopback.
 */
112 static void	ip_mloopback
113 	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
114 static int	ip_getmoptions
115 	(struct sockopt *, struct ip_moptions *);
116 static int	ip_pcbopts(int, struct mbuf **, struct mbuf *);
117 static int	ip_setmoptions
118 	(struct sockopt *, struct ip_moptions **);
119 
120 int	ip_optcopy(struct ip *, struct ip *);
121 
122 
123 extern	struct protosw inetsw[];
124 
125 /*
126  * IP output.  The packet in mbuf chain m contains a skeletal IP
127  * header (with len, off, ttl, proto, tos, src, dst).
128  * The mbuf chain containing the packet will be freed.
129  * The mbuf opt, if present, will not be freed.
130  * In the IP forwarding case, the packet will arrive with options already
131  * inserted, so must have a NULL opt pointer.
132  */
133 int
134 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro,
135 	int flags, struct ip_moptions *imo, struct inpcb *inp)
136 {
137 	struct ip *ip;
138 	struct ifnet *ifp = NULL;	/* keep compiler happy */
139 	struct mbuf *m0;
140 	int hlen = sizeof (struct ip);
141 	int len, off, error = 0;
142 	struct sockaddr_in *dst = NULL;	/* keep compiler happy */
143 	struct in_ifaddr *ia = NULL;
144 	int isbroadcast, sw_csum;
145 	struct in_addr pkt_dst;
146 	struct route iproute;
147 	struct m_tag *mtag, *dummytag;
148 #ifdef IPSEC
149 	struct secpolicy *sp = NULL;
150 #endif
151 #ifdef FAST_IPSEC
152 	struct secpolicy *sp = NULL;
153 	struct tdb_ident *tdbi;
154 	int s;
155 #endif /* FAST_IPSEC */
156 	struct ip_fw_args args;
157 	int src_was_INADDR_ANY = 0;	/* as the name says... */
158 
159 	args.eh = NULL;
160 	args.rule = NULL;
161 
162 	M_ASSERTPKTHDR(m);
163 
164 	args.next_hop = ip_claim_next_hop(m);
165 	dummytag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
166 	if (dummytag != NULL) {
167 		struct dn_pkt_tag *dt = (struct dn_pkt_tag *)(dummytag+1);
168 		/*
169 		 * Prevent lower layers from finding the tag
170 		 * Cleanup and free is done below
171 		 */
172 		m_tag_unlink(m, dummytag);
173 		/*
174 		 * the packet was already tagged, so part of the
175 		 * processing was already done, and we need to go down.
176 		 * Get parameters from the header.
177 		 */
178 		args.rule = dt->rule;
179 		ro = &(dt->ro);
180 		dst = dt->dn_dst;
181 		ifp = dt->ifp;
182 	}
183 
184 	if (ro == NULL) {
185 		ro = &iproute;
186 		bzero(ro, sizeof (*ro));
187 	}
188 
189 	if (inp != NULL)
190 		INP_LOCK_ASSERT(inp);
191 
192 	if (args.rule != NULL) {	/* dummynet already saw us */
193 		ip = mtod(m, struct ip *);
194 		hlen = ip->ip_hl << 2 ;
195 		if (ro->ro_rt)
196 			ia = ifatoia(ro->ro_rt->rt_ifa);
197 		goto sendit;
198 	}
199 
200 	if (opt) {
201 		len = 0;
202 		m = ip_insertoptions(m, opt, &len);
203 		if (len != 0)
204 			hlen = len;
205 	}
206 	ip = mtod(m, struct ip *);
207 	pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst;
208 
209 	/*
210 	 * Fill in IP header.  If we are not allowing fragmentation,
211 	 * then the ip_id field is meaningless, but we don't set it
212 	 * to zero.  Doing so causes various problems when devices along
213 	 * the path (routers, load balancers, firewalls, etc.) illegally
214 	 * disable DF on our packet.  Note that a 16-bit counter
215 	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
216 	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
217 	 * for Counting NATted Hosts", Proc. IMW'02, available at
218 	 * <http://www.research.att.com/~smb/papers/fnat.pdf>.
219 	 */
220 	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
221 		ip->ip_v = IPVERSION;
222 		ip->ip_hl = hlen >> 2;
223 #ifdef RANDOM_IP_ID
224 		ip->ip_id = ip_randomid();
225 #else
226 		ip->ip_id = htons(ip_id++);
227 #endif
228 		ipstat.ips_localout++;
229 	} else {
230 		hlen = ip->ip_hl << 2;
231 	}
232 
233 	dst = (struct sockaddr_in *)&ro->ro_dst;
234 	/*
235 	 * If there is a cached route,
236 	 * check that it is to the same destination
237 	 * and is still up.  If not, free it and try again.
238 	 * The address family should also be checked in case of sharing the
239 	 * cache with IPv6.
240 	 */
241 	if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
242 			  dst->sin_family != AF_INET ||
243 			  dst->sin_addr.s_addr != pkt_dst.s_addr)) {
244 		RTFREE(ro->ro_rt);
245 		ro->ro_rt = (struct rtentry *)0;
246 	}
247 	if (ro->ro_rt == 0) {
248 		bzero(dst, sizeof(*dst));
249 		dst->sin_family = AF_INET;
250 		dst->sin_len = sizeof(*dst);
251 		dst->sin_addr = pkt_dst;
252 	}
253 	/*
254 	 * If routing to interface only,
255 	 * short circuit routing lookup.
256 	 */
257 	if (flags & IP_ROUTETOIF) {
258 		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 &&
259 		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
260 			ipstat.ips_noroute++;
261 			error = ENETUNREACH;
262 			goto bad;
263 		}
264 		ifp = ia->ia_ifp;
265 		ip->ip_ttl = 1;
266 		isbroadcast = in_broadcast(dst->sin_addr, ifp);
267 	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
268 	    imo != NULL && imo->imo_multicast_ifp != NULL) {
269 		/*
270 		 * Bypass the normal routing lookup for multicast
271 		 * packets if the interface is specified.
272 		 */
273 		ifp = imo->imo_multicast_ifp;
274 		IFP_TO_IA(ifp, ia);
275 		isbroadcast = 0;	/* fool gcc */
276 	} else {
277 		/*
278 		 * We want to do any cloning requested by the link layer,
279 		 * as this is probably required in all cases for correct
280 		 * operation (as it is for ARP).
281 		 */
282 		if (ro->ro_rt == 0)
283 			rtalloc(ro);
284 		if (ro->ro_rt == 0) {
285 			ipstat.ips_noroute++;
286 			error = EHOSTUNREACH;
287 			goto bad;
288 		}
289 		ia = ifatoia(ro->ro_rt->rt_ifa);
290 		ifp = ro->ro_rt->rt_ifp;
291 		ro->ro_rt->rt_rmx.rmx_pksent++;
292 		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
293 			dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
294 		if (ro->ro_rt->rt_flags & RTF_HOST)
295 			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
296 		else
297 			isbroadcast = in_broadcast(dst->sin_addr, ifp);
298 	}
299 	if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
300 		struct in_multi *inm;
301 
302 		m->m_flags |= M_MCAST;
303 		/*
304 		 * IP destination address is multicast.  Make sure "dst"
305 		 * still points to the address in "ro".  (It may have been
306 		 * changed to point to a gateway address, above.)
307 		 */
308 		dst = (struct sockaddr_in *)&ro->ro_dst;
309 		/*
310 		 * See if the caller provided any multicast options
311 		 */
312 		if (imo != NULL) {
313 			ip->ip_ttl = imo->imo_multicast_ttl;
314 			if (imo->imo_multicast_vif != -1)
315 				ip->ip_src.s_addr =
316 				    ip_mcast_src ?
317 				    ip_mcast_src(imo->imo_multicast_vif) :
318 				    INADDR_ANY;
319 		} else
320 			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
321 		/*
322 		 * Confirm that the outgoing interface supports multicast.
323 		 */
324 		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
325 			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
326 				ipstat.ips_noroute++;
327 				error = ENETUNREACH;
328 				goto bad;
329 			}
330 		}
331 		/*
332 		 * If source address not specified yet, use address
333 		 * of outgoing interface.
334 		 */
335 		if (ip->ip_src.s_addr == INADDR_ANY) {
336 			/* Interface may have no addresses. */
337 			if (ia != NULL)
338 				ip->ip_src = IA_SIN(ia)->sin_addr;
339 		}
340 
341 		if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
342 			/*
343 			 * XXX
344 			 * delayed checksums are not currently
345 			 * compatible with IP multicast routing
346 			 */
347 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
348 				in_delayed_cksum(m);
349 				m->m_pkthdr.csum_flags &=
350 					~CSUM_DELAY_DATA;
351 			}
352 		}
353 		IN_LOOKUP_MULTI(pkt_dst, ifp, inm);
354 		if (inm != NULL &&
355 		   (imo == NULL || imo->imo_multicast_loop)) {
356 			/*
357 			 * If we belong to the destination multicast group
358 			 * on the outgoing interface, and the caller did not
359 			 * forbid loopback, loop back a copy.
360 			 */
361 			ip_mloopback(ifp, m, dst, hlen);
362 		}
363 		else {
364 			/*
365 			 * If we are acting as a multicast router, perform
366 			 * multicast forwarding as if the packet had just
367 			 * arrived on the interface to which we are about
368 			 * to send.  The multicast forwarding function
369 			 * recursively calls this function, using the
370 			 * IP_FORWARDING flag to prevent infinite recursion.
371 			 *
372 			 * Multicasts that are looped back by ip_mloopback(),
373 			 * above, will be forwarded by the ip_input() routine,
374 			 * if necessary.
375 			 */
376 			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
377 				/*
378 				 * If rsvp daemon is not running, do not
379 				 * set ip_moptions. This ensures that the packet
380 				 * is multicast and not just sent down one link
381 				 * as prescribed by rsvpd.
382 				 */
383 				if (!rsvp_on)
384 					imo = NULL;
385 				if (ip_mforward &&
386 				    ip_mforward(ip, ifp, m, imo) != 0) {
387 					m_freem(m);
388 					goto done;
389 				}
390 			}
391 		}
392 
393 		/*
394 		 * Multicasts with a time-to-live of zero may be looped-
395 		 * back, above, but must not be transmitted on a network.
396 		 * Also, multicasts addressed to the loopback interface
397 		 * are not sent -- the above call to ip_mloopback() will
398 		 * loop back a copy if this host actually belongs to the
399 		 * destination group on the loopback interface.
400 		 */
401 		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
402 			m_freem(m);
403 			goto done;
404 		}
405 
406 		goto sendit;
407 	}
408 #ifndef notdef
409 	/*
410 	 * If the source address is not specified yet, use the address
411 	 * of the outgoing interface. In that case, keep note we did that, so
412 	 * if the firewall changes the next-hop causing the output
413 	 * interface to change, we can fix that.
414 	 */
415 	if (ip->ip_src.s_addr == INADDR_ANY) {
416 		/* Interface may have no addresses. */
417 		if (ia != NULL) {
418 			ip->ip_src = IA_SIN(ia)->sin_addr;
419 			src_was_INADDR_ANY = 1;
420 		}
421 	}
422 #endif /* notdef */
423 	/*
424 	 * Verify that we have any chance at all of being able to queue
425 	 *      the packet or packet fragments
426 	 */
427 	if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
428 		ifp->if_snd.ifq_maxlen) {
429 			error = ENOBUFS;
430 			ipstat.ips_odropped++;
431 			goto bad;
432 	}
433 
434 	/*
435 	 * Look for broadcast address and
436 	 * verify user is allowed to send
437 	 * such a packet.
438 	 */
439 	if (isbroadcast) {
440 		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
441 			error = EADDRNOTAVAIL;
442 			goto bad;
443 		}
444 		if ((flags & IP_ALLOWBROADCAST) == 0) {
445 			error = EACCES;
446 			goto bad;
447 		}
448 		/* don't allow broadcast messages to be fragmented */
449 		if (ip->ip_len > ifp->if_mtu) {
450 			error = EMSGSIZE;
451 			goto bad;
452 		}
453 		if (flags & IP_SENDONES)
454 			ip->ip_dst.s_addr = INADDR_BROADCAST;
455 		m->m_flags |= M_BCAST;
456 	} else {
457 		m->m_flags &= ~M_BCAST;
458 	}
459 
460 sendit:
461 #ifdef IPSEC
462 	/* get SP for this packet */
463 	if (inp == NULL)
464 		sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
465 		    flags, &error);
466 	else
467 		sp = ipsec4_getpolicybypcb(m, IPSEC_DIR_OUTBOUND, inp, &error);
468 
469 	if (sp == NULL) {
470 		ipsecstat.out_inval++;
471 		goto bad;
472 	}
473 
474 	error = 0;
475 
476 	/* check policy */
477 	switch (sp->policy) {
478 	case IPSEC_POLICY_DISCARD:
479 		/*
480 		 * This packet is just discarded.
481 		 */
482 		ipsecstat.out_polvio++;
483 		goto bad;
484 
485 	case IPSEC_POLICY_BYPASS:
486 	case IPSEC_POLICY_NONE:
487 	case IPSEC_POLICY_TCP:
488 		/* no need to do IPsec. */
489 		goto skip_ipsec;
490 
491 	case IPSEC_POLICY_IPSEC:
492 		if (sp->req == NULL) {
493 			/* acquire a policy */
494 			error = key_spdacquire(sp);
495 			goto bad;
496 		}
497 		break;
498 
499 	case IPSEC_POLICY_ENTRUST:
500 	default:
501 		printf("ip_output: Invalid policy found. %d\n", sp->policy);
502 	}
503     {
504 	struct ipsec_output_state state;
505 	bzero(&state, sizeof(state));
506 	state.m = m;
507 	if (flags & IP_ROUTETOIF) {
508 		state.ro = &iproute;
509 		bzero(&iproute, sizeof(iproute));
510 	} else
511 		state.ro = ro;
512 	state.dst = (struct sockaddr *)dst;
513 
514 	ip->ip_sum = 0;
515 
516 	/*
517 	 * XXX
518 	 * delayed checksums are not currently compatible with IPsec
519 	 */
520 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
521 		in_delayed_cksum(m);
522 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
523 	}
524 
525 	ip->ip_len = htons(ip->ip_len);
526 	ip->ip_off = htons(ip->ip_off);
527 
528 	error = ipsec4_output(&state, sp, flags);
529 
530 	m = state.m;
531 	if (flags & IP_ROUTETOIF) {
532 		/*
533 		 * if we have tunnel mode SA, we may need to ignore
534 		 * IP_ROUTETOIF.
535 		 */
536 		if (state.ro != &iproute || state.ro->ro_rt != NULL) {
537 			flags &= ~IP_ROUTETOIF;
538 			ro = state.ro;
539 		}
540 	} else
541 		ro = state.ro;
542 	dst = (struct sockaddr_in *)state.dst;
543 	if (error) {
544 		/* mbuf is already reclaimed in ipsec4_output. */
545 		m = NULL;
546 		switch (error) {
547 		case EHOSTUNREACH:
548 		case ENETUNREACH:
549 		case EMSGSIZE:
550 		case ENOBUFS:
551 		case ENOMEM:
552 			break;
553 		default:
554 			printf("ip4_output (ipsec): error code %d\n", error);
555 			/*fall through*/
556 		case ENOENT:
557 			/* don't show these error codes to the user */
558 			error = 0;
559 			break;
560 		}
561 		goto bad;
562 	}
563 
564 	/* be sure to update variables that are affected by ipsec4_output() */
565 	ip = mtod(m, struct ip *);
566 	hlen = ip->ip_hl << 2;
567 	if (ro->ro_rt == NULL) {
568 		if ((flags & IP_ROUTETOIF) == 0) {
569 			printf("ip_output: "
570 				"can't update route after IPsec processing\n");
571 			error = EHOSTUNREACH;	/*XXX*/
572 			goto bad;
573 		}
574 	} else {
575 		if (state.encap) {
576 			ia = ifatoia(ro->ro_rt->rt_ifa);
577 			ifp = ro->ro_rt->rt_ifp;
578 		}
579 	}
580     }
581 
582 	/* make it flipped, again. */
583 	ip->ip_len = ntohs(ip->ip_len);
584 	ip->ip_off = ntohs(ip->ip_off);
585 skip_ipsec:
586 #endif /*IPSEC*/
587 #ifdef FAST_IPSEC
588 	/*
589 	 * Check the security policy (SP) for the packet and, if
590 	 * required, do IPsec-related processing.  There are two
591 	 * cases here; the first time a packet is sent through
592 	 * it will be untagged and handled by ipsec4_checkpolicy.
593 	 * If the packet is resubmitted to ip_output (e.g. after
594 	 * AH, ESP, etc. processing), there will be a tag to bypass
595 	 * the lookup and related policy checking.
596 	 */
597 	mtag = m_tag_find(m, PACKET_TAG_IPSEC_PENDING_TDB, NULL);
598 	s = splnet();
599 	if (mtag != NULL) {
600 		tdbi = (struct tdb_ident *)(mtag + 1);
601 		sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND);
602 		if (sp == NULL)
603 			error = -EINVAL;	/* force silent drop */
604 		m_tag_delete(m, mtag);
605 	} else {
606 		sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags,
607 					&error, inp);
608 	}
609 	/*
610 	 * There are four return cases:
611 	 *    sp != NULL	 	    apply IPsec policy
612 	 *    sp == NULL, error == 0	    no IPsec handling needed
613 	 *    sp == NULL, error == -EINVAL  discard packet w/o error
614 	 *    sp == NULL, error != 0	    discard packet, report error
615 	 */
616 	if (sp != NULL) {
617 		/* Loop detection, check if ipsec processing already done */
618 		KASSERT(sp->req != NULL, ("ip_output: no ipsec request"));
619 		for (mtag = m_tag_first(m); mtag != NULL;
620 		     mtag = m_tag_next(m, mtag)) {
621 			if (mtag->m_tag_cookie != MTAG_ABI_COMPAT)
622 				continue;
623 			if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE &&
624 			    mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED)
625 				continue;
626 			/*
627 			 * Check if policy has an SA associated with it.
628 			 * This can happen when an SP has yet to acquire
629 			 * an SA; e.g. on first reference.  If it occurs,
630 			 * then we let ipsec4_process_packet do its thing.
631 			 */
632 			if (sp->req->sav == NULL)
633 				break;
634 			tdbi = (struct tdb_ident *)(mtag + 1);
635 			if (tdbi->spi == sp->req->sav->spi &&
636 			    tdbi->proto == sp->req->sav->sah->saidx.proto &&
637 			    bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst,
638 				 sizeof (union sockaddr_union)) == 0) {
639 				/*
640 				 * No IPsec processing is needed, free
641 				 * reference to SP.
642 				 *
643 				 * NB: null pointer to avoid free at
644 				 *     done: below.
645 				 */
646 				KEY_FREESP(&sp), sp = NULL;
647 				splx(s);
648 				goto spd_done;
649 			}
650 		}
651 
652 		/*
653 		 * Do delayed checksums now because we send before
654 		 * this is done in the normal processing path.
655 		 */
656 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
657 			in_delayed_cksum(m);
658 			m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
659 		}
660 
661 		ip->ip_len = htons(ip->ip_len);
662 		ip->ip_off = htons(ip->ip_off);
663 
664 		/* NB: callee frees mbuf */
665 		error = ipsec4_process_packet(m, sp->req, flags, 0);
666 		/*
667 		 * Preserve KAME behaviour: ENOENT can be returned
668 		 * when an SA acquire is in progress.  Don't propagate
669 		 * this to user-level; it confuses applications.
670 		 *
671 		 * XXX this will go away when the SADB is redone.
672 		 */
673 		if (error == ENOENT)
674 			error = 0;
675 		splx(s);
676 		goto done;
677 	} else {
678 		splx(s);
679 
680 		if (error != 0) {
681 			/*
682 			 * Hack: -EINVAL is used to signal that a packet
683 			 * should be silently discarded.  This is typically
684 			 * because we asked key management for an SA and
685 			 * it was delayed (e.g. kicked up to IKE).
686 			 */
687 			if (error == -EINVAL)
688 				error = 0;
689 			goto bad;
690 		} else {
691 			/* No IPsec processing for this packet. */
692 		}
693 #ifdef notyet
694 		/*
695 		 * If deferred crypto processing is needed, check that
696 		 * the interface supports it.
697 		 */
698 		mtag = m_tag_find(m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL);
699 		if (mtag != NULL && (ifp->if_capenable & IFCAP_IPSEC) == 0) {
700 			/* notify IPsec to do its own crypto */
701 			ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1));
702 			error = EHOSTUNREACH;
703 			goto bad;
704 		}
705 #endif
706 	}
707 spd_done:
708 #endif /* FAST_IPSEC */
709 
710 	/*
711 	 * IpHack's section.
712 	 * - Xlate: translate packet's addr/port (NAT).
713 	 * - Firewall: deny/allow/etc.
714 	 * - Wrap: fake packet's addr/port <unimpl.>
715 	 * - Encapsulate: put it in another IP and send out. <unimp.>
716 	 */
717 #ifdef PFIL_HOOKS
718 	/*
719 	 * Run through list of hooks for output packets.
720 	 */
721 	error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT);
722 	if (error != 0 || m == NULL)
723 		goto done;
724 	ip = mtod(m, struct ip *);
725 #endif /* PFIL_HOOKS */
726 
727 	/*
728 	 * Check with the firewall...
729 	 * but not if we are already being fwd'd from a firewall.
730 	 */
731 	if (fw_enable && IPFW_LOADED && !args.next_hop) {
732 		struct sockaddr_in *old = dst;
733 
734 		args.m = m;
735 		args.next_hop = dst;
736 		args.oif = ifp;
737 		off = ip_fw_chk_ptr(&args);
738 		m = args.m;
739 		dst = args.next_hop;
740 
741                 /*
742 		 * On return we must do the following:
743 		 * m == NULL	-> drop the pkt (old interface, deprecated)
744 		 * (off & IP_FW_PORT_DENY_FLAG)	-> drop the pkt (new interface)
745 		 * 1<=off<= 0xffff		-> DIVERT
746 		 * (off & IP_FW_PORT_DYNT_FLAG)	-> send to a DUMMYNET pipe
747 		 * (off & IP_FW_PORT_TEE_FLAG)	-> TEE the packet
748 		 * dst != old			-> IPFIREWALL_FORWARD
749 		 * off==0, dst==old		-> accept
750 		 * If some of the above modules are not compiled in, then
751 		 * we shouldn't have to check the corresponding condition
752 		 * (because the ipfw control socket should not accept
753 		 * unsupported rules), but better play safe and drop
754 		 * packets in case of doubt.
755 		 */
756 		if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) {
757 			if (m)
758 				m_freem(m);
759 			error = EACCES;
760 			goto done;
761 		}
762 		ip = mtod(m, struct ip *);
763 		if (off == 0 && dst == old)		/* common case */
764 			goto pass;
765                 if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) {
766 			/*
767 			 * pass the pkt to dummynet. Need to include
768 			 * pipe number, m, ifp, ro, dst because these are
769 			 * not recomputed in the next pass.
770 			 * All other parameters have been already used and
771 			 * so they are not needed anymore.
772 			 * XXX note: if the ifp or ro entry are deleted
773 			 * while a pkt is in dummynet, we are in trouble!
774 			 */
775 			args.ro = ro;
776 			args.dst = dst;
777 			args.flags = flags;
778 
779 			error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT,
780 				&args);
781 			goto done;
782 		}
783 #ifdef IPDIVERT
784 		if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) {
785 			struct mbuf *clone;
786 
787 			/* Clone packet if we're doing a 'tee' */
788 			if ((off & IP_FW_PORT_TEE_FLAG) != 0)
789 				clone = divert_clone(m);
790 			else
791 				clone = NULL;
792 
793 			/*
794 			 * XXX
795 			 * delayed checksums are not currently compatible
796 			 * with divert sockets.
797 			 */
798 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
799 				in_delayed_cksum(m);
800 				m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
801 			}
802 
803 			/* Restore packet header fields to original values */
804 			ip->ip_len = htons(ip->ip_len);
805 			ip->ip_off = htons(ip->ip_off);
806 
807 			/* Deliver packet to divert input routine */
808 			divert_packet(m, 0);
809 
810 			/* If 'tee', continue with original packet */
811 			if (clone != NULL) {
812 				m = clone;
813 				ip = mtod(m, struct ip *);
814 				goto pass;
815 			}
816 			goto done;
817 		}
818 #endif
819 
820 		/* IPFIREWALL_FORWARD */
821 		/*
822 		 * Check dst to make sure it is directly reachable on the
823 		 * interface we previously thought it was.
824 		 * If it isn't (which may be likely in some situations) we have
825 		 * to re-route it (ie, find a route for the next-hop and the
826 		 * associated interface) and set them here. This is nested
827 		 * forwarding which in most cases is undesirable, except where
828 		 * such control is nigh impossible. So we do it here.
829 		 * And I'm babbling.
830 		 */
831 		if (off == 0 && old != dst) { /* FORWARD, dst has changed */
832 #if 0
833 			/*
834 			 * XXX To improve readability, this block should be
835 			 * changed into a function call as below:
836 			 */
837 			error = ip_ipforward(&m, &dst, &ifp);
838 			if (error)
839 				goto bad;
840 			if (m == NULL) /* ip_input consumed the mbuf */
841 				goto done;
842 #else
843 			struct in_ifaddr *ia;
844 
845 			/*
846 			 * XXX sro_fwd below is static, and a pointer
847 			 * to it gets passed to routines downstream.
848 			 * This could have surprisingly bad results in
849 			 * practice, because its content is overwritten
850 			 * by subsequent packets.
851 			 */
852 			/* There must be a better way to do this next line... */
853 			static struct route sro_fwd;
854 			struct route *ro_fwd = &sro_fwd;
855 
856 #if 0
857 			print_ip("IPFIREWALL_FORWARD: New dst ip: ",
858 			    dst->sin_addr, "\n");
859 #endif
860 
861 			/*
862 			 * We need to figure out if we have been forwarded
863 			 * to a local socket. If so, then we should somehow
864 			 * "loop back" to ip_input, and get directed to the
865 			 * PCB as if we had received this packet. This is
866 			 * because it may be difficult to identify the packets
867 			 * you want to forward until they are being output
868 			 * and have selected an interface. (e.g. locally
869 			 * initiated packets) If we used the loopback interface,
870 			 * we would not be able to control what happens
871 			 * as the packet runs through ip_input() as
872 			 * it is done through an ISR.
873 			 */
874 			LIST_FOREACH(ia,
875 			    INADDR_HASH(dst->sin_addr.s_addr), ia_hash) {
876 				/*
877 				 * If the addr to forward to is one
878 				 * of ours, we pretend to
879 				 * be the destination for this packet.
880 				 */
881 				if (IA_SIN(ia)->sin_addr.s_addr ==
882 						 dst->sin_addr.s_addr)
883 					break;
884 			}
885 			if (ia) {	/* tell ip_input "dont filter" */
886 				mtag = m_tag_get(
887 				    PACKET_TAG_IPFORWARD,
888 				    sizeof(struct sockaddr_in *), M_NOWAIT);
889 				if (mtag == NULL) {
890 					error = ENOBUFS;
891 					goto bad;
892 				}
893 				*(struct sockaddr_in **)(mtag+1) =
894 				    args.next_hop;
895 				m_tag_prepend(m, mtag);
896 
897 				if (m->m_pkthdr.rcvif == NULL)
898 					m->m_pkthdr.rcvif = ifunit("lo0");
899 				if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
900 					m->m_pkthdr.csum_flags |=
901 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
902 					m->m_pkthdr.csum_data = 0xffff;
903 				}
904 				m->m_pkthdr.csum_flags |=
905 				    CSUM_IP_CHECKED | CSUM_IP_VALID;
906 				ip->ip_len = htons(ip->ip_len);
907 				ip->ip_off = htons(ip->ip_off);
908 				ip_input(m);
909 				goto done;
910 			}
911 			/*
912 			 * Some of the logic for this was
913 			 * nicked from above.
914 			 */
915 			bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
916 
917 			ro_fwd->ro_rt = 0;
918 			rtalloc_ign(ro_fwd, RTF_CLONING);
919 
920 			if (ro_fwd->ro_rt == 0) {
921 				ipstat.ips_noroute++;
922 				error = EHOSTUNREACH;
923 				goto bad;
924 			}
925 
926 			ia = ifatoia(ro_fwd->ro_rt->rt_ifa);
927 			ifp = ro_fwd->ro_rt->rt_ifp;
928 			ro_fwd->ro_rt->rt_rmx.rmx_pksent++;
929 			if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
930 				dst = (struct sockaddr_in *)
931 					ro_fwd->ro_rt->rt_gateway;
932 			if (ro_fwd->ro_rt->rt_flags & RTF_HOST)
933 				isbroadcast =
934 				    (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
935 			else
936 				isbroadcast = in_broadcast(dst->sin_addr, ifp);
937 			if (ro->ro_rt)
938 				RTFREE(ro->ro_rt);
939 			ro->ro_rt = ro_fwd->ro_rt;
940 			dst = (struct sockaddr_in *)&ro_fwd->ro_dst;
941 
942 #endif	/* ... block to be put into a function */
943 			/*
944 			 * If we added a default src ip earlier,
945 			 * which would have been gotten from the-then
946 			 * interface, do it again, from the new one.
947 			 */
948 			if (src_was_INADDR_ANY)
949 				ip->ip_src = IA_SIN(ia)->sin_addr;
950 			goto pass ;
951 		}
952 
953                 /*
954                  * if we get here, none of the above matches, and
955                  * we have to drop the pkt
956                  */
957 		m_freem(m);
958                 error = EACCES; /* not sure this is the right error msg */
959                 goto done;
960 	}
961 
962 pass:
963 	/* 127/8 must not appear on wire - RFC1122. */
964 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
965 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
966 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
967 			ipstat.ips_badaddr++;
968 			error = EADDRNOTAVAIL;
969 			goto bad;
970 		}
971 	}
972 
973 	m->m_pkthdr.csum_flags |= CSUM_IP;
974 	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
975 	if (sw_csum & CSUM_DELAY_DATA) {
976 		in_delayed_cksum(m);
977 		sw_csum &= ~CSUM_DELAY_DATA;
978 	}
979 	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
980 
981 	/*
982 	 * If small enough for interface, or the interface will take
983 	 * care of the fragmentation for us, can just send directly.
984 	 */
985 	if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT &&
986 	    ((ip->ip_off & IP_DF) == 0))) {
987 		ip->ip_len = htons(ip->ip_len);
988 		ip->ip_off = htons(ip->ip_off);
989 		ip->ip_sum = 0;
990 		if (sw_csum & CSUM_DELAY_IP)
991 			ip->ip_sum = in_cksum(m, hlen);
992 
993 		/* Record statistics for this interface address. */
994 		if (!(flags & IP_FORWARDING) && ia) {
995 			ia->ia_ifa.if_opackets++;
996 			ia->ia_ifa.if_obytes += m->m_pkthdr.len;
997 		}
998 
999 #ifdef IPSEC
1000 		/* clean ipsec history once it goes out of the node */
1001 		ipsec_delaux(m);
1002 #endif
1003 
1004 #ifdef MBUF_STRESS_TEST
1005 		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
1006 			m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
1007 #endif
1008 		error = (*ifp->if_output)(ifp, m,
1009 				(struct sockaddr *)dst, ro->ro_rt);
1010 		goto done;
1011 	}
1012 
1013 	if (ip->ip_off & IP_DF) {
1014 		error = EMSGSIZE;
1015 		/*
1016 		 * This case can happen if the user changed the MTU
1017 		 * of an interface after enabling IP on it.  Because
1018 		 * most netifs don't keep track of routes pointing to
1019 		 * them, there is no way for one to update all its
1020 		 * routes when the MTU is changed.
1021 		 */
1022 		if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
1023 		    (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
1024 			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
1025 		}
1026 		ipstat.ips_cantfrag++;
1027 		goto bad;
1028 	}
1029 
1030 	/*
1031 	 * Too large for interface; fragment if possible. If successful,
1032 	 * on return, m will point to a list of packets to be sent.
1033 	 */
1034 	error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum);
1035 	if (error)
1036 		goto bad;
1037 	for (; m; m = m0) {
1038 		m0 = m->m_nextpkt;
1039 		m->m_nextpkt = 0;
1040 #ifdef IPSEC
1041 		/* clean ipsec history once it goes out of the node */
1042 		ipsec_delaux(m);
1043 #endif
1044 		if (error == 0) {
1045 			/* Record statistics for this interface address. */
1046 			if (ia != NULL) {
1047 				ia->ia_ifa.if_opackets++;
1048 				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1049 			}
1050 
1051 			error = (*ifp->if_output)(ifp, m,
1052 			    (struct sockaddr *)dst, ro->ro_rt);
1053 		} else
1054 			m_freem(m);
1055 	}
1056 
1057 	if (error == 0)
1058 		ipstat.ips_fragmented++;
1059 
1060 done:
1061 	if (ro == &iproute && ro->ro_rt) {
1062 		RTFREE(ro->ro_rt);
1063 		ro->ro_rt = NULL;
1064 	}
1065 	if (dummytag) {
1066 		struct dn_pkt_tag *dt = (struct dn_pkt_tag *)(dummytag+1);
1067 		if (dt->ro.ro_rt)
1068 			RTFREE(dt->ro.ro_rt);
1069 		m_tag_free(dummytag);
1070 	}
1071 #ifdef IPSEC
1072 	if (sp != NULL) {
1073 		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
1074 			printf("DP ip_output call free SP:%p\n", sp));
1075 		key_freesp(sp);
1076 	}
1077 #endif
1078 #ifdef FAST_IPSEC
1079 	if (sp != NULL)
1080 		KEY_FREESP(&sp);
1081 #endif
1082 	return (error);
1083 bad:
1084 	m_freem(m);
1085 	goto done;
1086 }
1087 
1088 /*
1089  * Create a chain of fragments which fit the given mtu. m_frag points to the
1090  * mbuf to be fragmented; on return it points to the chain with the fragments.
1091  * Return 0 if no error. If error, m_frag may contain a partially built
1092  * chain of fragments that should be freed by the caller.
1093  *
1094  * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
1095  * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
1096  */
1097 int
1098 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
1099 	    u_long if_hwassist_flags, int sw_csum)
1100 {
1101 	int error = 0;
1102 	int hlen = ip->ip_hl << 2;
1103 	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
1104 	int off;
1105 	struct mbuf *m0 = *m_frag;	/* the original packet		*/
1106 	int firstlen;
1107 	struct mbuf **mnext;
1108 	int nfrags;
1109 
1110 	if (ip->ip_off & IP_DF) {	/* Fragmentation not allowed */
1111 		ipstat.ips_cantfrag++;
1112 		return EMSGSIZE;
1113 	}
1114 
1115 	/*
1116 	 * Must be able to put at least 8 bytes per fragment.
1117 	 */
1118 	if (len < 8)
1119 		return EMSGSIZE;
1120 
1121 	/*
1122 	 * If the interface will not calculate checksums on
1123 	 * fragmented packets, then do it here.
1124 	 */
1125 	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
1126 	    (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
1127 		in_delayed_cksum(m0);
1128 		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1129 	}
1130 
1131 	if (len > PAGE_SIZE) {
1132 		/*
1133 		 * Fragment large datagrams such that each segment
1134 		 * contains a multiple of PAGE_SIZE amount of data,
1135 		 * plus headers. This enables a receiver to perform
1136 		 * page-flipping zero-copy optimizations.
1137 		 *
1138 		 * XXX When does this help given that sender and receiver
1139 		 * could have different page sizes, and also mtu could
1140 		 * be less than the receiver's page size ?
1141 		 */
1142 		int newlen;
1143 		struct mbuf *m;
1144 
1145 		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
1146 			off += m->m_len;
1147 
1148 		/*
1149 		 * firstlen (off - hlen) must be aligned on an
1150 		 * 8-byte boundary
1151 		 */
1152 		if (off < hlen)
1153 			goto smart_frag_failure;
1154 		off = ((off - hlen) & ~7) + hlen;
1155 		newlen = (~PAGE_MASK) & mtu;
1156 		if ((newlen + sizeof (struct ip)) > mtu) {
1157 			/* we failed, go back the default */
1158 smart_frag_failure:
1159 			newlen = len;
1160 			off = hlen + len;
1161 		}
1162 		len = newlen;
1163 
1164 	} else {
1165 		off = hlen + len;
1166 	}
1167 
1168 	firstlen = off - hlen;
1169 	mnext = &m0->m_nextpkt;		/* pointer to next packet */
1170 
1171 	/*
1172 	 * Loop through length of segment after first fragment,
1173 	 * make new header and copy data of each part and link onto chain.
1174 	 * Here, m0 is the original packet, m is the fragment being created.
1175 	 * The fragments are linked off the m_nextpkt of the original
1176 	 * packet, which after processing serves as the first fragment.
1177 	 */
1178 	for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
1179 		struct ip *mhip;	/* ip header on the fragment */
1180 		struct mbuf *m;
1181 		int mhlen = sizeof (struct ip);
1182 
1183 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
1184 		if (m == 0) {
1185 			error = ENOBUFS;
1186 			ipstat.ips_odropped++;
1187 			goto done;
1188 		}
1189 		m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
1190 		/*
1191 		 * In the first mbuf, leave room for the link header, then
1192 		 * copy the original IP header including options. The payload
1193 		 * goes into an additional mbuf chain returned by m_copy().
1194 		 */
1195 		m->m_data += max_linkhdr;
1196 		mhip = mtod(m, struct ip *);
1197 		*mhip = *ip;
1198 		if (hlen > sizeof (struct ip)) {
1199 			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
1200 			mhip->ip_v = IPVERSION;
1201 			mhip->ip_hl = mhlen >> 2;
1202 		}
1203 		m->m_len = mhlen;
1204 		/* XXX do we need to add ip->ip_off below ? */
1205 		mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
1206 		if (off + len >= ip->ip_len) {	/* last fragment */
1207 			len = ip->ip_len - off;
1208 			m->m_flags |= M_LASTFRAG;
1209 		} else
1210 			mhip->ip_off |= IP_MF;
1211 		mhip->ip_len = htons((u_short)(len + mhlen));
1212 		m->m_next = m_copy(m0, off, len);
1213 		if (m->m_next == 0) {		/* copy failed */
1214 			m_free(m);
1215 			error = ENOBUFS;	/* ??? */
1216 			ipstat.ips_odropped++;
1217 			goto done;
1218 		}
1219 		m->m_pkthdr.len = mhlen + len;
1220 		m->m_pkthdr.rcvif = (struct ifnet *)0;
1221 #ifdef MAC
1222 		mac_create_fragment(m0, m);
1223 #endif
1224 		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
1225 		mhip->ip_off = htons(mhip->ip_off);
1226 		mhip->ip_sum = 0;
1227 		if (sw_csum & CSUM_DELAY_IP)
1228 			mhip->ip_sum = in_cksum(m, mhlen);
1229 		*mnext = m;
1230 		mnext = &m->m_nextpkt;
1231 	}
1232 	ipstat.ips_ofragments += nfrags;
1233 
1234 	/* set first marker for fragment chain */
1235 	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
1236 	m0->m_pkthdr.csum_data = nfrags;
1237 
1238 	/*
1239 	 * Update first fragment by trimming what's been copied out
1240 	 * and updating header.
1241 	 */
1242 	m_adj(m0, hlen + firstlen - ip->ip_len);
1243 	m0->m_pkthdr.len = hlen + firstlen;
1244 	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
1245 	ip->ip_off |= IP_MF;
1246 	ip->ip_off = htons(ip->ip_off);
1247 	ip->ip_sum = 0;
1248 	if (sw_csum & CSUM_DELAY_IP)
1249 		ip->ip_sum = in_cksum(m0, hlen);
1250 
1251 done:
1252 	*m_frag = m0;
1253 	return error;
1254 }
1255 
1256 void
1257 in_delayed_cksum(struct mbuf *m)
1258 {
1259 	struct ip *ip;
1260 	u_short csum, offset;
1261 
1262 	ip = mtod(m, struct ip *);
1263 	offset = ip->ip_hl << 2 ;
1264 	csum = in_cksum_skip(m, ip->ip_len, offset);
1265 	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
1266 		csum = 0xffff;
1267 	offset += m->m_pkthdr.csum_data;	/* checksum offset */
1268 
1269 	if (offset + sizeof(u_short) > m->m_len) {
1270 		printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
1271 		    m->m_len, offset, ip->ip_p);
1272 		/*
1273 		 * XXX
1274 		 * this shouldn't happen, but if it does, the
1275 		 * correct behavior may be to insert the checksum
1276 		 * in the existing chain instead of rearranging it.
1277 		 */
1278 		m = m_pullup(m, offset + sizeof(u_short));
1279 	}
1280 	*(u_short *)(m->m_data + offset) = csum;
1281 }
1282 
1283 /*
1284  * Insert IP options into preformed packet.
1285  * Adjust IP destination as required for IP source routing,
1286  * as indicated by a non-zero in_addr at the start of the options.
1287  *
1288  * XXX This routine assumes that the packet has no options in place.
1289  */
1290 static struct mbuf *
1291 ip_insertoptions(m, opt, phlen)
1292 	register struct mbuf *m;
1293 	struct mbuf *opt;
1294 	int *phlen;
1295 {
1296 	register struct ipoption *p = mtod(opt, struct ipoption *);
1297 	struct mbuf *n;
1298 	register struct ip *ip = mtod(m, struct ip *);
1299 	unsigned optlen;
1300 
1301 	optlen = opt->m_len - sizeof(p->ipopt_dst);
1302 	if (optlen + ip->ip_len > IP_MAXPACKET) {
1303 		*phlen = 0;
1304 		return (m);		/* XXX should fail */
1305 	}
1306 	if (p->ipopt_dst.s_addr)
1307 		ip->ip_dst = p->ipopt_dst;
1308 	if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
1309 		MGETHDR(n, M_DONTWAIT, MT_HEADER);
1310 		if (n == 0) {
1311 			*phlen = 0;
1312 			return (m);
1313 		}
1314 		n->m_pkthdr.rcvif = (struct ifnet *)0;
1315 #ifdef MAC
1316 		mac_create_mbuf_from_mbuf(m, n);
1317 #endif
1318 		n->m_pkthdr.len = m->m_pkthdr.len + optlen;
1319 		m->m_len -= sizeof(struct ip);
1320 		m->m_data += sizeof(struct ip);
1321 		n->m_next = m;
1322 		m = n;
1323 		m->m_len = optlen + sizeof(struct ip);
1324 		m->m_data += max_linkhdr;
1325 		bcopy(ip, mtod(m, void *), sizeof(struct ip));
1326 	} else {
1327 		m->m_data -= optlen;
1328 		m->m_len += optlen;
1329 		m->m_pkthdr.len += optlen;
1330 		bcopy(ip, mtod(m, void *), sizeof(struct ip));
1331 	}
1332 	ip = mtod(m, struct ip *);
1333 	bcopy(p->ipopt_list, ip + 1, optlen);
1334 	*phlen = sizeof(struct ip) + optlen;
1335 	ip->ip_v = IPVERSION;
1336 	ip->ip_hl = *phlen >> 2;
1337 	ip->ip_len += optlen;
1338 	return (m);
1339 }
1340 
1341 /*
1342  * Copy options from ip to jp,
1343  * omitting those not copied during fragmentation.
1344  */
1345 int
1346 ip_optcopy(ip, jp)
1347 	struct ip *ip, *jp;
1348 {
1349 	register u_char *cp, *dp;
1350 	int opt, optlen, cnt;
1351 
1352 	cp = (u_char *)(ip + 1);
1353 	dp = (u_char *)(jp + 1);
1354 	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
1355 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1356 		opt = cp[0];
1357 		if (opt == IPOPT_EOL)
1358 			break;
1359 		if (opt == IPOPT_NOP) {
1360 			/* Preserve for IP mcast tunnel's LSRR alignment. */
1361 			*dp++ = IPOPT_NOP;
1362 			optlen = 1;
1363 			continue;
1364 		}
1365 
1366 		KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp),
1367 		    ("ip_optcopy: malformed ipv4 option"));
1368 		optlen = cp[IPOPT_OLEN];
1369 		KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt,
1370 		    ("ip_optcopy: malformed ipv4 option"));
1371 
1372 		/* bogus lengths should have been caught by ip_dooptions */
1373 		if (optlen > cnt)
1374 			optlen = cnt;
1375 		if (IPOPT_COPIED(opt)) {
1376 			bcopy(cp, dp, optlen);
1377 			dp += optlen;
1378 		}
1379 	}
1380 	for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
1381 		*dp++ = IPOPT_EOL;
1382 	return (optlen);
1383 }
1384 
1385 /*
1386  * IP socket option processing.
1387  */
1388 int
1389 ip_ctloutput(so, sopt)
1390 	struct socket *so;
1391 	struct sockopt *sopt;
1392 {
1393 	struct	inpcb *inp = sotoinpcb(so);
1394 	int	error, optval;
1395 
1396 	error = optval = 0;
1397 	if (sopt->sopt_level != IPPROTO_IP) {
1398 		return (EINVAL);
1399 	}
1400 
1401 	switch (sopt->sopt_dir) {
1402 	case SOPT_SET:
1403 		switch (sopt->sopt_name) {
1404 		case IP_OPTIONS:
1405 #ifdef notyet
1406 		case IP_RETOPTS:
1407 #endif
1408 		{
1409 			struct mbuf *m;
1410 			if (sopt->sopt_valsize > MLEN) {
1411 				error = EMSGSIZE;
1412 				break;
1413 			}
1414 			MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_HEADER);
1415 			if (m == 0) {
1416 				error = ENOBUFS;
1417 				break;
1418 			}
1419 			m->m_len = sopt->sopt_valsize;
1420 			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
1421 					    m->m_len);
1422 
1423 			return (ip_pcbopts(sopt->sopt_name, &inp->inp_options,
1424 					   m));
1425 		}
1426 
1427 		case IP_TOS:
1428 		case IP_TTL:
1429 		case IP_RECVOPTS:
1430 		case IP_RECVRETOPTS:
1431 		case IP_RECVDSTADDR:
1432 		case IP_RECVTTL:
1433 		case IP_RECVIF:
1434 		case IP_FAITH:
1435 		case IP_ONESBCAST:
1436 			error = sooptcopyin(sopt, &optval, sizeof optval,
1437 					    sizeof optval);
1438 			if (error)
1439 				break;
1440 
1441 			switch (sopt->sopt_name) {
1442 			case IP_TOS:
1443 				inp->inp_ip_tos = optval;
1444 				break;
1445 
1446 			case IP_TTL:
1447 				inp->inp_ip_ttl = optval;
1448 				break;
1449 #define	OPTSET(bit) \
1450 	if (optval) \
1451 		inp->inp_flags |= bit; \
1452 	else \
1453 		inp->inp_flags &= ~bit;
1454 
1455 			case IP_RECVOPTS:
1456 				OPTSET(INP_RECVOPTS);
1457 				break;
1458 
1459 			case IP_RECVRETOPTS:
1460 				OPTSET(INP_RECVRETOPTS);
1461 				break;
1462 
1463 			case IP_RECVDSTADDR:
1464 				OPTSET(INP_RECVDSTADDR);
1465 				break;
1466 
1467 			case IP_RECVTTL:
1468 				OPTSET(INP_RECVTTL);
1469 				break;
1470 
1471 			case IP_RECVIF:
1472 				OPTSET(INP_RECVIF);
1473 				break;
1474 
1475 			case IP_FAITH:
1476 				OPTSET(INP_FAITH);
1477 				break;
1478 
1479 			case IP_ONESBCAST:
1480 				OPTSET(INP_ONESBCAST);
1481 				break;
1482 			}
1483 			break;
1484 #undef OPTSET
1485 
1486 		case IP_MULTICAST_IF:
1487 		case IP_MULTICAST_VIF:
1488 		case IP_MULTICAST_TTL:
1489 		case IP_MULTICAST_LOOP:
1490 		case IP_ADD_MEMBERSHIP:
1491 		case IP_DROP_MEMBERSHIP:
1492 			error = ip_setmoptions(sopt, &inp->inp_moptions);
1493 			break;
1494 
1495 		case IP_PORTRANGE:
1496 			error = sooptcopyin(sopt, &optval, sizeof optval,
1497 					    sizeof optval);
1498 			if (error)
1499 				break;
1500 
1501 			switch (optval) {
1502 			case IP_PORTRANGE_DEFAULT:
1503 				inp->inp_flags &= ~(INP_LOWPORT);
1504 				inp->inp_flags &= ~(INP_HIGHPORT);
1505 				break;
1506 
1507 			case IP_PORTRANGE_HIGH:
1508 				inp->inp_flags &= ~(INP_LOWPORT);
1509 				inp->inp_flags |= INP_HIGHPORT;
1510 				break;
1511 
1512 			case IP_PORTRANGE_LOW:
1513 				inp->inp_flags &= ~(INP_HIGHPORT);
1514 				inp->inp_flags |= INP_LOWPORT;
1515 				break;
1516 
1517 			default:
1518 				error = EINVAL;
1519 				break;
1520 			}
1521 			break;
1522 
1523 #if defined(IPSEC) || defined(FAST_IPSEC)
1524 		case IP_IPSEC_POLICY:
1525 		{
1526 			caddr_t req;
1527 			size_t len = 0;
1528 			int priv;
1529 			struct mbuf *m;
1530 			int optname;
1531 
1532 			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
1533 				break;
1534 			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
1535 				break;
1536 			priv = (sopt->sopt_td != NULL &&
1537 				suser(sopt->sopt_td) != 0) ? 0 : 1;
1538 			req = mtod(m, caddr_t);
1539 			len = m->m_len;
1540 			optname = sopt->sopt_name;
1541 			error = ipsec4_set_policy(inp, optname, req, len, priv);
1542 			m_freem(m);
1543 			break;
1544 		}
1545 #endif /*IPSEC*/
1546 
1547 		default:
1548 			error = ENOPROTOOPT;
1549 			break;
1550 		}
1551 		break;
1552 
1553 	case SOPT_GET:
1554 		switch (sopt->sopt_name) {
1555 		case IP_OPTIONS:
1556 		case IP_RETOPTS:
1557 			if (inp->inp_options)
1558 				error = sooptcopyout(sopt,
1559 						     mtod(inp->inp_options,
1560 							  char *),
1561 						     inp->inp_options->m_len);
1562 			else
1563 				sopt->sopt_valsize = 0;
1564 			break;
1565 
1566 		case IP_TOS:
1567 		case IP_TTL:
1568 		case IP_RECVOPTS:
1569 		case IP_RECVRETOPTS:
1570 		case IP_RECVDSTADDR:
1571 		case IP_RECVTTL:
1572 		case IP_RECVIF:
1573 		case IP_PORTRANGE:
1574 		case IP_FAITH:
1575 		case IP_ONESBCAST:
1576 			switch (sopt->sopt_name) {
1577 
1578 			case IP_TOS:
1579 				optval = inp->inp_ip_tos;
1580 				break;
1581 
1582 			case IP_TTL:
1583 				optval = inp->inp_ip_ttl;
1584 				break;
1585 
1586 #define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1587 
1588 			case IP_RECVOPTS:
1589 				optval = OPTBIT(INP_RECVOPTS);
1590 				break;
1591 
1592 			case IP_RECVRETOPTS:
1593 				optval = OPTBIT(INP_RECVRETOPTS);
1594 				break;
1595 
1596 			case IP_RECVDSTADDR:
1597 				optval = OPTBIT(INP_RECVDSTADDR);
1598 				break;
1599 
1600 			case IP_RECVTTL:
1601 				optval = OPTBIT(INP_RECVTTL);
1602 				break;
1603 
1604 			case IP_RECVIF:
1605 				optval = OPTBIT(INP_RECVIF);
1606 				break;
1607 
1608 			case IP_PORTRANGE:
1609 				if (inp->inp_flags & INP_HIGHPORT)
1610 					optval = IP_PORTRANGE_HIGH;
1611 				else if (inp->inp_flags & INP_LOWPORT)
1612 					optval = IP_PORTRANGE_LOW;
1613 				else
1614 					optval = 0;
1615 				break;
1616 
1617 			case IP_FAITH:
1618 				optval = OPTBIT(INP_FAITH);
1619 				break;
1620 
1621 			case IP_ONESBCAST:
1622 				optval = OPTBIT(INP_ONESBCAST);
1623 				break;
1624 			}
1625 			error = sooptcopyout(sopt, &optval, sizeof optval);
1626 			break;
1627 
1628 		case IP_MULTICAST_IF:
1629 		case IP_MULTICAST_VIF:
1630 		case IP_MULTICAST_TTL:
1631 		case IP_MULTICAST_LOOP:
1632 		case IP_ADD_MEMBERSHIP:
1633 		case IP_DROP_MEMBERSHIP:
1634 			error = ip_getmoptions(sopt, inp->inp_moptions);
1635 			break;
1636 
1637 #if defined(IPSEC) || defined(FAST_IPSEC)
1638 		case IP_IPSEC_POLICY:
1639 		{
1640 			struct mbuf *m = NULL;
1641 			caddr_t req = NULL;
1642 			size_t len = 0;
1643 
1644 			if (m != 0) {
1645 				req = mtod(m, caddr_t);
1646 				len = m->m_len;
1647 			}
1648 			error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
1649 			if (error == 0)
1650 				error = soopt_mcopyout(sopt, m); /* XXX */
1651 			if (error == 0)
1652 				m_freem(m);
1653 			break;
1654 		}
1655 #endif /*IPSEC*/
1656 
1657 		default:
1658 			error = ENOPROTOOPT;
1659 			break;
1660 		}
1661 		break;
1662 	}
1663 	return (error);
1664 }
1665 
1666 /*
1667  * Set up IP options in pcb for insertion in output packets.
1668  * Store in mbuf with pointer in pcbopt, adding pseudo-option
1669  * with destination address if source routed.
1670  */
1671 static int
1672 ip_pcbopts(optname, pcbopt, m)
1673 	int optname;
1674 	struct mbuf **pcbopt;
1675 	register struct mbuf *m;
1676 {
1677 	register int cnt, optlen;
1678 	register u_char *cp;
1679 	u_char opt;
1680 
1681 	/* turn off any old options */
1682 	if (*pcbopt)
1683 		(void)m_free(*pcbopt);
1684 	*pcbopt = 0;
1685 	if (m == (struct mbuf *)0 || m->m_len == 0) {
1686 		/*
1687 		 * Only turning off any previous options.
1688 		 */
1689 		if (m)
1690 			(void)m_free(m);
1691 		return (0);
1692 	}
1693 
1694 	if (m->m_len % sizeof(int32_t))
1695 		goto bad;
1696 	/*
1697 	 * IP first-hop destination address will be stored before
1698 	 * actual options; move other options back
1699 	 * and clear it when none present.
1700 	 */
1701 	if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
1702 		goto bad;
1703 	cnt = m->m_len;
1704 	m->m_len += sizeof(struct in_addr);
1705 	cp = mtod(m, u_char *) + sizeof(struct in_addr);
1706 	bcopy(mtod(m, void *), cp, (unsigned)cnt);
1707 	bzero(mtod(m, void *), sizeof(struct in_addr));
1708 
1709 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1710 		opt = cp[IPOPT_OPTVAL];
1711 		if (opt == IPOPT_EOL)
1712 			break;
1713 		if (opt == IPOPT_NOP)
1714 			optlen = 1;
1715 		else {
1716 			if (cnt < IPOPT_OLEN + sizeof(*cp))
1717 				goto bad;
1718 			optlen = cp[IPOPT_OLEN];
1719 			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
1720 				goto bad;
1721 		}
1722 		switch (opt) {
1723 
1724 		default:
1725 			break;
1726 
1727 		case IPOPT_LSRR:
1728 		case IPOPT_SSRR:
1729 			/*
1730 			 * user process specifies route as:
1731 			 *	->A->B->C->D
1732 			 * D must be our final destination (but we can't
1733 			 * check that since we may not have connected yet).
1734 			 * A is first hop destination, which doesn't appear in
1735 			 * actual IP option, but is stored before the options.
1736 			 */
1737 			if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
1738 				goto bad;
1739 			m->m_len -= sizeof(struct in_addr);
1740 			cnt -= sizeof(struct in_addr);
1741 			optlen -= sizeof(struct in_addr);
1742 			cp[IPOPT_OLEN] = optlen;
1743 			/*
1744 			 * Move first hop before start of options.
1745 			 */
1746 			bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
1747 			    sizeof(struct in_addr));
1748 			/*
1749 			 * Then copy rest of options back
1750 			 * to close up the deleted entry.
1751 			 */
1752 			bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)),
1753 			    &cp[IPOPT_OFFSET+1],
1754 			    (unsigned)cnt + sizeof(struct in_addr));
1755 			break;
1756 		}
1757 	}
1758 	if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
1759 		goto bad;
1760 	*pcbopt = m;
1761 	return (0);
1762 
1763 bad:
1764 	(void)m_free(m);
1765 	return (EINVAL);
1766 }
1767 
1768 /*
1769  * XXX
1770  * The whole multicast option thing needs to be re-thought.
1771  * Several of these options are equally applicable to non-multicast
1772  * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
1773  * standard option (IP_TTL).
1774  */
1775 
1776 /*
1777  * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1778  */
1779 static struct ifnet *
1780 ip_multicast_if(a, ifindexp)
1781 	struct in_addr *a;
1782 	int *ifindexp;
1783 {
1784 	int ifindex;
1785 	struct ifnet *ifp;
1786 
1787 	if (ifindexp)
1788 		*ifindexp = 0;
1789 	if (ntohl(a->s_addr) >> 24 == 0) {
1790 		ifindex = ntohl(a->s_addr) & 0xffffff;
1791 		if (ifindex < 0 || if_index < ifindex)
1792 			return NULL;
1793 		ifp = ifnet_byindex(ifindex);
1794 		if (ifindexp)
1795 			*ifindexp = ifindex;
1796 	} else {
1797 		INADDR_TO_IFP(*a, ifp);
1798 	}
1799 	return ifp;
1800 }
1801 
1802 /*
1803  * Set the IP multicast options in response to user setsockopt().
1804  */
1805 static int
1806 ip_setmoptions(sopt, imop)
1807 	struct sockopt *sopt;
1808 	struct ip_moptions **imop;
1809 {
1810 	int error = 0;
1811 	int i;
1812 	struct in_addr addr;
1813 	struct ip_mreq mreq;
1814 	struct ifnet *ifp;
1815 	struct ip_moptions *imo = *imop;
1816 	struct route ro;
1817 	struct sockaddr_in *dst;
1818 	int ifindex;
1819 	int s;
1820 
1821 	if (imo == NULL) {
1822 		/*
1823 		 * No multicast option buffer attached to the pcb;
1824 		 * allocate one and initialize to default values.
1825 		 */
1826 		imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS,
1827 		    M_WAITOK);
1828 
1829 		if (imo == NULL)
1830 			return (ENOBUFS);
1831 		*imop = imo;
1832 		imo->imo_multicast_ifp = NULL;
1833 		imo->imo_multicast_addr.s_addr = INADDR_ANY;
1834 		imo->imo_multicast_vif = -1;
1835 		imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1836 		imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1837 		imo->imo_num_memberships = 0;
1838 	}
1839 
1840 	switch (sopt->sopt_name) {
1841 	/* store an index number for the vif you wanna use in the send */
1842 	case IP_MULTICAST_VIF:
1843 		if (legal_vif_num == 0) {
1844 			error = EOPNOTSUPP;
1845 			break;
1846 		}
1847 		error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
1848 		if (error)
1849 			break;
1850 		if (!legal_vif_num(i) && (i != -1)) {
1851 			error = EINVAL;
1852 			break;
1853 		}
1854 		imo->imo_multicast_vif = i;
1855 		break;
1856 
1857 	case IP_MULTICAST_IF:
1858 		/*
1859 		 * Select the interface for outgoing multicast packets.
1860 		 */
1861 		error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
1862 		if (error)
1863 			break;
1864 		/*
1865 		 * INADDR_ANY is used to remove a previous selection.
1866 		 * When no interface is selected, a default one is
1867 		 * chosen every time a multicast packet is sent.
1868 		 */
1869 		if (addr.s_addr == INADDR_ANY) {
1870 			imo->imo_multicast_ifp = NULL;
1871 			break;
1872 		}
1873 		/*
1874 		 * The selected interface is identified by its local
1875 		 * IP address.  Find the interface and confirm that
1876 		 * it supports multicasting.
1877 		 */
1878 		s = splimp();
1879 		ifp = ip_multicast_if(&addr, &ifindex);
1880 		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1881 			splx(s);
1882 			error = EADDRNOTAVAIL;
1883 			break;
1884 		}
1885 		imo->imo_multicast_ifp = ifp;
1886 		if (ifindex)
1887 			imo->imo_multicast_addr = addr;
1888 		else
1889 			imo->imo_multicast_addr.s_addr = INADDR_ANY;
1890 		splx(s);
1891 		break;
1892 
1893 	case IP_MULTICAST_TTL:
1894 		/*
1895 		 * Set the IP time-to-live for outgoing multicast packets.
1896 		 * The original multicast API required a char argument,
1897 		 * which is inconsistent with the rest of the socket API.
1898 		 * We allow either a char or an int.
1899 		 */
1900 		if (sopt->sopt_valsize == 1) {
1901 			u_char ttl;
1902 			error = sooptcopyin(sopt, &ttl, 1, 1);
1903 			if (error)
1904 				break;
1905 			imo->imo_multicast_ttl = ttl;
1906 		} else {
1907 			u_int ttl;
1908 			error = sooptcopyin(sopt, &ttl, sizeof ttl,
1909 					    sizeof ttl);
1910 			if (error)
1911 				break;
1912 			if (ttl > 255)
1913 				error = EINVAL;
1914 			else
1915 				imo->imo_multicast_ttl = ttl;
1916 		}
1917 		break;
1918 
1919 	case IP_MULTICAST_LOOP:
1920 		/*
1921 		 * Set the loopback flag for outgoing multicast packets.
1922 		 * Must be zero or one.  The original multicast API required a
1923 		 * char argument, which is inconsistent with the rest
1924 		 * of the socket API.  We allow either a char or an int.
1925 		 */
1926 		if (sopt->sopt_valsize == 1) {
1927 			u_char loop;
1928 			error = sooptcopyin(sopt, &loop, 1, 1);
1929 			if (error)
1930 				break;
1931 			imo->imo_multicast_loop = !!loop;
1932 		} else {
1933 			u_int loop;
1934 			error = sooptcopyin(sopt, &loop, sizeof loop,
1935 					    sizeof loop);
1936 			if (error)
1937 				break;
1938 			imo->imo_multicast_loop = !!loop;
1939 		}
1940 		break;
1941 
1942 	case IP_ADD_MEMBERSHIP:
1943 		/*
1944 		 * Add a multicast group membership.
1945 		 * Group must be a valid IP multicast address.
1946 		 */
1947 		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1948 		if (error)
1949 			break;
1950 
1951 		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1952 			error = EINVAL;
1953 			break;
1954 		}
1955 		s = splimp();
1956 		/*
1957 		 * If no interface address was provided, use the interface of
1958 		 * the route to the given multicast address.
1959 		 */
1960 		if (mreq.imr_interface.s_addr == INADDR_ANY) {
1961 			bzero((caddr_t)&ro, sizeof(ro));
1962 			dst = (struct sockaddr_in *)&ro.ro_dst;
1963 			dst->sin_len = sizeof(*dst);
1964 			dst->sin_family = AF_INET;
1965 			dst->sin_addr = mreq.imr_multiaddr;
1966 			rtalloc_ign(&ro, RTF_CLONING);
1967 			if (ro.ro_rt == NULL) {
1968 				error = EADDRNOTAVAIL;
1969 				splx(s);
1970 				break;
1971 			}
1972 			ifp = ro.ro_rt->rt_ifp;
1973 			RTFREE(ro.ro_rt);
1974 		}
1975 		else {
1976 			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1977 		}
1978 
1979 		/*
1980 		 * See if we found an interface, and confirm that it
1981 		 * supports multicast.
1982 		 */
1983 		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1984 			error = EADDRNOTAVAIL;
1985 			splx(s);
1986 			break;
1987 		}
1988 		/*
1989 		 * See if the membership already exists or if all the
1990 		 * membership slots are full.
1991 		 */
1992 		for (i = 0; i < imo->imo_num_memberships; ++i) {
1993 			if (imo->imo_membership[i]->inm_ifp == ifp &&
1994 			    imo->imo_membership[i]->inm_addr.s_addr
1995 						== mreq.imr_multiaddr.s_addr)
1996 				break;
1997 		}
1998 		if (i < imo->imo_num_memberships) {
1999 			error = EADDRINUSE;
2000 			splx(s);
2001 			break;
2002 		}
2003 		if (i == IP_MAX_MEMBERSHIPS) {
2004 			error = ETOOMANYREFS;
2005 			splx(s);
2006 			break;
2007 		}
2008 		/*
2009 		 * Everything looks good; add a new record to the multicast
2010 		 * address list for the given interface.
2011 		 */
2012 		if ((imo->imo_membership[i] =
2013 		    in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) {
2014 			error = ENOBUFS;
2015 			splx(s);
2016 			break;
2017 		}
2018 		++imo->imo_num_memberships;
2019 		splx(s);
2020 		break;
2021 
2022 	case IP_DROP_MEMBERSHIP:
2023 		/*
2024 		 * Drop a multicast group membership.
2025 		 * Group must be a valid IP multicast address.
2026 		 */
2027 		error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
2028 		if (error)
2029 			break;
2030 
2031 		if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
2032 			error = EINVAL;
2033 			break;
2034 		}
2035 
2036 		s = splimp();
2037 		/*
2038 		 * If an interface address was specified, get a pointer
2039 		 * to its ifnet structure.
2040 		 */
2041 		if (mreq.imr_interface.s_addr == INADDR_ANY)
2042 			ifp = NULL;
2043 		else {
2044 			ifp = ip_multicast_if(&mreq.imr_interface, NULL);
2045 			if (ifp == NULL) {
2046 				error = EADDRNOTAVAIL;
2047 				splx(s);
2048 				break;
2049 			}
2050 		}
2051 		/*
2052 		 * Find the membership in the membership array.
2053 		 */
2054 		for (i = 0; i < imo->imo_num_memberships; ++i) {
2055 			if ((ifp == NULL ||
2056 			     imo->imo_membership[i]->inm_ifp == ifp) &&
2057 			     imo->imo_membership[i]->inm_addr.s_addr ==
2058 			     mreq.imr_multiaddr.s_addr)
2059 				break;
2060 		}
2061 		if (i == imo->imo_num_memberships) {
2062 			error = EADDRNOTAVAIL;
2063 			splx(s);
2064 			break;
2065 		}
2066 		/*
2067 		 * Give up the multicast address record to which the
2068 		 * membership points.
2069 		 */
2070 		in_delmulti(imo->imo_membership[i]);
2071 		/*
2072 		 * Remove the gap in the membership array.
2073 		 */
2074 		for (++i; i < imo->imo_num_memberships; ++i)
2075 			imo->imo_membership[i-1] = imo->imo_membership[i];
2076 		--imo->imo_num_memberships;
2077 		splx(s);
2078 		break;
2079 
2080 	default:
2081 		error = EOPNOTSUPP;
2082 		break;
2083 	}
2084 
2085 	/*
2086 	 * If all options have default values, no need to keep the mbuf.
2087 	 */
2088 	if (imo->imo_multicast_ifp == NULL &&
2089 	    imo->imo_multicast_vif == -1 &&
2090 	    imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
2091 	    imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
2092 	    imo->imo_num_memberships == 0) {
2093 		free(*imop, M_IPMOPTS);
2094 		*imop = NULL;
2095 	}
2096 
2097 	return (error);
2098 }
2099 
2100 /*
2101  * Return the IP multicast options in response to user getsockopt().
2102  */
2103 static int
2104 ip_getmoptions(sopt, imo)
2105 	struct sockopt *sopt;
2106 	register struct ip_moptions *imo;
2107 {
2108 	struct in_addr addr;
2109 	struct in_ifaddr *ia;
2110 	int error, optval;
2111 	u_char coptval;
2112 
2113 	error = 0;
2114 	switch (sopt->sopt_name) {
2115 	case IP_MULTICAST_VIF:
2116 		if (imo != NULL)
2117 			optval = imo->imo_multicast_vif;
2118 		else
2119 			optval = -1;
2120 		error = sooptcopyout(sopt, &optval, sizeof optval);
2121 		break;
2122 
2123 	case IP_MULTICAST_IF:
2124 		if (imo == NULL || imo->imo_multicast_ifp == NULL)
2125 			addr.s_addr = INADDR_ANY;
2126 		else if (imo->imo_multicast_addr.s_addr) {
2127 			/* return the value user has set */
2128 			addr = imo->imo_multicast_addr;
2129 		} else {
2130 			IFP_TO_IA(imo->imo_multicast_ifp, ia);
2131 			addr.s_addr = (ia == NULL) ? INADDR_ANY
2132 				: IA_SIN(ia)->sin_addr.s_addr;
2133 		}
2134 		error = sooptcopyout(sopt, &addr, sizeof addr);
2135 		break;
2136 
2137 	case IP_MULTICAST_TTL:
2138 		if (imo == 0)
2139 			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
2140 		else
2141 			optval = coptval = imo->imo_multicast_ttl;
2142 		if (sopt->sopt_valsize == 1)
2143 			error = sooptcopyout(sopt, &coptval, 1);
2144 		else
2145 			error = sooptcopyout(sopt, &optval, sizeof optval);
2146 		break;
2147 
2148 	case IP_MULTICAST_LOOP:
2149 		if (imo == 0)
2150 			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
2151 		else
2152 			optval = coptval = imo->imo_multicast_loop;
2153 		if (sopt->sopt_valsize == 1)
2154 			error = sooptcopyout(sopt, &coptval, 1);
2155 		else
2156 			error = sooptcopyout(sopt, &optval, sizeof optval);
2157 		break;
2158 
2159 	default:
2160 		error = ENOPROTOOPT;
2161 		break;
2162 	}
2163 	return (error);
2164 }
2165 
2166 /*
2167  * Discard the IP multicast options.
2168  */
2169 void
2170 ip_freemoptions(imo)
2171 	register struct ip_moptions *imo;
2172 {
2173 	register int i;
2174 
2175 	if (imo != NULL) {
2176 		for (i = 0; i < imo->imo_num_memberships; ++i)
2177 			in_delmulti(imo->imo_membership[i]);
2178 		free(imo, M_IPMOPTS);
2179 	}
2180 }
2181 
2182 /*
2183  * Routine called from ip_output() to loop back a copy of an IP multicast
2184  * packet to the input queue of a specified interface.  Note that this
2185  * calls the output routine of the loopback "driver", but with an interface
2186  * pointer that might NOT be a loopback interface -- evil, but easier than
2187  * replicating that code here.
2188  */
2189 static void
2190 ip_mloopback(ifp, m, dst, hlen)
2191 	struct ifnet *ifp;
2192 	register struct mbuf *m;
2193 	register struct sockaddr_in *dst;
2194 	int hlen;
2195 {
2196 	register struct ip *ip;
2197 	struct mbuf *copym;
2198 
2199 	copym = m_copy(m, 0, M_COPYALL);
2200 	if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
2201 		copym = m_pullup(copym, hlen);
2202 	if (copym != NULL) {
2203 		/*
2204 		 * We don't bother to fragment if the IP length is greater
2205 		 * than the interface's MTU.  Can this possibly matter?
2206 		 */
2207 		ip = mtod(copym, struct ip *);
2208 		ip->ip_len = htons(ip->ip_len);
2209 		ip->ip_off = htons(ip->ip_off);
2210 		ip->ip_sum = 0;
2211 		ip->ip_sum = in_cksum(copym, hlen);
2212 		/*
2213 		 * NB:
2214 		 * It's not clear whether there are any lingering
2215 		 * reentrancy problems in other areas which might
2216 		 * be exposed by using ip_input directly (in
2217 		 * particular, everything which modifies the packet
2218 		 * in-place).  Yet another option is using the
2219 		 * protosw directly to deliver the looped back
2220 		 * packet.  For the moment, we'll err on the side
2221 		 * of safety by using if_simloop().
2222 		 */
2223 #if 1 /* XXX */
2224 		if (dst->sin_family != AF_INET) {
2225 			printf("ip_mloopback: bad address family %d\n",
2226 						dst->sin_family);
2227 			dst->sin_family = AF_INET;
2228 		}
2229 #endif
2230 
2231 #ifdef notdef
2232 		copym->m_pkthdr.rcvif = ifp;
2233 		ip_input(copym);
2234 #else
2235 		/* if the checksum hasn't been computed, mark it as valid */
2236 		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
2237 			copym->m_pkthdr.csum_flags |=
2238 			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
2239 			copym->m_pkthdr.csum_data = 0xffff;
2240 		}
2241 		if_simloop(ifp, copym, dst->sin_family, 0);
2242 #endif
2243 	}
2244 }
2245