xref: /freebsd/sys/netinet/in_pcb.c (revision 7773002178c8dbc52b44e4d705f07706409af8e4)
1 /*
2  * Copyright (c) 1982, 1986, 1991, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
34  * $FreeBSD$
35  */
36 
37 #include "opt_ipsec.h"
38 #include "opt_inet6.h"
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/limits.h>
43 #include <sys/malloc.h>
44 #include <sys/mbuf.h>
45 #include <sys/domain.h>
46 #include <sys/protosw.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/proc.h>
50 #include <sys/jail.h>
51 #include <sys/kernel.h>
52 #include <sys/sysctl.h>
53 
54 #include <vm/uma.h>
55 
56 #include <net/if.h>
57 #include <net/if_types.h>
58 #include <net/route.h>
59 
60 #include <netinet/in.h>
61 #include <netinet/in_pcb.h>
62 #include <netinet/in_var.h>
63 #include <netinet/ip_var.h>
64 #include <netinet/tcp_var.h>
65 #ifdef INET6
66 #include <netinet/ip6.h>
67 #include <netinet6/ip6_var.h>
68 #endif /* INET6 */
69 
70 #ifdef IPSEC
71 #include <netinet6/ipsec.h>
72 #include <netkey/key.h>
73 #endif /* IPSEC */
74 
75 #ifdef FAST_IPSEC
76 #if defined(IPSEC) || defined(IPSEC_ESP)
77 #error "Bad idea: don't compile with both IPSEC and FAST_IPSEC!"
78 #endif
79 
80 #include <netipsec/ipsec.h>
81 #include <netipsec/key.h>
82 #endif /* FAST_IPSEC */
83 
84 struct	in_addr zeroin_addr;
85 
86 /*
87  * These configure the range of local port addresses assigned to
88  * "unspecified" outgoing connections/packets/whatever.
89  */
90 int	ipport_lowfirstauto  = IPPORT_RESERVED - 1;	/* 1023 */
91 int	ipport_lowlastauto = IPPORT_RESERVEDSTART;	/* 600 */
92 int	ipport_firstauto = IPPORT_HIFIRSTAUTO;		/* 49152 */
93 int	ipport_lastauto  = IPPORT_HILASTAUTO;		/* 65535 */
94 int	ipport_hifirstauto = IPPORT_HIFIRSTAUTO;	/* 49152 */
95 int	ipport_hilastauto  = IPPORT_HILASTAUTO;		/* 65535 */
96 
97 /*
98  * Reserved ports accessible only to root. There are significant
99  * security considerations that must be accounted for when changing these,
100  * but the security benefits can be great. Please be careful.
101  */
102 int	ipport_reservedhigh = IPPORT_RESERVED - 1;	/* 1023 */
103 int	ipport_reservedlow = 0;
104 
105 #define RANGECHK(var, min, max) \
106 	if ((var) < (min)) { (var) = (min); } \
107 	else if ((var) > (max)) { (var) = (max); }
108 
109 static int
110 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
111 {
112 	int error = sysctl_handle_int(oidp,
113 		oidp->oid_arg1, oidp->oid_arg2, req);
114 	if (!error) {
115 		RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
116 		RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
117 		RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX);
118 		RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX);
119 		RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX);
120 		RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX);
121 	}
122 	return error;
123 }
124 
125 #undef RANGECHK
126 
127 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
128 
129 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW,
130 	   &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
131 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW,
132 	   &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
133 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW,
134 	   &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
135 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW,
136 	   &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
137 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW,
138 	   &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
139 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW,
140 	   &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
141 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
142 	   CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedhigh, 0, "");
143 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
144 	   CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedlow, 0, "");
145 
146 /*
147  * in_pcb.c: manage the Protocol Control Blocks.
148  *
149  * NOTE: It is assumed that most of these functions will be called at
150  * splnet(). XXX - There are, unfortunately, a few exceptions to this
151  * rule that should be fixed.
152  */
153 
154 /*
155  * Allocate a PCB and associate it with the socket.
156  */
157 int
158 in_pcballoc(so, pcbinfo, td)
159 	struct socket *so;
160 	struct inpcbinfo *pcbinfo;
161 	struct thread *td;
162 {
163 	register struct inpcb *inp;
164 #if defined(IPSEC) || defined(FAST_IPSEC)
165 	int error;
166 #endif
167 	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT | M_ZERO);
168 	if (inp == NULL)
169 		return (ENOBUFS);
170 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
171 	inp->inp_pcbinfo = pcbinfo;
172 	inp->inp_socket = so;
173 #if defined(IPSEC) || defined(FAST_IPSEC)
174 #ifdef FAST_IPSEC
175 	error = ipsec_init_policy(so, &inp->inp_sp);
176 #else
177 	error = ipsec_init_pcbpolicy(so, &inp->inp_sp);
178 #endif
179 	if (error != 0) {
180 		uma_zfree(pcbinfo->ipi_zone, inp);
181 		return error;
182 	}
183 #endif /*IPSEC*/
184 #if defined(INET6)
185 	if (INP_SOCKAF(so) == AF_INET6) {
186 		inp->inp_vflag |= INP_IPV6PROTO;
187 		if (ip6_v6only)
188 			inp->inp_flags |= IN6P_IPV6_V6ONLY;
189 	}
190 #endif
191 	LIST_INSERT_HEAD(pcbinfo->listhead, inp, inp_list);
192 	pcbinfo->ipi_count++;
193 	so->so_pcb = (caddr_t)inp;
194 	INP_LOCK_INIT(inp, "inp");
195 #ifdef INET6
196 	if (ip6_auto_flowlabel)
197 		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
198 #endif
199 	return (0);
200 }
201 
202 int
203 in_pcbbind(inp, nam, td)
204 	register struct inpcb *inp;
205 	struct sockaddr *nam;
206 	struct thread *td;
207 {
208 	int anonport, error;
209 
210 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
211 		return (EINVAL);
212 	anonport = inp->inp_lport == 0 && (nam == NULL ||
213 	    ((struct sockaddr_in *)nam)->sin_port == 0);
214 	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
215 	    &inp->inp_lport, td);
216 	if (error)
217 		return (error);
218 	if (in_pcbinshash(inp) != 0) {
219 		inp->inp_laddr.s_addr = INADDR_ANY;
220 		inp->inp_lport = 0;
221 		return (EAGAIN);
222 	}
223 	if (anonport)
224 		inp->inp_flags |= INP_ANONPORT;
225 	return (0);
226 }
227 
228 /*
229  * Set up a bind operation on a PCB, performing port allocation
230  * as required, but do not actually modify the PCB. Callers can
231  * either complete the bind by setting inp_laddr/inp_lport and
232  * calling in_pcbinshash(), or they can just use the resulting
233  * port and address to authorise the sending of a once-off packet.
234  *
235  * On error, the values of *laddrp and *lportp are not changed.
236  */
237 int
238 in_pcbbind_setup(inp, nam, laddrp, lportp, td)
239 	struct inpcb *inp;
240 	struct sockaddr *nam;
241 	in_addr_t *laddrp;
242 	u_short *lportp;
243 	struct thread *td;
244 {
245 	struct socket *so = inp->inp_socket;
246 	unsigned short *lastport;
247 	struct sockaddr_in *sin;
248 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
249 	struct in_addr laddr;
250 	u_short lport = 0;
251 	int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
252 	int error, prison = 0;
253 
254 	if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */
255 		return (EADDRNOTAVAIL);
256 	laddr.s_addr = *laddrp;
257 	if (nam != NULL && laddr.s_addr != INADDR_ANY)
258 		return (EINVAL);
259 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
260 		wild = 1;
261 	if (nam) {
262 		sin = (struct sockaddr_in *)nam;
263 		if (nam->sa_len != sizeof (*sin))
264 			return (EINVAL);
265 #ifdef notdef
266 		/*
267 		 * We should check the family, but old programs
268 		 * incorrectly fail to initialize it.
269 		 */
270 		if (sin->sin_family != AF_INET)
271 			return (EAFNOSUPPORT);
272 #endif
273 		if (sin->sin_addr.s_addr != INADDR_ANY)
274 			if (prison_ip(td->td_ucred, 0, &sin->sin_addr.s_addr))
275 				return(EINVAL);
276 		if (sin->sin_port != *lportp) {
277 			/* Don't allow the port to change. */
278 			if (*lportp != 0)
279 				return (EINVAL);
280 			lport = sin->sin_port;
281 		}
282 		/* NB: lport is left as 0 if the port isn't being changed. */
283 		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
284 			/*
285 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
286 			 * allow complete duplication of binding if
287 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
288 			 * and a multicast address is bound on both
289 			 * new and duplicated sockets.
290 			 */
291 			if (so->so_options & SO_REUSEADDR)
292 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
293 		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
294 			sin->sin_port = 0;		/* yech... */
295 			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
296 			if (ifa_ifwithaddr((struct sockaddr *)sin) == 0)
297 				return (EADDRNOTAVAIL);
298 		}
299 		laddr = sin->sin_addr;
300 		if (lport) {
301 			struct inpcb *t;
302 			/* GROSS */
303 			if (ntohs(lport) <= ipport_reservedhigh &&
304 			    ntohs(lport) >= ipport_reservedlow &&
305 			    td && suser_cred(td->td_ucred, PRISON_ROOT))
306 				return (EACCES);
307 			if (td && jailed(td->td_ucred))
308 				prison = 1;
309 			if (so->so_cred->cr_uid != 0 &&
310 			    !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
311 				t = in_pcblookup_local(inp->inp_pcbinfo,
312 				    sin->sin_addr, lport,
313 				    prison ? 0 :  INPLOOKUP_WILDCARD);
314 	/*
315 	 * XXX
316 	 * This entire block sorely needs a rewrite.
317 	 */
318 				if (t && (t->inp_vflag & INP_TIMEWAIT)) {
319 					if ((ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
320 					    ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
321 					    (intotw(t)->tw_so_options & SO_REUSEPORT) == 0) &&
322 					    (so->so_cred->cr_uid != intotw(t)->tw_cred->cr_uid))
323 						return (EADDRINUSE);
324 				} else
325 				if (t &&
326 				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
327 				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
328 				     (t->inp_socket->so_options &
329 					 SO_REUSEPORT) == 0) &&
330 				    (so->so_cred->cr_uid !=
331 				     t->inp_socket->so_cred->cr_uid)) {
332 #if defined(INET6)
333 					if (ntohl(sin->sin_addr.s_addr) !=
334 					    INADDR_ANY ||
335 					    ntohl(t->inp_laddr.s_addr) !=
336 					    INADDR_ANY ||
337 					    INP_SOCKAF(so) ==
338 					    INP_SOCKAF(t->inp_socket))
339 #endif /* defined(INET6) */
340 					return (EADDRINUSE);
341 				}
342 			}
343 			if (prison &&
344 			    prison_ip(td->td_ucred, 0, &sin->sin_addr.s_addr))
345 				return (EADDRNOTAVAIL);
346 			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
347 			    lport, prison ? 0 : wild);
348 			if (t && (t->inp_vflag & INP_TIMEWAIT)) {
349 				if ((reuseport & intotw(t)->tw_so_options) == 0)
350 					return (EADDRINUSE);
351 			} else
352 			if (t &&
353 			    (reuseport & t->inp_socket->so_options) == 0) {
354 #if defined(INET6)
355 				if (ntohl(sin->sin_addr.s_addr) !=
356 				    INADDR_ANY ||
357 				    ntohl(t->inp_laddr.s_addr) !=
358 				    INADDR_ANY ||
359 				    INP_SOCKAF(so) ==
360 				    INP_SOCKAF(t->inp_socket))
361 #endif /* defined(INET6) */
362 				return (EADDRINUSE);
363 			}
364 		}
365 	}
366 	if (*lportp != 0)
367 		lport = *lportp;
368 	if (lport == 0) {
369 		u_short first, last;
370 		int count;
371 
372 		if (laddr.s_addr != INADDR_ANY)
373 			if (prison_ip(td->td_ucred, 0, &laddr.s_addr))
374 				return (EINVAL);
375 
376 		if (inp->inp_flags & INP_HIGHPORT) {
377 			first = ipport_hifirstauto;	/* sysctl */
378 			last  = ipport_hilastauto;
379 			lastport = &pcbinfo->lasthi;
380 		} else if (inp->inp_flags & INP_LOWPORT) {
381 			if (td && (error = suser_cred(td->td_ucred,
382 			    PRISON_ROOT)) != 0)
383 				return error;
384 			first = ipport_lowfirstauto;	/* 1023 */
385 			last  = ipport_lowlastauto;	/* 600 */
386 			lastport = &pcbinfo->lastlow;
387 		} else {
388 			first = ipport_firstauto;	/* sysctl */
389 			last  = ipport_lastauto;
390 			lastport = &pcbinfo->lastport;
391 		}
392 		/*
393 		 * Simple check to ensure all ports are not used up causing
394 		 * a deadlock here.
395 		 *
396 		 * We split the two cases (up and down) so that the direction
397 		 * is not being tested on each round of the loop.
398 		 */
399 		if (first > last) {
400 			/*
401 			 * counting down
402 			 */
403 			count = first - last;
404 
405 			do {
406 				if (count-- < 0)	/* completely used? */
407 					return (EADDRNOTAVAIL);
408 				--*lastport;
409 				if (*lastport > first || *lastport < last)
410 					*lastport = first;
411 				lport = htons(*lastport);
412 			} while (in_pcblookup_local(pcbinfo, laddr, lport,
413 			    wild));
414 		} else {
415 			/*
416 			 * counting up
417 			 */
418 			count = last - first;
419 
420 			do {
421 				if (count-- < 0)	/* completely used? */
422 					return (EADDRNOTAVAIL);
423 				++*lastport;
424 				if (*lastport < first || *lastport > last)
425 					*lastport = first;
426 				lport = htons(*lastport);
427 			} while (in_pcblookup_local(pcbinfo, laddr, lport,
428 			    wild));
429 		}
430 	}
431 	if (prison_ip(td->td_ucred, 0, &laddr.s_addr))
432 		return (EINVAL);
433 	*laddrp = laddr.s_addr;
434 	*lportp = lport;
435 	return (0);
436 }
437 
438 /*
439  * Connect from a socket to a specified address.
440  * Both address and port must be specified in argument sin.
441  * If don't have a local address for this socket yet,
442  * then pick one.
443  */
444 int
445 in_pcbconnect(inp, nam, td)
446 	register struct inpcb *inp;
447 	struct sockaddr *nam;
448 	struct thread *td;
449 {
450 	u_short lport, fport;
451 	in_addr_t laddr, faddr;
452 	int anonport, error;
453 
454 	lport = inp->inp_lport;
455 	laddr = inp->inp_laddr.s_addr;
456 	anonport = (lport == 0);
457 	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
458 	    NULL, td);
459 	if (error)
460 		return (error);
461 
462 	/* Do the initial binding of the local address if required. */
463 	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
464 		inp->inp_lport = lport;
465 		inp->inp_laddr.s_addr = laddr;
466 		if (in_pcbinshash(inp) != 0) {
467 			inp->inp_laddr.s_addr = INADDR_ANY;
468 			inp->inp_lport = 0;
469 			return (EAGAIN);
470 		}
471 	}
472 
473 	/* Commit the remaining changes. */
474 	inp->inp_lport = lport;
475 	inp->inp_laddr.s_addr = laddr;
476 	inp->inp_faddr.s_addr = faddr;
477 	inp->inp_fport = fport;
478 	in_pcbrehash(inp);
479 #ifdef IPSEC
480 	if (inp->inp_socket->so_type == SOCK_STREAM)
481 		ipsec_pcbconn(inp->inp_sp);
482 #endif
483 	if (anonport)
484 		inp->inp_flags |= INP_ANONPORT;
485 	return (0);
486 }
487 
488 /*
489  * Set up for a connect from a socket to the specified address.
490  * On entry, *laddrp and *lportp should contain the current local
491  * address and port for the PCB; these are updated to the values
492  * that should be placed in inp_laddr and inp_lport to complete
493  * the connect.
494  *
495  * On success, *faddrp and *fportp will be set to the remote address
496  * and port. These are not updated in the error case.
497  *
498  * If the operation fails because the connection already exists,
499  * *oinpp will be set to the PCB of that connection so that the
500  * caller can decide to override it. In all other cases, *oinpp
501  * is set to NULL.
502  */
503 int
504 in_pcbconnect_setup(inp, nam, laddrp, lportp, faddrp, fportp, oinpp, td)
505 	register struct inpcb *inp;
506 	struct sockaddr *nam;
507 	in_addr_t *laddrp;
508 	u_short *lportp;
509 	in_addr_t *faddrp;
510 	u_short *fportp;
511 	struct inpcb **oinpp;
512 	struct thread *td;
513 {
514 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
515 	struct in_ifaddr *ia;
516 	struct sockaddr_in sa;
517 	struct ucred *cred;
518 	struct inpcb *oinp;
519 	struct in_addr laddr, faddr;
520 	u_short lport, fport;
521 	int error;
522 
523 	if (oinpp != NULL)
524 		*oinpp = NULL;
525 	if (nam->sa_len != sizeof (*sin))
526 		return (EINVAL);
527 	if (sin->sin_family != AF_INET)
528 		return (EAFNOSUPPORT);
529 	if (sin->sin_port == 0)
530 		return (EADDRNOTAVAIL);
531 	laddr.s_addr = *laddrp;
532 	lport = *lportp;
533 	faddr = sin->sin_addr;
534 	fport = sin->sin_port;
535 	cred = inp->inp_socket->so_cred;
536 	if (laddr.s_addr == INADDR_ANY && jailed(cred)) {
537 		bzero(&sa, sizeof(sa));
538 		sa.sin_addr.s_addr = htonl(prison_getip(cred));
539 		sa.sin_len = sizeof(sa);
540 		sa.sin_family = AF_INET;
541 		error = in_pcbbind_setup(inp, (struct sockaddr *)&sa,
542 		    &laddr.s_addr, &lport, td);
543 		if (error)
544 			return (error);
545 	}
546 
547 	if (!TAILQ_EMPTY(&in_ifaddrhead)) {
548 		/*
549 		 * If the destination address is INADDR_ANY,
550 		 * use the primary local address.
551 		 * If the supplied address is INADDR_BROADCAST,
552 		 * and the primary interface supports broadcast,
553 		 * choose the broadcast address for that interface.
554 		 */
555 		if (faddr.s_addr == INADDR_ANY)
556 			faddr = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr;
557 		else if (faddr.s_addr == (u_long)INADDR_BROADCAST &&
558 		    (TAILQ_FIRST(&in_ifaddrhead)->ia_ifp->if_flags &
559 		    IFF_BROADCAST))
560 			faddr = satosin(&TAILQ_FIRST(
561 			    &in_ifaddrhead)->ia_broadaddr)->sin_addr;
562 	}
563 	if (laddr.s_addr == INADDR_ANY) {
564 		register struct route *ro;
565 
566 		ia = (struct in_ifaddr *)0;
567 		/*
568 		 * If route is known or can be allocated now,
569 		 * our src addr is taken from the i/f, else punt.
570 		 * Note that we should check the address family of the cached
571 		 * destination, in case of sharing the cache with IPv6.
572 		 */
573 		ro = &inp->inp_route;
574 		if (ro->ro_rt &&
575 		    (ro->ro_dst.sa_family != AF_INET ||
576 		     satosin(&ro->ro_dst)->sin_addr.s_addr != faddr.s_addr ||
577 		     inp->inp_socket->so_options & SO_DONTROUTE)) {
578 			RTFREE(ro->ro_rt);
579 			ro->ro_rt = (struct rtentry *)0;
580 		}
581 		if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/
582 		    (ro->ro_rt == (struct rtentry *)0 ||
583 		    ro->ro_rt->rt_ifp == (struct ifnet *)0)) {
584 			/* No route yet, so try to acquire one */
585 			bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
586 			ro->ro_dst.sa_family = AF_INET;
587 			ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
588 			((struct sockaddr_in *)&ro->ro_dst)->sin_addr = faddr;
589 			rtalloc(ro);
590 		}
591 		/*
592 		 * If we found a route, use the address
593 		 * corresponding to the outgoing interface
594 		 * unless it is the loopback (in case a route
595 		 * to our address on another net goes to loopback).
596 		 */
597 		if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))
598 			ia = ifatoia(ro->ro_rt->rt_ifa);
599 		if (ia == 0) {
600 			bzero(&sa, sizeof(sa));
601 			sa.sin_addr = faddr;
602 			sa.sin_len = sizeof(sa);
603 			sa.sin_family = AF_INET;
604 
605 			ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sa)));
606 			if (ia == 0)
607 				ia = ifatoia(ifa_ifwithnet(sintosa(&sa)));
608 			if (ia == 0)
609 				ia = TAILQ_FIRST(&in_ifaddrhead);
610 			if (ia == 0)
611 				return (EADDRNOTAVAIL);
612 		}
613 		/*
614 		 * If the destination address is multicast and an outgoing
615 		 * interface has been set as a multicast option, use the
616 		 * address of that interface as our source address.
617 		 */
618 		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
619 		    inp->inp_moptions != NULL) {
620 			struct ip_moptions *imo;
621 			struct ifnet *ifp;
622 
623 			imo = inp->inp_moptions;
624 			if (imo->imo_multicast_ifp != NULL) {
625 				ifp = imo->imo_multicast_ifp;
626 				TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link)
627 					if (ia->ia_ifp == ifp)
628 						break;
629 				if (ia == 0)
630 					return (EADDRNOTAVAIL);
631 			}
632 		}
633 		laddr = ia->ia_addr.sin_addr;
634 	}
635 
636 	oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport,
637 	    0, NULL);
638 	if (oinp != NULL) {
639 		if (oinpp != NULL)
640 			*oinpp = oinp;
641 		return (EADDRINUSE);
642 	}
643 	if (lport == 0) {
644 		error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport, td);
645 		if (error)
646 			return (error);
647 	}
648 	*laddrp = laddr.s_addr;
649 	*lportp = lport;
650 	*faddrp = faddr.s_addr;
651 	*fportp = fport;
652 	return (0);
653 }
654 
655 void
656 in_pcbdisconnect(inp)
657 	struct inpcb *inp;
658 {
659 
660 	inp->inp_faddr.s_addr = INADDR_ANY;
661 	inp->inp_fport = 0;
662 	in_pcbrehash(inp);
663 	if (inp->inp_socket->so_state & SS_NOFDREF)
664 		in_pcbdetach(inp);
665 #ifdef IPSEC
666 	ipsec_pcbdisconn(inp->inp_sp);
667 #endif
668 }
669 
670 void
671 in_pcbdetach(inp)
672 	struct inpcb *inp;
673 {
674 	struct socket *so = inp->inp_socket;
675 	struct inpcbinfo *ipi = inp->inp_pcbinfo;
676 
677 #if defined(IPSEC) || defined(FAST_IPSEC)
678 	ipsec4_delete_pcbpolicy(inp);
679 #endif /*IPSEC*/
680 	inp->inp_gencnt = ++ipi->ipi_gencnt;
681 	in_pcbremlists(inp);
682 	if (so) {
683 		so->so_pcb = 0;
684 		sotryfree(so);
685 	}
686 	if (inp->inp_options)
687 		(void)m_free(inp->inp_options);
688 	if (inp->inp_route.ro_rt)
689 		RTFREE(inp->inp_route.ro_rt);
690 	ip_freemoptions(inp->inp_moptions);
691 	inp->inp_vflag = 0;
692 	INP_LOCK_DESTROY(inp);
693 	uma_zfree(ipi->ipi_zone, inp);
694 }
695 
696 struct sockaddr *
697 in_sockaddr(port, addr_p)
698 	in_port_t port;
699 	struct in_addr *addr_p;
700 {
701 	struct sockaddr_in *sin;
702 
703 	MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
704 		M_WAITOK | M_ZERO);
705 	sin->sin_family = AF_INET;
706 	sin->sin_len = sizeof(*sin);
707 	sin->sin_addr = *addr_p;
708 	sin->sin_port = port;
709 
710 	return (struct sockaddr *)sin;
711 }
712 
713 /*
714  * The wrapper function will pass down the pcbinfo for this function to lock.
715  * The socket must have a valid
716  * (i.e., non-nil) PCB, but it should be impossible to get an invalid one
717  * except through a kernel programming error, so it is acceptable to panic
718  * (or in this case trap) if the PCB is invalid.  (Actually, we don't trap
719  * because there actually /is/ a programming error somewhere... XXX)
720  */
721 int
722 in_setsockaddr(so, nam, pcbinfo)
723 	struct socket *so;
724 	struct sockaddr **nam;
725 	struct inpcbinfo *pcbinfo;
726 {
727 	int s;
728 	register struct inpcb *inp;
729 	struct in_addr addr;
730 	in_port_t port;
731 
732 	s = splnet();
733 	INP_INFO_RLOCK(pcbinfo);
734 	inp = sotoinpcb(so);
735 	if (!inp) {
736 		INP_INFO_RUNLOCK(pcbinfo);
737 		splx(s);
738 		return ECONNRESET;
739 	}
740 	INP_LOCK(inp);
741 	port = inp->inp_lport;
742 	addr = inp->inp_laddr;
743 	INP_UNLOCK(inp);
744 	INP_INFO_RUNLOCK(pcbinfo);
745 	splx(s);
746 
747 	*nam = in_sockaddr(port, &addr);
748 	return 0;
749 }
750 
751 /*
752  * The wrapper function will pass down the pcbinfo for this function to lock.
753  */
754 int
755 in_setpeeraddr(so, nam, pcbinfo)
756 	struct socket *so;
757 	struct sockaddr **nam;
758 	struct inpcbinfo *pcbinfo;
759 {
760 	int s;
761 	register struct inpcb *inp;
762 	struct in_addr addr;
763 	in_port_t port;
764 
765 	s = splnet();
766 	INP_INFO_RLOCK(pcbinfo);
767 	inp = sotoinpcb(so);
768 	if (!inp) {
769 		INP_INFO_RUNLOCK(pcbinfo);
770 		splx(s);
771 		return ECONNRESET;
772 	}
773 	INP_LOCK(inp);
774 	port = inp->inp_fport;
775 	addr = inp->inp_faddr;
776 	INP_UNLOCK(inp);
777 	INP_INFO_RUNLOCK(pcbinfo);
778 	splx(s);
779 
780 	*nam = in_sockaddr(port, &addr);
781 	return 0;
782 }
783 
784 void
785 in_pcbnotifyall(pcbinfo, faddr, errno, notify)
786 	struct inpcbinfo *pcbinfo;
787 	struct in_addr faddr;
788 	int errno;
789 	struct inpcb *(*notify)(struct inpcb *, int);
790 {
791 	struct inpcb *inp, *ninp;
792 	struct inpcbhead *head;
793 	int s;
794 
795 	s = splnet();
796 	INP_INFO_WLOCK(pcbinfo);
797 	head = pcbinfo->listhead;
798 	for (inp = LIST_FIRST(head); inp != NULL; inp = ninp) {
799 		INP_LOCK(inp);
800 		ninp = LIST_NEXT(inp, inp_list);
801 #ifdef INET6
802 		if ((inp->inp_vflag & INP_IPV4) == 0) {
803 			INP_UNLOCK(inp);
804 			continue;
805 		}
806 #endif
807 		if (inp->inp_faddr.s_addr != faddr.s_addr ||
808 		    inp->inp_socket == NULL) {
809 			INP_UNLOCK(inp);
810 			continue;
811 		}
812 		if ((*notify)(inp, errno))
813 			INP_UNLOCK(inp);
814 	}
815 	INP_INFO_WUNLOCK(pcbinfo);
816 	splx(s);
817 }
818 
819 void
820 in_pcbpurgeif0(pcbinfo, ifp)
821 	struct inpcbinfo *pcbinfo;
822 	struct ifnet *ifp;
823 {
824 	struct inpcb *inp;
825 	struct ip_moptions *imo;
826 	int i, gap;
827 
828 	/* why no splnet here? XXX */
829 	INP_INFO_RLOCK(pcbinfo);
830 	LIST_FOREACH(inp, pcbinfo->listhead, inp_list) {
831 		INP_LOCK(inp);
832 		imo = inp->inp_moptions;
833 		if ((inp->inp_vflag & INP_IPV4) &&
834 		    imo != NULL) {
835 			/*
836 			 * Unselect the outgoing interface if it is being
837 			 * detached.
838 			 */
839 			if (imo->imo_multicast_ifp == ifp)
840 				imo->imo_multicast_ifp = NULL;
841 
842 			/*
843 			 * Drop multicast group membership if we joined
844 			 * through the interface being detached.
845 			 */
846 			for (i = 0, gap = 0; i < imo->imo_num_memberships;
847 			    i++) {
848 				if (imo->imo_membership[i]->inm_ifp == ifp) {
849 					in_delmulti(imo->imo_membership[i]);
850 					gap++;
851 				} else if (gap != 0)
852 					imo->imo_membership[i - gap] =
853 					    imo->imo_membership[i];
854 			}
855 			imo->imo_num_memberships -= gap;
856 		}
857 		INP_UNLOCK(inp);
858 	}
859 	INP_INFO_RUNLOCK(pcbinfo);
860 }
861 
862 /*
863  * Check for alternatives when higher level complains
864  * about service problems.  For now, invalidate cached
865  * routing information.  If the route was created dynamically
866  * (by a redirect), time to try a default gateway again.
867  */
868 void
869 in_losing(inp)
870 	struct inpcb *inp;
871 {
872 	register struct rtentry *rt;
873 	struct rt_addrinfo info;
874 
875 	if ((rt = inp->inp_route.ro_rt)) {
876 		RT_LOCK(rt);
877 		inp->inp_route.ro_rt = NULL;
878 		bzero((caddr_t)&info, sizeof(info));
879 		info.rti_flags = rt->rt_flags;
880 		info.rti_info[RTAX_DST] = rt_key(rt);
881 		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
882 		info.rti_info[RTAX_NETMASK] = rt_mask(rt);
883 		rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
884 		if (rt->rt_flags & RTF_DYNAMIC)
885 			rtexpunge(rt);
886 		RTFREE_LOCKED(rt);
887 		/*
888 		 * A new route can be allocated
889 		 * the next time output is attempted.
890 		 */
891 	}
892 }
893 
894 /*
895  * After a routing change, flush old routing
896  * and allocate a (hopefully) better one.
897  */
898 struct inpcb *
899 in_rtchange(inp, errno)
900 	register struct inpcb *inp;
901 	int errno;
902 {
903 	if (inp->inp_route.ro_rt) {
904 		RTFREE(inp->inp_route.ro_rt);
905 		inp->inp_route.ro_rt = 0;
906 		/*
907 		 * A new route can be allocated the next time
908 		 * output is attempted.
909 		 */
910 	}
911 	return inp;
912 }
913 
914 /*
915  * Lookup a PCB based on the local address and port.
916  */
917 struct inpcb *
918 in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay)
919 	struct inpcbinfo *pcbinfo;
920 	struct in_addr laddr;
921 	u_int lport_arg;
922 	int wild_okay;
923 {
924 	register struct inpcb *inp;
925 	int matchwild = 3, wildcard;
926 	u_short lport = lport_arg;
927 
928 	if (!wild_okay) {
929 		struct inpcbhead *head;
930 		/*
931 		 * Look for an unconnected (wildcard foreign addr) PCB that
932 		 * matches the local address and port we're looking for.
933 		 */
934 		head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
935 		LIST_FOREACH(inp, head, inp_hash) {
936 #ifdef INET6
937 			if ((inp->inp_vflag & INP_IPV4) == 0)
938 				continue;
939 #endif
940 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
941 			    inp->inp_laddr.s_addr == laddr.s_addr &&
942 			    inp->inp_lport == lport) {
943 				/*
944 				 * Found.
945 				 */
946 				return (inp);
947 			}
948 		}
949 		/*
950 		 * Not found.
951 		 */
952 		return (NULL);
953 	} else {
954 		struct inpcbporthead *porthash;
955 		struct inpcbport *phd;
956 		struct inpcb *match = NULL;
957 		/*
958 		 * Best fit PCB lookup.
959 		 *
960 		 * First see if this local port is in use by looking on the
961 		 * port hash list.
962 		 */
963 		retrylookup:
964 		porthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(lport,
965 		    pcbinfo->porthashmask)];
966 		LIST_FOREACH(phd, porthash, phd_hash) {
967 			if (phd->phd_port == lport)
968 				break;
969 		}
970 		if (phd != NULL) {
971 			/*
972 			 * Port is in use by one or more PCBs. Look for best
973 			 * fit.
974 			 */
975 			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
976 				wildcard = 0;
977 #ifdef INET6
978 				if ((inp->inp_vflag & INP_IPV4) == 0)
979 					continue;
980 #endif
981 				/*
982 				 * Clean out old time_wait sockets if they
983 				 * are clogging up needed local ports.
984 				 */
985 				if ((inp->inp_vflag & INP_TIMEWAIT) != 0) {
986 					if (tcp_twrecycleable((struct tcptw *)inp->inp_ppcb)) {
987 						tcp_twclose((struct tcptw *)inp->inp_ppcb, 0);
988 						match = NULL;
989 						goto retrylookup;
990 					}
991 				}
992 				if (inp->inp_faddr.s_addr != INADDR_ANY)
993 					wildcard++;
994 				if (inp->inp_laddr.s_addr != INADDR_ANY) {
995 					if (laddr.s_addr == INADDR_ANY)
996 						wildcard++;
997 					else if (inp->inp_laddr.s_addr != laddr.s_addr)
998 						continue;
999 				} else {
1000 					if (laddr.s_addr != INADDR_ANY)
1001 						wildcard++;
1002 				}
1003 				if (wildcard < matchwild) {
1004 					match = inp;
1005 					matchwild = wildcard;
1006 					if (matchwild == 0) {
1007 						break;
1008 					}
1009 				}
1010 			}
1011 		}
1012 		return (match);
1013 	}
1014 }
1015 
1016 /*
1017  * Lookup PCB in hash list.
1018  */
1019 struct inpcb *
1020 in_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard,
1021 		  ifp)
1022 	struct inpcbinfo *pcbinfo;
1023 	struct in_addr faddr, laddr;
1024 	u_int fport_arg, lport_arg;
1025 	int wildcard;
1026 	struct ifnet *ifp;
1027 {
1028 	struct inpcbhead *head;
1029 	register struct inpcb *inp;
1030 	u_short fport = fport_arg, lport = lport_arg;
1031 
1032 	/*
1033 	 * First look for an exact match.
1034 	 */
1035 	head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->hashmask)];
1036 	LIST_FOREACH(inp, head, inp_hash) {
1037 #ifdef INET6
1038 		if ((inp->inp_vflag & INP_IPV4) == 0)
1039 			continue;
1040 #endif
1041 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
1042 		    inp->inp_laddr.s_addr == laddr.s_addr &&
1043 		    inp->inp_fport == fport &&
1044 		    inp->inp_lport == lport) {
1045 			/*
1046 			 * Found.
1047 			 */
1048 			return (inp);
1049 		}
1050 	}
1051 	if (wildcard) {
1052 		struct inpcb *local_wild = NULL;
1053 #if defined(INET6)
1054 		struct inpcb *local_wild_mapped = NULL;
1055 #endif /* defined(INET6) */
1056 
1057 		head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
1058 		LIST_FOREACH(inp, head, inp_hash) {
1059 #ifdef INET6
1060 			if ((inp->inp_vflag & INP_IPV4) == 0)
1061 				continue;
1062 #endif
1063 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
1064 			    inp->inp_lport == lport) {
1065 				if (ifp && ifp->if_type == IFT_FAITH &&
1066 				    (inp->inp_flags & INP_FAITH) == 0)
1067 					continue;
1068 				if (inp->inp_laddr.s_addr == laddr.s_addr)
1069 					return (inp);
1070 				else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1071 #if defined(INET6)
1072 					if (INP_CHECK_SOCKAF(inp->inp_socket,
1073 							     AF_INET6))
1074 						local_wild_mapped = inp;
1075 					else
1076 #endif /* defined(INET6) */
1077 					local_wild = inp;
1078 				}
1079 			}
1080 		}
1081 #if defined(INET6)
1082 		if (local_wild == NULL)
1083 			return (local_wild_mapped);
1084 #endif /* defined(INET6) */
1085 		return (local_wild);
1086 	}
1087 
1088 	/*
1089 	 * Not found.
1090 	 */
1091 	return (NULL);
1092 }
1093 
1094 /*
1095  * Insert PCB onto various hash lists.
1096  */
1097 int
1098 in_pcbinshash(inp)
1099 	struct inpcb *inp;
1100 {
1101 	struct inpcbhead *pcbhash;
1102 	struct inpcbporthead *pcbporthash;
1103 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1104 	struct inpcbport *phd;
1105 	u_int32_t hashkey_faddr;
1106 
1107 #ifdef INET6
1108 	if (inp->inp_vflag & INP_IPV6)
1109 		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1110 	else
1111 #endif /* INET6 */
1112 	hashkey_faddr = inp->inp_faddr.s_addr;
1113 
1114 	pcbhash = &pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr,
1115 		 inp->inp_lport, inp->inp_fport, pcbinfo->hashmask)];
1116 
1117 	pcbporthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(inp->inp_lport,
1118 	    pcbinfo->porthashmask)];
1119 
1120 	/*
1121 	 * Go through port list and look for a head for this lport.
1122 	 */
1123 	LIST_FOREACH(phd, pcbporthash, phd_hash) {
1124 		if (phd->phd_port == inp->inp_lport)
1125 			break;
1126 	}
1127 	/*
1128 	 * If none exists, malloc one and tack it on.
1129 	 */
1130 	if (phd == NULL) {
1131 		MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_NOWAIT);
1132 		if (phd == NULL) {
1133 			return (ENOBUFS); /* XXX */
1134 		}
1135 		phd->phd_port = inp->inp_lport;
1136 		LIST_INIT(&phd->phd_pcblist);
1137 		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
1138 	}
1139 	inp->inp_phd = phd;
1140 	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
1141 	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
1142 	return (0);
1143 }
1144 
1145 /*
1146  * Move PCB to the proper hash bucket when { faddr, fport } have  been
1147  * changed. NOTE: This does not handle the case of the lport changing (the
1148  * hashed port list would have to be updated as well), so the lport must
1149  * not change after in_pcbinshash() has been called.
1150  */
1151 void
1152 in_pcbrehash(inp)
1153 	struct inpcb *inp;
1154 {
1155 	struct inpcbhead *head;
1156 	u_int32_t hashkey_faddr;
1157 
1158 #ifdef INET6
1159 	if (inp->inp_vflag & INP_IPV6)
1160 		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1161 	else
1162 #endif /* INET6 */
1163 	hashkey_faddr = inp->inp_faddr.s_addr;
1164 
1165 	head = &inp->inp_pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr,
1166 		inp->inp_lport, inp->inp_fport, inp->inp_pcbinfo->hashmask)];
1167 
1168 	LIST_REMOVE(inp, inp_hash);
1169 	LIST_INSERT_HEAD(head, inp, inp_hash);
1170 }
1171 
1172 /*
1173  * Remove PCB from various lists.
1174  */
1175 void
1176 in_pcbremlists(inp)
1177 	struct inpcb *inp;
1178 {
1179 	inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt;
1180 	if (inp->inp_lport) {
1181 		struct inpcbport *phd = inp->inp_phd;
1182 
1183 		LIST_REMOVE(inp, inp_hash);
1184 		LIST_REMOVE(inp, inp_portlist);
1185 		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1186 			LIST_REMOVE(phd, phd_hash);
1187 			free(phd, M_PCB);
1188 		}
1189 	}
1190 	LIST_REMOVE(inp, inp_list);
1191 	inp->inp_pcbinfo->ipi_count--;
1192 }
1193 
1194 int
1195 prison_xinpcb(struct thread *td, struct inpcb *inp)
1196 {
1197 	if (!jailed(td->td_ucred))
1198 		return (0);
1199 	if (ntohl(inp->inp_laddr.s_addr) == prison_getip(td->td_ucred))
1200 		return (0);
1201 	return (1);
1202 }
1203