xref: /freebsd/sys/netinet/in_pcb.c (revision a3e8fd0b7f663db7eafff527d5c3ca3bcfa8a537)
1 /*
2  * Copyright (c) 1982, 1986, 1991, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
34  * $FreeBSD$
35  */
36 
37 #include "opt_ipsec.h"
38 #include "opt_inet6.h"
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/domain.h>
45 #include <sys/protosw.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/proc.h>
49 #include <sys/jail.h>
50 #include <sys/kernel.h>
51 #include <sys/sysctl.h>
52 
53 #include <machine/limits.h>
54 
55 #include <vm/uma.h>
56 
57 #include <net/if.h>
58 #include <net/if_types.h>
59 #include <net/route.h>
60 
61 #include <netinet/in.h>
62 #include <netinet/in_pcb.h>
63 #include <netinet/in_var.h>
64 #include <netinet/ip_var.h>
65 #ifdef INET6
66 #include <netinet/ip6.h>
67 #include <netinet6/ip6_var.h>
68 #endif /* INET6 */
69 
70 #ifdef IPSEC
71 #include <netinet6/ipsec.h>
72 #include <netkey/key.h>
73 #endif /* IPSEC */
74 
75 #ifdef FAST_IPSEC
76 #if defined(IPSEC) || defined(IPSEC_ESP)
77 #error "Bad idea: don't compile with both IPSEC and FAST_IPSEC!"
78 #endif
79 
80 #include <netipsec/ipsec.h>
81 #include <netipsec/key.h>
82 #define	IPSEC
83 #endif /* FAST_IPSEC */
84 
/*
 * All-zero IPv4 address: a file-scope object with no initializer.
 * Nothing else in this file references it.
 */
85 struct	in_addr zeroin_addr;
86 
87 /*
88  * These configure the range of local port addresses assigned to
89  * "unspecified" outgoing connections/packets/whatever.
90  */
/*
 * Three [first, last] pairs are kept; in_pcbbind_setup() selects one pair
 * per PCB when it has to pick a port itself:
 *   low*   - used when INP_LOWPORT is set on the PCB,
 *   hi*    - used when INP_HIGHPORT is set on the PCB,
 *   plain  - used otherwise.
 * When first > last the allocator walks downwards from first towards last,
 * otherwise it walks upwards.  Every value is exported read/write under the
 * portrange sysctl node and clamped by sysctl_net_ipport_check().
 */
91 int	ipport_lowfirstauto  = IPPORT_RESERVED - 1;	/* 1023 */
92 int	ipport_lowlastauto = IPPORT_RESERVEDSTART;	/* 600 */
93 int	ipport_firstauto = IPPORT_HIFIRSTAUTO;		/* 49152 */
94 int	ipport_lastauto  = IPPORT_HILASTAUTO;		/* 65535 */
95 int	ipport_hifirstauto = IPPORT_HIFIRSTAUTO;	/* 49152 */
96 int	ipport_hilastauto  = IPPORT_HILASTAUTO;		/* 65535 */
97 
/*
 * Clamp the integer lvalue `var` into the inclusive range [min, max].
 *
 * The body is wrapped in do { } while (0) so that the expansion is exactly
 * one statement: it can be followed by a semicolon and used as the body of
 * an unbraced if/else without swallowing the caller's `else`.
 * Note that `var` is evaluated more than once; pass a plain variable.
 */
#define RANGECHK(var, min, max) do {					\
	if ((var) < (min))						\
		(var) = (min);						\
	else if ((var) > (max))						\
		(var) = (max);						\
} while (0)
101 
102 static int
103 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
104 {
105 	int error = sysctl_handle_int(oidp,
106 		oidp->oid_arg1, oidp->oid_arg2, req);
107 	if (!error) {
108 		RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
109 		RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
110 		RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX);
111 		RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX);
112 		RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX);
113 		RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX);
114 	}
115 	return error;
116 }
117 
118 #undef RANGECHK
119 
/*
 * Declare the `portrange` sysctl node beneath _net_inet_ip, then hang one
 * read/write integer leaf off it for each ipport_* variable.  Every leaf
 * uses sysctl_net_ipport_check() as its handler, so a write to any one of
 * them re-clamps all six values.  The description strings are empty.
 */
120 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
121 
/* Range used for PCBs flagged INP_LOWPORT. */
122 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW,
123 	   &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
124 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW,
125 	   &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
/* Default range (neither INP_LOWPORT nor INP_HIGHPORT). */
126 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW,
127 	   &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
128 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW,
129 	   &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
/* Range used for PCBs flagged INP_HIGHPORT. */
130 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW,
131 	   &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
132 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW,
133 	   &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
134 
135 /*
136  * in_pcb.c: manage the Protocol Control Blocks.
137  *
138  * NOTE: It is assumed that most of these functions will be called at
139  * splnet(). XXX - There are, unfortunately, a few exceptions to this
140  * rule that should be fixed.
141  */
142 
143 /*
144  * Allocate a PCB and associate it with the socket.
145  */
146 int
147 in_pcballoc(so, pcbinfo, td)
148 	struct socket *so;
149 	struct inpcbinfo *pcbinfo;
150 	struct thread *td;
151 {
152 	register struct inpcb *inp;
153 #ifdef IPSEC
154 	int error;
155 #endif
156 
157 	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
158 	if (inp == NULL)
159 		return (ENOBUFS);
160 	bzero((caddr_t)inp, sizeof(*inp));
161 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
162 	inp->inp_pcbinfo = pcbinfo;
163 	inp->inp_socket = so;
164 #ifdef IPSEC
165 	error = ipsec_init_policy(so, &inp->inp_sp);
166 	if (error != 0) {
167 		uma_zfree(pcbinfo->ipi_zone, inp);
168 		return error;
169 	}
170 #endif /*IPSEC*/
171 #if defined(INET6)
172 	if (INP_SOCKAF(so) == AF_INET6 && ip6_v6only)
173 		inp->inp_flags |= IN6P_IPV6_V6ONLY;
174 #endif
175 	LIST_INSERT_HEAD(pcbinfo->listhead, inp, inp_list);
176 	pcbinfo->ipi_count++;
177 	so->so_pcb = (caddr_t)inp;
178 	INP_LOCK_INIT(inp, "inp");
179 #ifdef INET6
180 	if (ip6_auto_flowlabel)
181 		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
182 #endif
183 	return (0);
184 }
185 
186 int
187 in_pcbbind(inp, nam, td)
188 	register struct inpcb *inp;
189 	struct sockaddr *nam;
190 	struct thread *td;
191 {
192 	int anonport, error;
193 
194 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
195 		return (EINVAL);
196 	anonport = inp->inp_lport == 0 && (nam == NULL ||
197 	    ((struct sockaddr_in *)nam)->sin_port == 0);
198 	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
199 	    &inp->inp_lport, td);
200 	if (error)
201 		return (error);
202 	if (in_pcbinshash(inp) != 0) {
203 		inp->inp_laddr.s_addr = INADDR_ANY;
204 		inp->inp_lport = 0;
205 		return (EAGAIN);
206 	}
207 	if (anonport)
208 		inp->inp_flags |= INP_ANONPORT;
209 	return (0);
210 }
211 
212 /*
213  * Set up a bind operation on a PCB, performing port allocation
214  * as required, but do not actually modify the PCB. Callers can
215  * either complete the bind by setting inp_laddr/inp_lport and
216  * calling in_pcbinshash(), or they can just use the resulting
217  * port and address to authorise the sending of a once-off packet.
218  *
219  * On error, the values of *laddrp and *lportp are not changed.
220  */
221 int
222 in_pcbbind_setup(inp, nam, laddrp, lportp, td)
223 	struct inpcb *inp;
224 	struct sockaddr *nam;
225 	in_addr_t *laddrp;
226 	u_short *lportp;
227 	struct thread *td;
228 {
229 	struct socket *so = inp->inp_socket;
230 	unsigned short *lastport;
231 	struct sockaddr_in *sin;
232 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
233 	struct in_addr laddr;
234 	u_short lport = 0;
235 	int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
236 	int error, prison = 0;
237 
238 	if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */
239 		return (EADDRNOTAVAIL);
240 	laddr.s_addr = *laddrp;
241 	if (nam != NULL && laddr.s_addr != INADDR_ANY)
242 		return (EINVAL);
243 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
244 		wild = 1;
245 	if (nam) {
246 		sin = (struct sockaddr_in *)nam;
247 		if (nam->sa_len != sizeof (*sin))
248 			return (EINVAL);
249 #ifdef notdef
250 		/*
251 		 * We should check the family, but old programs
252 		 * incorrectly fail to initialize it.
253 		 */
254 		if (sin->sin_family != AF_INET)
255 			return (EAFNOSUPPORT);
256 #endif
257 		if (sin->sin_addr.s_addr != INADDR_ANY)
258 			if (prison_ip(td->td_ucred, 0, &sin->sin_addr.s_addr))
259 				return(EINVAL);
260 		if (sin->sin_port != *lportp) {
261 			/* Don't allow the port to change. */
262 			if (*lportp != 0)
263 				return (EINVAL);
264 			lport = sin->sin_port;
265 		}
266 		/* NB: lport is left as 0 if the port isn't being changed. */
267 		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
268 			/*
269 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
270 			 * allow complete duplication of binding if
271 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
272 			 * and a multicast address is bound on both
273 			 * new and duplicated sockets.
274 			 */
275 			if (so->so_options & SO_REUSEADDR)
276 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
277 		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
278 			sin->sin_port = 0;		/* yech... */
279 			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
280 			if (ifa_ifwithaddr((struct sockaddr *)sin) == 0)
281 				return (EADDRNOTAVAIL);
282 		}
283 		laddr = sin->sin_addr;
284 		if (lport) {
285 			struct inpcb *t;
286 			/* GROSS */
287 			if (ntohs(lport) < IPPORT_RESERVED && td &&
288 			    suser_cred(td->td_ucred, PRISON_ROOT))
289 				return (EACCES);
290 			if (td && jailed(td->td_ucred))
291 				prison = 1;
292 			if (so->so_cred->cr_uid != 0 &&
293 			    !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
294 				t = in_pcblookup_local(inp->inp_pcbinfo,
295 				    sin->sin_addr, lport,
296 				    prison ? 0 :  INPLOOKUP_WILDCARD);
297 				if (t &&
298 				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
299 				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
300 				     (t->inp_socket->so_options &
301 					 SO_REUSEPORT) == 0) &&
302 				    (so->so_cred->cr_uid !=
303 				     t->inp_socket->so_cred->cr_uid)) {
304 #if defined(INET6)
305 					if (ntohl(sin->sin_addr.s_addr) !=
306 					    INADDR_ANY ||
307 					    ntohl(t->inp_laddr.s_addr) !=
308 					    INADDR_ANY ||
309 					    INP_SOCKAF(so) ==
310 					    INP_SOCKAF(t->inp_socket))
311 #endif /* defined(INET6) */
312 					return (EADDRINUSE);
313 				}
314 			}
315 			if (prison &&
316 			    prison_ip(td->td_ucred, 0, &sin->sin_addr.s_addr))
317 				return (EADDRNOTAVAIL);
318 			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
319 			    lport, prison ? 0 : wild);
320 			if (t &&
321 			    (reuseport & t->inp_socket->so_options) == 0) {
322 #if defined(INET6)
323 				if (ntohl(sin->sin_addr.s_addr) !=
324 				    INADDR_ANY ||
325 				    ntohl(t->inp_laddr.s_addr) !=
326 				    INADDR_ANY ||
327 				    INP_SOCKAF(so) ==
328 				    INP_SOCKAF(t->inp_socket))
329 #endif /* defined(INET6) */
330 				return (EADDRINUSE);
331 			}
332 		}
333 	}
334 	if (*lportp != 0)
335 		lport = *lportp;
336 	if (lport == 0) {
337 		ushort first, last;
338 		int count;
339 
340 		if (laddr.s_addr != INADDR_ANY)
341 			if (prison_ip(td->td_ucred, 0, &laddr.s_addr))
342 				return (EINVAL);
343 
344 		if (inp->inp_flags & INP_HIGHPORT) {
345 			first = ipport_hifirstauto;	/* sysctl */
346 			last  = ipport_hilastauto;
347 			lastport = &pcbinfo->lasthi;
348 		} else if (inp->inp_flags & INP_LOWPORT) {
349 			if (td && (error = suser_cred(td->td_ucred,
350 			    PRISON_ROOT)) != 0)
351 				return error;
352 			first = ipport_lowfirstauto;	/* 1023 */
353 			last  = ipport_lowlastauto;	/* 600 */
354 			lastport = &pcbinfo->lastlow;
355 		} else {
356 			first = ipport_firstauto;	/* sysctl */
357 			last  = ipport_lastauto;
358 			lastport = &pcbinfo->lastport;
359 		}
360 		/*
361 		 * Simple check to ensure all ports are not used up causing
362 		 * a deadlock here.
363 		 *
364 		 * We split the two cases (up and down) so that the direction
365 		 * is not being tested on each round of the loop.
366 		 */
367 		if (first > last) {
368 			/*
369 			 * counting down
370 			 */
371 			count = first - last;
372 
373 			do {
374 				if (count-- < 0)	/* completely used? */
375 					return (EADDRNOTAVAIL);
376 				--*lastport;
377 				if (*lastport > first || *lastport < last)
378 					*lastport = first;
379 				lport = htons(*lastport);
380 			} while (in_pcblookup_local(pcbinfo, laddr, lport,
381 			    wild));
382 		} else {
383 			/*
384 			 * counting up
385 			 */
386 			count = last - first;
387 
388 			do {
389 				if (count-- < 0)	/* completely used? */
390 					return (EADDRNOTAVAIL);
391 				++*lastport;
392 				if (*lastport < first || *lastport > last)
393 					*lastport = first;
394 				lport = htons(*lastport);
395 			} while (in_pcblookup_local(pcbinfo, laddr, lport,
396 			    wild));
397 		}
398 	}
399 	if (prison_ip(td->td_ucred, 0, &laddr.s_addr))
400 		return (EINVAL);
401 	*laddrp = laddr.s_addr;
402 	*lportp = lport;
403 	return (0);
404 }
405 
406 /*
407  * Connect from a socket to a specified address.
408  * Both address and port must be specified in argument sin.
409  * If don't have a local address for this socket yet,
410  * then pick one.
411  */
412 int
413 in_pcbconnect(inp, nam, td)
414 	register struct inpcb *inp;
415 	struct sockaddr *nam;
416 	struct thread *td;
417 {
418 	u_short lport, fport;
419 	in_addr_t laddr, faddr;
420 	int anonport, error;
421 
422 	lport = inp->inp_lport;
423 	laddr = inp->inp_laddr.s_addr;
424 	anonport = (lport == 0);
425 	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
426 	    NULL, td);
427 	if (error)
428 		return (error);
429 
430 	/* Do the initial binding of the local address if required. */
431 	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
432 		inp->inp_lport = lport;
433 		inp->inp_laddr.s_addr = laddr;
434 		if (in_pcbinshash(inp) != 0) {
435 			inp->inp_laddr.s_addr = INADDR_ANY;
436 			inp->inp_lport = 0;
437 			return (EAGAIN);
438 		}
439 	}
440 
441 	/* Commit the remaining changes. */
442 	inp->inp_lport = lport;
443 	inp->inp_laddr.s_addr = laddr;
444 	inp->inp_faddr.s_addr = faddr;
445 	inp->inp_fport = fport;
446 	in_pcbrehash(inp);
447 	if (anonport)
448 		inp->inp_flags |= INP_ANONPORT;
449 	return (0);
450 }
451 
452 /*
453  * Set up for a connect from a socket to the specified address.
454  * On entry, *laddrp and *lportp should contain the current local
455  * address and port for the PCB; these are updated to the values
456  * that should be placed in inp_laddr and inp_lport to complete
457  * the connect.
458  *
459  * On success, *faddrp and *fportp will be set to the remote address
460  * and port. These are not updated in the error case.
461  *
462  * If the operation fails because the connection already exists,
463  * *oinpp will be set to the PCB of that connection so that the
464  * caller can decide to override it. In all other cases, *oinpp
465  * is set to NULL.
466  */
467 int
468 in_pcbconnect_setup(inp, nam, laddrp, lportp, faddrp, fportp, oinpp, td)
469 	register struct inpcb *inp;
470 	struct sockaddr *nam;
471 	in_addr_t *laddrp;
472 	u_short *lportp;
473 	in_addr_t *faddrp;
474 	u_short *fportp;
475 	struct inpcb **oinpp;
476 	struct thread *td;
477 {
478 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
479 	struct in_ifaddr *ia;
480 	struct sockaddr_in sa;
481 	struct ucred *cred;
482 	struct inpcb *oinp;
483 	struct in_addr laddr, faddr;
484 	u_short lport, fport;
485 	int error;
486 
487 	if (oinpp != NULL)
488 		*oinpp = NULL;
489 	if (nam->sa_len != sizeof (*sin))
490 		return (EINVAL);
491 	if (sin->sin_family != AF_INET)
492 		return (EAFNOSUPPORT);
493 	if (sin->sin_port == 0)
494 		return (EADDRNOTAVAIL);
495 	laddr.s_addr = *laddrp;
496 	lport = *lportp;
497 	faddr = sin->sin_addr;
498 	fport = sin->sin_port;
499 	cred = inp->inp_socket->so_cred;
500 	if (laddr.s_addr == INADDR_ANY && jailed(cred)) {
501 		bzero(&sa, sizeof(sa));
502 		sa.sin_addr.s_addr = htonl(prison_getip(cred));
503 		sa.sin_len = sizeof(sa);
504 		sa.sin_family = AF_INET;
505 		error = in_pcbbind_setup(inp, (struct sockaddr *)&sa,
506 		    &laddr.s_addr, &lport, td);
507 		if (error)
508 			return (error);
509 	}
510 
511 	if (!TAILQ_EMPTY(&in_ifaddrhead)) {
512 		/*
513 		 * If the destination address is INADDR_ANY,
514 		 * use the primary local address.
515 		 * If the supplied address is INADDR_BROADCAST,
516 		 * and the primary interface supports broadcast,
517 		 * choose the broadcast address for that interface.
518 		 */
519 		if (faddr.s_addr == INADDR_ANY)
520 			faddr = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr;
521 		else if (faddr.s_addr == (u_long)INADDR_BROADCAST &&
522 		    (TAILQ_FIRST(&in_ifaddrhead)->ia_ifp->if_flags &
523 		    IFF_BROADCAST))
524 			faddr = satosin(&TAILQ_FIRST(
525 			    &in_ifaddrhead)->ia_broadaddr)->sin_addr;
526 	}
527 	if (laddr.s_addr == INADDR_ANY) {
528 		register struct route *ro;
529 
530 		ia = (struct in_ifaddr *)0;
531 		/*
532 		 * If route is known or can be allocated now,
533 		 * our src addr is taken from the i/f, else punt.
534 		 * Note that we should check the address family of the cached
535 		 * destination, in case of sharing the cache with IPv6.
536 		 */
537 		ro = &inp->inp_route;
538 		if (ro->ro_rt &&
539 		    (ro->ro_dst.sa_family != AF_INET ||
540 		     satosin(&ro->ro_dst)->sin_addr.s_addr != faddr.s_addr ||
541 		     inp->inp_socket->so_options & SO_DONTROUTE)) {
542 			RTFREE(ro->ro_rt);
543 			ro->ro_rt = (struct rtentry *)0;
544 		}
545 		if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/
546 		    (ro->ro_rt == (struct rtentry *)0 ||
547 		    ro->ro_rt->rt_ifp == (struct ifnet *)0)) {
548 			/* No route yet, so try to acquire one */
549 			bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
550 			ro->ro_dst.sa_family = AF_INET;
551 			ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
552 			((struct sockaddr_in *)&ro->ro_dst)->sin_addr = faddr;
553 			rtalloc(ro);
554 		}
555 		/*
556 		 * If we found a route, use the address
557 		 * corresponding to the outgoing interface
558 		 * unless it is the loopback (in case a route
559 		 * to our address on another net goes to loopback).
560 		 */
561 		if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))
562 			ia = ifatoia(ro->ro_rt->rt_ifa);
563 		if (ia == 0) {
564 			bzero(&sa, sizeof(sa));
565 			sa.sin_addr = faddr;
566 			sa.sin_len = sizeof(sa);
567 			sa.sin_family = AF_INET;
568 
569 			ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sa)));
570 			if (ia == 0)
571 				ia = ifatoia(ifa_ifwithnet(sintosa(&sa)));
572 			if (ia == 0)
573 				ia = TAILQ_FIRST(&in_ifaddrhead);
574 			if (ia == 0)
575 				return (EADDRNOTAVAIL);
576 		}
577 		/*
578 		 * If the destination address is multicast and an outgoing
579 		 * interface has been set as a multicast option, use the
580 		 * address of that interface as our source address.
581 		 */
582 		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
583 		    inp->inp_moptions != NULL) {
584 			struct ip_moptions *imo;
585 			struct ifnet *ifp;
586 
587 			imo = inp->inp_moptions;
588 			if (imo->imo_multicast_ifp != NULL) {
589 				ifp = imo->imo_multicast_ifp;
590 				TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link)
591 					if (ia->ia_ifp == ifp)
592 						break;
593 				if (ia == 0)
594 					return (EADDRNOTAVAIL);
595 			}
596 		}
597 		laddr = ia->ia_addr.sin_addr;
598 	}
599 
600 	oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport,
601 	    0, NULL);
602 	if (oinp != NULL) {
603 		if (oinpp != NULL)
604 			*oinpp = oinp;
605 		return (EADDRINUSE);
606 	}
607 	if (lport == 0) {
608 		error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport, td);
609 		if (error)
610 			return (error);
611 	}
612 	*laddrp = laddr.s_addr;
613 	*lportp = lport;
614 	*faddrp = faddr.s_addr;
615 	*fportp = fport;
616 	return (0);
617 }
618 
619 void
620 in_pcbdisconnect(inp)
621 	struct inpcb *inp;
622 {
623 
624 	inp->inp_faddr.s_addr = INADDR_ANY;
625 	inp->inp_fport = 0;
626 	in_pcbrehash(inp);
627 	if (inp->inp_socket->so_state & SS_NOFDREF)
628 		in_pcbdetach(inp);
629 }
630 
631 void
632 in_pcbdetach(inp)
633 	struct inpcb *inp;
634 {
635 	struct socket *so = inp->inp_socket;
636 	struct inpcbinfo *ipi = inp->inp_pcbinfo;
637 
638 #ifdef IPSEC
639 	ipsec4_delete_pcbpolicy(inp);
640 #endif /*IPSEC*/
641 	inp->inp_gencnt = ++ipi->ipi_gencnt;
642 	in_pcbremlists(inp);
643 	so->so_pcb = 0;
644 	sotryfree(so);
645 	if (inp->inp_options)
646 		(void)m_free(inp->inp_options);
647 	if (inp->inp_route.ro_rt)
648 		rtfree(inp->inp_route.ro_rt);
649 	ip_freemoptions(inp->inp_moptions);
650 	inp->inp_vflag = 0;
651 	INP_LOCK_DESTROY(inp);
652 	uma_zfree(ipi->ipi_zone, inp);
653 }
654 
655 struct sockaddr *
656 in_sockaddr(port, addr_p)
657 	in_port_t port;
658 	struct in_addr *addr_p;
659 {
660 	struct sockaddr_in *sin;
661 
662 	MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
663 		M_WAITOK | M_ZERO);
664 	sin->sin_family = AF_INET;
665 	sin->sin_len = sizeof(*sin);
666 	sin->sin_addr = *addr_p;
667 	sin->sin_port = port;
668 
669 	return (struct sockaddr *)sin;
670 }
671 
672 /*
673  * The wrapper function will pass down the pcbinfo for this function to lock.
674  * The socket must have a valid
675  * (i.e., non-nil) PCB, but it should be impossible to get an invalid one
676  * except through a kernel programming error, so it is acceptable to panic
677  * (or in this case trap) if the PCB is invalid.  (Actually, we don't trap
678  * because there actually /is/ a programming error somewhere... XXX)
679  */
680 int
681 in_setsockaddr(so, nam, pcbinfo)
682 	struct socket *so;
683 	struct sockaddr **nam;
684 	struct inpcbinfo *pcbinfo;
685 {
686 	int s;
687 	register struct inpcb *inp;
688 	struct in_addr addr;
689 	in_port_t port;
690 
691 	s = splnet();
692 	INP_INFO_RLOCK(pcbinfo);
693 	inp = sotoinpcb(so);
694 	if (!inp) {
695 		INP_INFO_RUNLOCK(pcbinfo);
696 		splx(s);
697 		return ECONNRESET;
698 	}
699 	INP_LOCK(inp);
700 	port = inp->inp_lport;
701 	addr = inp->inp_laddr;
702 	INP_UNLOCK(inp);
703 	INP_INFO_RUNLOCK(pcbinfo);
704 	splx(s);
705 
706 	*nam = in_sockaddr(port, &addr);
707 	return 0;
708 }
709 
710 /*
711  * The wrapper function will pass down the pcbinfo for this function to lock.
712  */
713 int
714 in_setpeeraddr(so, nam, pcbinfo)
715 	struct socket *so;
716 	struct sockaddr **nam;
717 	struct inpcbinfo *pcbinfo;
718 {
719 	int s;
720 	register struct inpcb *inp;
721 	struct in_addr addr;
722 	in_port_t port;
723 
724 	s = splnet();
725 	INP_INFO_RLOCK(pcbinfo);
726 	inp = sotoinpcb(so);
727 	if (!inp) {
728 		INP_INFO_RUNLOCK(pcbinfo);
729 		splx(s);
730 		return ECONNRESET;
731 	}
732 	INP_LOCK(inp);
733 	port = inp->inp_fport;
734 	addr = inp->inp_faddr;
735 	INP_UNLOCK(inp);
736 	INP_INFO_RUNLOCK(pcbinfo);
737 	splx(s);
738 
739 	*nam = in_sockaddr(port, &addr);
740 	return 0;
741 }
742 
743 void
744 in_pcbnotifyall(pcbinfo, faddr, errno, notify)
745 	struct inpcbinfo *pcbinfo;
746 	struct in_addr faddr;
747 	int errno;
748 	struct inpcb *(*notify)(struct inpcb *, int);
749 {
750 	struct inpcb *inp, *ninp;
751 	struct inpcbhead *head;
752 	int s;
753 
754 	s = splnet();
755 	INP_INFO_RLOCK(pcbinfo);
756 	head = pcbinfo->listhead;
757 	for (inp = LIST_FIRST(head); inp != NULL; inp = ninp) {
758 		INP_LOCK(inp);
759 		ninp = LIST_NEXT(inp, inp_list);
760 #ifdef INET6
761 		if ((inp->inp_vflag & INP_IPV4) == 0) {
762 			INP_UNLOCK(inp);
763 			continue;
764 		}
765 #endif
766 		if (inp->inp_faddr.s_addr != faddr.s_addr ||
767 		    inp->inp_socket == NULL) {
768 				INP_UNLOCK(inp);
769 				continue;
770 		}
771 		(*notify)(inp, errno);
772 		INP_UNLOCK(inp);
773 	}
774 	INP_INFO_RUNLOCK(pcbinfo);
775 	splx(s);
776 }
777 
778 void
779 in_pcbpurgeif0(pcbinfo, ifp)
780 	struct inpcbinfo *pcbinfo;
781 	struct ifnet *ifp;
782 {
783 	struct inpcb *inp;
784 	struct ip_moptions *imo;
785 	int i, gap;
786 
787 	/* why no splnet here? XXX */
788 	INP_INFO_RLOCK(pcbinfo);
789 	LIST_FOREACH(inp, pcbinfo->listhead, inp_list) {
790 		INP_LOCK(inp);
791 		imo = inp->inp_moptions;
792 		if ((inp->inp_vflag & INP_IPV4) &&
793 		    imo != NULL) {
794 			/*
795 			 * Unselect the outgoing interface if it is being
796 			 * detached.
797 			 */
798 			if (imo->imo_multicast_ifp == ifp)
799 				imo->imo_multicast_ifp = NULL;
800 
801 			/*
802 			 * Drop multicast group membership if we joined
803 			 * through the interface being detached.
804 			 */
805 			for (i = 0, gap = 0; i < imo->imo_num_memberships;
806 			    i++) {
807 				if (imo->imo_membership[i]->inm_ifp == ifp) {
808 					in_delmulti(imo->imo_membership[i]);
809 					gap++;
810 				} else if (gap != 0)
811 					imo->imo_membership[i - gap] =
812 					    imo->imo_membership[i];
813 			}
814 			imo->imo_num_memberships -= gap;
815 		}
816 		INP_UNLOCK(inp);
817 	}
818 	INP_INFO_RUNLOCK(pcbinfo);
819 }
820 
821 /*
822  * Check for alternatives when higher level complains
823  * about service problems.  For now, invalidate cached
824  * routing information.  If the route was created dynamically
825  * (by a redirect), time to try a default gateway again.
826  */
827 void
828 in_losing(inp)
829 	struct inpcb *inp;
830 {
831 	register struct rtentry *rt;
832 	struct rt_addrinfo info;
833 
834 	if ((rt = inp->inp_route.ro_rt)) {
835 		bzero((caddr_t)&info, sizeof(info));
836 		info.rti_flags = rt->rt_flags;
837 		info.rti_info[RTAX_DST] = rt_key(rt);
838 		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
839 		info.rti_info[RTAX_NETMASK] = rt_mask(rt);
840 		rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
841 		if (rt->rt_flags & RTF_DYNAMIC)
842 			(void) rtrequest1(RTM_DELETE, &info, NULL);
843 		inp->inp_route.ro_rt = NULL;
844 		rtfree(rt);
845 		/*
846 		 * A new route can be allocated
847 		 * the next time output is attempted.
848 		 */
849 	}
850 }
851 
852 /*
853  * After a routing change, flush old routing
854  * and allocate a (hopefully) better one.
855  */
856 struct inpcb *
857 in_rtchange(inp, errno)
858 	register struct inpcb *inp;
859 	int errno;
860 {
861 	if (inp->inp_route.ro_rt) {
862 		rtfree(inp->inp_route.ro_rt);
863 		inp->inp_route.ro_rt = 0;
864 		/*
865 		 * A new route can be allocated the next time
866 		 * output is attempted.
867 		 */
868 	}
869 	return inp;
870 }
871 
872 /*
873  * Lookup a PCB based on the local address and port.
874  */
875 struct inpcb *
876 in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay)
877 	struct inpcbinfo *pcbinfo;
878 	struct in_addr laddr;
879 	u_int lport_arg;
880 	int wild_okay;
881 {
882 	register struct inpcb *inp;
883 	int matchwild = 3, wildcard;
884 	u_short lport = lport_arg;
885 
886 	if (!wild_okay) {
887 		struct inpcbhead *head;
888 		/*
889 		 * Look for an unconnected (wildcard foreign addr) PCB that
890 		 * matches the local address and port we're looking for.
891 		 */
892 		head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
893 		LIST_FOREACH(inp, head, inp_hash) {
894 #ifdef INET6
895 			if ((inp->inp_vflag & INP_IPV4) == 0)
896 				continue;
897 #endif
898 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
899 			    inp->inp_laddr.s_addr == laddr.s_addr &&
900 			    inp->inp_lport == lport) {
901 				/*
902 				 * Found.
903 				 */
904 				return (inp);
905 			}
906 		}
907 		/*
908 		 * Not found.
909 		 */
910 		return (NULL);
911 	} else {
912 		struct inpcbporthead *porthash;
913 		struct inpcbport *phd;
914 		struct inpcb *match = NULL;
915 		/*
916 		 * Best fit PCB lookup.
917 		 *
918 		 * First see if this local port is in use by looking on the
919 		 * port hash list.
920 		 */
921 		porthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(lport,
922 		    pcbinfo->porthashmask)];
923 		LIST_FOREACH(phd, porthash, phd_hash) {
924 			if (phd->phd_port == lport)
925 				break;
926 		}
927 		if (phd != NULL) {
928 			/*
929 			 * Port is in use by one or more PCBs. Look for best
930 			 * fit.
931 			 */
932 			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
933 				wildcard = 0;
934 #ifdef INET6
935 				if ((inp->inp_vflag & INP_IPV4) == 0)
936 					continue;
937 #endif
938 				if (inp->inp_faddr.s_addr != INADDR_ANY)
939 					wildcard++;
940 				if (inp->inp_laddr.s_addr != INADDR_ANY) {
941 					if (laddr.s_addr == INADDR_ANY)
942 						wildcard++;
943 					else if (inp->inp_laddr.s_addr != laddr.s_addr)
944 						continue;
945 				} else {
946 					if (laddr.s_addr != INADDR_ANY)
947 						wildcard++;
948 				}
949 				if (wildcard < matchwild) {
950 					match = inp;
951 					matchwild = wildcard;
952 					if (matchwild == 0) {
953 						break;
954 					}
955 				}
956 			}
957 		}
958 		return (match);
959 	}
960 }
961 
962 /*
963  * Lookup PCB in hash list.
964  */
965 struct inpcb *
966 in_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard,
967 		  ifp)
968 	struct inpcbinfo *pcbinfo;
969 	struct in_addr faddr, laddr;
970 	u_int fport_arg, lport_arg;
971 	int wildcard;
972 	struct ifnet *ifp;
973 {
974 	struct inpcbhead *head;
975 	register struct inpcb *inp;
976 	u_short fport = fport_arg, lport = lport_arg;
977 
978 	/*
979 	 * First look for an exact match.
980 	 */
981 	head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->hashmask)];
982 	LIST_FOREACH(inp, head, inp_hash) {
983 #ifdef INET6
984 		if ((inp->inp_vflag & INP_IPV4) == 0)
985 			continue;
986 #endif
987 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
988 		    inp->inp_laddr.s_addr == laddr.s_addr &&
989 		    inp->inp_fport == fport &&
990 		    inp->inp_lport == lport) {
991 			/*
992 			 * Found.
993 			 */
994 			return (inp);
995 		}
996 	}
997 	if (wildcard) {
998 		struct inpcb *local_wild = NULL;
999 #if defined(INET6)
1000 		struct inpcb *local_wild_mapped = NULL;
1001 #endif /* defined(INET6) */
1002 
1003 		head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
1004 		LIST_FOREACH(inp, head, inp_hash) {
1005 #ifdef INET6
1006 			if ((inp->inp_vflag & INP_IPV4) == 0)
1007 				continue;
1008 #endif
1009 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
1010 			    inp->inp_lport == lport) {
1011 				if (ifp && ifp->if_type == IFT_FAITH &&
1012 				    (inp->inp_flags & INP_FAITH) == 0)
1013 					continue;
1014 				if (inp->inp_laddr.s_addr == laddr.s_addr)
1015 					return (inp);
1016 				else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1017 #if defined(INET6)
1018 					if (INP_CHECK_SOCKAF(inp->inp_socket,
1019 							     AF_INET6))
1020 						local_wild_mapped = inp;
1021 					else
1022 #endif /* defined(INET6) */
1023 					local_wild = inp;
1024 				}
1025 			}
1026 		}
1027 #if defined(INET6)
1028 		if (local_wild == NULL)
1029 			return (local_wild_mapped);
1030 #endif /* defined(INET6) */
1031 		return (local_wild);
1032 	}
1033 
1034 	/*
1035 	 * Not found.
1036 	 */
1037 	return (NULL);
1038 }
1039 
1040 /*
1041  * Insert PCB onto various hash lists.
1042  */
1043 int
1044 in_pcbinshash(inp)
1045 	struct inpcb *inp;
1046 {
1047 	struct inpcbhead *pcbhash;
1048 	struct inpcbporthead *pcbporthash;
1049 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1050 	struct inpcbport *phd;
1051 	u_int32_t hashkey_faddr;
1052 
1053 #ifdef INET6
1054 	if (inp->inp_vflag & INP_IPV6)
1055 		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1056 	else
1057 #endif /* INET6 */
1058 	hashkey_faddr = inp->inp_faddr.s_addr;
1059 
1060 	pcbhash = &pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr,
1061 		 inp->inp_lport, inp->inp_fport, pcbinfo->hashmask)];
1062 
1063 	pcbporthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(inp->inp_lport,
1064 	    pcbinfo->porthashmask)];
1065 
1066 	/*
1067 	 * Go through port list and look for a head for this lport.
1068 	 */
1069 	LIST_FOREACH(phd, pcbporthash, phd_hash) {
1070 		if (phd->phd_port == inp->inp_lport)
1071 			break;
1072 	}
1073 	/*
1074 	 * If none exists, malloc one and tack it on.
1075 	 */
1076 	if (phd == NULL) {
1077 		MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_NOWAIT);
1078 		if (phd == NULL) {
1079 			return (ENOBUFS); /* XXX */
1080 		}
1081 		phd->phd_port = inp->inp_lport;
1082 		LIST_INIT(&phd->phd_pcblist);
1083 		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
1084 	}
1085 	inp->inp_phd = phd;
1086 	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
1087 	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
1088 	return (0);
1089 }
1090 
1091 /*
1092  * Move PCB to the proper hash bucket when { faddr, fport } have  been
1093  * changed. NOTE: This does not handle the case of the lport changing (the
1094  * hashed port list would have to be updated as well), so the lport must
1095  * not change after in_pcbinshash() has been called.
1096  */
1097 void
1098 in_pcbrehash(inp)
1099 	struct inpcb *inp;
1100 {
1101 	struct inpcbhead *head;
1102 	u_int32_t hashkey_faddr;
1103 
1104 #ifdef INET6
1105 	if (inp->inp_vflag & INP_IPV6)
1106 		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1107 	else
1108 #endif /* INET6 */
1109 	hashkey_faddr = inp->inp_faddr.s_addr;
1110 
1111 	head = &inp->inp_pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr,
1112 		inp->inp_lport, inp->inp_fport, inp->inp_pcbinfo->hashmask)];
1113 
1114 	LIST_REMOVE(inp, inp_hash);
1115 	LIST_INSERT_HEAD(head, inp, inp_hash);
1116 }
1117 
1118 /*
1119  * Remove PCB from various lists.
1120  */
1121 void
1122 in_pcbremlists(inp)
1123 	struct inpcb *inp;
1124 {
1125 	inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt;
1126 	if (inp->inp_lport) {
1127 		struct inpcbport *phd = inp->inp_phd;
1128 
1129 		LIST_REMOVE(inp, inp_hash);
1130 		LIST_REMOVE(inp, inp_portlist);
1131 		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1132 			LIST_REMOVE(phd, phd_hash);
1133 			free(phd, M_PCB);
1134 		}
1135 	}
1136 	LIST_REMOVE(inp, inp_list);
1137 	inp->inp_pcbinfo->ipi_count--;
1138 }
1139 
1140 int
1141 prison_xinpcb(struct thread *td, struct inpcb *inp)
1142 {
1143 	if (!jailed(td->td_ucred))
1144 		return (0);
1145 	if (ntohl(inp->inp_laddr.s_addr) == prison_getip(td->td_ucred))
1146 		return (0);
1147 	return (1);
1148 }
1149