xref: /freebsd/sys/netinet/udp_usrreq.c (revision e45764721aedfa6460e1767664864bda9457c10e)
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3  *	The Regents of the University of California.
4  * Copyright (c) 2008 Robert N. M. Watson
5  * Copyright (c) 2010-2011 Juniper Networks, Inc.
6  * Copyright (c) 2014 Kevin Lo
7  * All rights reserved.
8  *
9  * Portions of this software were developed by Robert N. M. Watson under
10  * contract to Juniper Networks, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
37  */
38 
39 #include <sys/cdefs.h>
40 __FBSDID("$FreeBSD$");
41 
42 #include "opt_ipfw.h"
43 #include "opt_inet.h"
44 #include "opt_inet6.h"
45 #include "opt_ipsec.h"
46 
47 #include <sys/param.h>
48 #include <sys/domain.h>
49 #include <sys/eventhandler.h>
50 #include <sys/jail.h>
51 #include <sys/kernel.h>
52 #include <sys/lock.h>
53 #include <sys/malloc.h>
54 #include <sys/mbuf.h>
55 #include <sys/priv.h>
56 #include <sys/proc.h>
57 #include <sys/protosw.h>
58 #include <sys/sdt.h>
59 #include <sys/signalvar.h>
60 #include <sys/socket.h>
61 #include <sys/socketvar.h>
62 #include <sys/sx.h>
63 #include <sys/sysctl.h>
64 #include <sys/syslog.h>
65 #include <sys/systm.h>
66 
67 #include <vm/uma.h>
68 
69 #include <net/if.h>
70 #include <net/if_var.h>
71 #include <net/route.h>
72 
73 #include <netinet/in.h>
74 #include <netinet/in_kdtrace.h>
75 #include <netinet/in_pcb.h>
76 #include <netinet/in_systm.h>
77 #include <netinet/in_var.h>
78 #include <netinet/ip.h>
79 #ifdef INET6
80 #include <netinet/ip6.h>
81 #endif
82 #include <netinet/ip_icmp.h>
83 #include <netinet/icmp_var.h>
84 #include <netinet/ip_var.h>
85 #include <netinet/ip_options.h>
86 #ifdef INET6
87 #include <netinet6/ip6_var.h>
88 #endif
89 #include <netinet/udp.h>
90 #include <netinet/udp_var.h>
91 #include <netinet/udplite.h>
92 
93 #ifdef IPSEC
94 #include <netipsec/ipsec.h>
95 #include <netipsec/esp.h>
96 #endif
97 
98 #include <machine/in_cksum.h>
99 
100 #include <security/mac/mac_framework.h>
101 
102 /*
103  * UDP and UDP-Lite protocols implementation.
104  * Per RFC 768, August, 1980.
105  * Per RFC 3828, July, 2004.
106  */
107 
108 /*
109  * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
110  * removes the only data integrity mechanism for packets and malformed
111  * packets that would otherwise be discarded due to bad checksums, and may
112  * cause problems (especially for NFS data blocks).
113  */
114 VNET_DEFINE(int, udp_cksum) = 1;
115 SYSCTL_VNET_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW,
116     &VNET_NAME(udp_cksum), 0, "compute udp checksum");
117 
118 int	udp_log_in_vain = 0;
119 SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
120     &udp_log_in_vain, 0, "Log all incoming UDP packets");
121 
122 VNET_DEFINE(int, udp_blackhole) = 0;
123 SYSCTL_VNET_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW,
124     &VNET_NAME(udp_blackhole), 0,
125     "Do not send port unreachables for refused connects");
126 
127 u_long	udp_sendspace = 9216;		/* really max datagram size */
128 					/* 40 1K datagrams */
129 SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
130     &udp_sendspace, 0, "Maximum outgoing UDP datagram size");
131 
132 u_long	udp_recvspace = 40 * (1024 +
133 #ifdef INET6
134 				      sizeof(struct sockaddr_in6)
135 #else
136 				      sizeof(struct sockaddr_in)
137 #endif
138 				      );
139 
140 SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
141     &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
142 
143 VNET_DEFINE(struct inpcbhead, udb);		/* from udp_var.h */
144 VNET_DEFINE(struct inpcbinfo, udbinfo);
145 VNET_DEFINE(struct inpcbhead, ulitecb);
146 VNET_DEFINE(struct inpcbinfo, ulitecbinfo);
147 static VNET_DEFINE(uma_zone_t, udpcb_zone);
148 #define	V_udpcb_zone			VNET(udpcb_zone)
149 
150 #ifndef UDBHASHSIZE
151 #define	UDBHASHSIZE	128
152 #endif
153 
154 VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat);		/* from udp_var.h */
155 VNET_PCPUSTAT_SYSINIT(udpstat);
156 SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat,
157     udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");
158 
159 #ifdef VIMAGE
160 VNET_PCPUSTAT_SYSUNINIT(udpstat);
161 #endif /* VIMAGE */
/* Forward declarations for functions defined later in this file. */
#ifdef INET
static void	udp_detach(struct socket *so);
static int	udp_output(struct inpcb *inp, struct mbuf *m,
		    struct sockaddr *addr, struct mbuf *control,
		    struct thread *td);
#endif

#if defined(IPSEC) && defined(IPSEC_NAT_T)
/* Mask of every ESP-in-UDP encapsulation flag kept in udpcb u_flags. */
#define	UF_ESPINUDP_ALL	(UF_ESPINUDP_NON_IKE|UF_ESPINUDP)
#ifdef INET
static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int);
#endif
#endif /* IPSEC && IPSEC_NAT_T */
176 
177 static void
178 udp_zone_change(void *tag)
179 {
180 
181 	uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
182 	uma_zone_set_max(V_udpcb_zone, maxsockets);
183 }
184 
/*
 * Item init callback passed to in_pcbinfo_init() for the UDP inpcb zone:
 * set up the lock of the inpcb stored at mem.  size and flags are unused.
 * Always returns 0.
 */
static int
udp_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *pcb = (struct inpcb *)mem;

	INP_LOCK_INIT(pcb, "inp", "udpinp");
	return (0);
}
194 
/*
 * Item init callback passed to in_pcbinfo_init() for the UDP-Lite inpcb
 * zone: set up the lock of the inpcb stored at mem.  size and flags are
 * unused.  Always returns 0.
 */
static int
udplite_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *pcb = (struct inpcb *)mem;

	INP_LOCK_INIT(pcb, "inp", "udpliteinp");
	return (0);
}
204 
205 void
206 udp_init(void)
207 {
208 
209 	in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
210 	    "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE,
211 	    IPI_HASHFIELDS_2TUPLE);
212 	V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
213 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
214 	uma_zone_set_max(V_udpcb_zone, maxsockets);
215 	uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
216 	EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
217 	    EVENTHANDLER_PRI_ANY);
218 }
219 
220 void
221 udplite_init(void)
222 {
223 
224 	in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE,
225 	    UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init, NULL,
226 	    UMA_ZONE_NOFREE, IPI_HASHFIELDS_2TUPLE);
227 }
228 
229 /*
230  * Kernel module interface for updating udpstat.  The argument is an index
231  * into udpstat treated as an array of u_long.  While this encodes the
232  * general layout of udpstat into the caller, it doesn't encode its location,
233  * so that future changes to add, for example, per-CPU stats support won't
234  * cause binary compatibility problems for kernel modules.
235  */
236 void
237 kmod_udpstat_inc(int statnum)
238 {
239 
240 	counter_u64_add(VNET(udpstat)[statnum], 1);
241 }
242 
243 int
244 udp_newudpcb(struct inpcb *inp)
245 {
246 	struct udpcb *up;
247 
248 	up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
249 	if (up == NULL)
250 		return (ENOBUFS);
251 	inp->inp_ppcb = up;
252 	return (0);
253 }
254 
255 void
256 udp_discardcb(struct udpcb *up)
257 {
258 
259 	uma_zfree(V_udpcb_zone, up);
260 }
261 
262 #ifdef VIMAGE
263 void
264 udp_destroy(void)
265 {
266 
267 	in_pcbinfo_destroy(&V_udbinfo);
268 	uma_zdestroy(V_udpcb_zone);
269 }
270 
271 void
272 udplite_destroy(void)
273 {
274 
275 	in_pcbinfo_destroy(&V_ulitecbinfo);
276 }
277 #endif
278 
279 #ifdef INET
280 /*
281  * Subroutine of udp_input(), which appends the provided mbuf chain to the
282  * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
283  * contains the source address.  If the socket ends up being an IPv6 socket,
284  * udp_append() will convert to a sockaddr_in6 before passing the address
285  * into the socket code.
286  */
287 static void
288 udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
289     struct sockaddr_in *udp_in)
290 {
291 	struct sockaddr *append_sa;
292 	struct socket *so;
293 	struct mbuf *opts = 0;
294 #ifdef INET6
295 	struct sockaddr_in6 udp_in6;
296 #endif
297 	struct udpcb *up;
298 
299 	INP_LOCK_ASSERT(inp);
300 
301 	/*
302 	 * Engage the tunneling protocol.
303 	 */
304 	up = intoudpcb(inp);
305 	if (up->u_tun_func != NULL) {
306 		(*up->u_tun_func)(n, off, inp);
307 		return;
308 	}
309 
310 	if (n == NULL)
311 		return;
312 
313 	off += sizeof(struct udphdr);
314 
315 #ifdef IPSEC
316 	/* Check AH/ESP integrity. */
317 	if (ipsec4_in_reject(n, inp)) {
318 		m_freem(n);
319 		IPSECSTAT_INC(ips_in_polvio);
320 		return;
321 	}
322 #ifdef IPSEC_NAT_T
323 	up = intoudpcb(inp);
324 	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
325 	if (up->u_flags & UF_ESPINUDP_ALL) {	/* IPSec UDP encaps. */
326 		n = udp4_espdecap(inp, n, off);
327 		if (n == NULL)				/* Consumed. */
328 			return;
329 	}
330 #endif /* IPSEC_NAT_T */
331 #endif /* IPSEC */
332 #ifdef MAC
333 	if (mac_inpcb_check_deliver(inp, n) != 0) {
334 		m_freem(n);
335 		return;
336 	}
337 #endif /* MAC */
338 	if (inp->inp_flags & INP_CONTROLOPTS ||
339 	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
340 #ifdef INET6
341 		if (inp->inp_vflag & INP_IPV6)
342 			(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
343 		else
344 #endif /* INET6 */
345 			ip_savecontrol(inp, &opts, ip, n);
346 	}
347 #ifdef INET6
348 	if (inp->inp_vflag & INP_IPV6) {
349 		bzero(&udp_in6, sizeof(udp_in6));
350 		udp_in6.sin6_len = sizeof(udp_in6);
351 		udp_in6.sin6_family = AF_INET6;
352 		in6_sin_2_v4mapsin6(udp_in, &udp_in6);
353 		append_sa = (struct sockaddr *)&udp_in6;
354 	} else
355 #endif /* INET6 */
356 		append_sa = (struct sockaddr *)udp_in;
357 	m_adj(n, off);
358 
359 	so = inp->inp_socket;
360 	SOCKBUF_LOCK(&so->so_rcv);
361 	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
362 		SOCKBUF_UNLOCK(&so->so_rcv);
363 		m_freem(n);
364 		if (opts)
365 			m_freem(opts);
366 		UDPSTAT_INC(udps_fullsock);
367 	} else
368 		sorwakeup_locked(so);
369 }
370 
371 int
372 udp_input(struct mbuf **mp, int *offp, int proto)
373 {
374 	struct ip *ip;
375 	struct udphdr *uh;
376 	struct ifnet *ifp;
377 	struct inpcb *inp;
378 	uint16_t len, ip_len;
379 	struct inpcbinfo *pcbinfo;
380 	struct ip save_ip;
381 	struct sockaddr_in udp_in;
382 	struct mbuf *m;
383 	struct m_tag *fwd_tag;
384 	int cscov_partial, iphlen;
385 
386 	m = *mp;
387 	iphlen = *offp;
388 	ifp = m->m_pkthdr.rcvif;
389 	*mp = NULL;
390 	UDPSTAT_INC(udps_ipackets);
391 
392 	/*
393 	 * Strip IP options, if any; should skip this, make available to
394 	 * user, and use on returned packets, but we don't yet have a way to
395 	 * check the checksum with options still present.
396 	 */
397 	if (iphlen > sizeof (struct ip)) {
398 		ip_stripoptions(m);
399 		iphlen = sizeof(struct ip);
400 	}
401 
402 	/*
403 	 * Get IP and UDP header together in first mbuf.
404 	 */
405 	ip = mtod(m, struct ip *);
406 	if (m->m_len < iphlen + sizeof(struct udphdr)) {
407 		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) {
408 			UDPSTAT_INC(udps_hdrops);
409 			return (IPPROTO_DONE);
410 		}
411 		ip = mtod(m, struct ip *);
412 	}
413 	uh = (struct udphdr *)((caddr_t)ip + iphlen);
414 	cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0;
415 
416 	/*
417 	 * Destination port of 0 is illegal, based on RFC768.
418 	 */
419 	if (uh->uh_dport == 0)
420 		goto badunlocked;
421 
422 	/*
423 	 * Construct sockaddr format source address.  Stuff source address
424 	 * and datagram in user buffer.
425 	 */
426 	bzero(&udp_in, sizeof(udp_in));
427 	udp_in.sin_len = sizeof(udp_in);
428 	udp_in.sin_family = AF_INET;
429 	udp_in.sin_port = uh->uh_sport;
430 	udp_in.sin_addr = ip->ip_src;
431 
432 	/*
433 	 * Make mbuf data length reflect UDP length.  If not enough data to
434 	 * reflect UDP length, drop.
435 	 */
436 	len = ntohs((u_short)uh->uh_ulen);
437 	ip_len = ntohs(ip->ip_len) - iphlen;
438 	if (proto == IPPROTO_UDPLITE && len == 0) {
439 		/* Zero means checksum over the complete packet. */
440 		len = ip_len;
441 		cscov_partial = 0;
442 	}
443 	if (ip_len != len) {
444 		if (len > ip_len || len < sizeof(struct udphdr)) {
445 			UDPSTAT_INC(udps_badlen);
446 			goto badunlocked;
447 		}
448 		if (proto == IPPROTO_UDP)
449 			m_adj(m, len - ip_len);
450 	}
451 
452 	/*
453 	 * Save a copy of the IP header in case we want restore it for
454 	 * sending an ICMP error message in response.
455 	 */
456 	if (!V_udp_blackhole)
457 		save_ip = *ip;
458 	else
459 		memset(&save_ip, 0, sizeof(save_ip));
460 
461 	/*
462 	 * Checksum extended UDP header and data.
463 	 */
464 	if (uh->uh_sum) {
465 		u_short uh_sum;
466 
467 		if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
468 		    !cscov_partial) {
469 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
470 				uh_sum = m->m_pkthdr.csum_data;
471 			else
472 				uh_sum = in_pseudo(ip->ip_src.s_addr,
473 				    ip->ip_dst.s_addr, htonl((u_short)len +
474 				    m->m_pkthdr.csum_data + proto));
475 			uh_sum ^= 0xffff;
476 		} else {
477 			char b[9];
478 
479 			bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
480 			bzero(((struct ipovly *)ip)->ih_x1, 9);
481 			((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ?
482 			    uh->uh_ulen : htons(ip_len);
483 			uh_sum = in_cksum(m, len + sizeof (struct ip));
484 			bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
485 		}
486 		if (uh_sum) {
487 			UDPSTAT_INC(udps_badsum);
488 			m_freem(m);
489 			return (IPPROTO_DONE);
490 		}
491 	} else
492 		UDPSTAT_INC(udps_nosum);
493 
494 	pcbinfo = get_inpcbinfo(proto);
495 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
496 	    in_broadcast(ip->ip_dst, ifp)) {
497 		struct inpcb *last;
498 		struct inpcbhead *pcblist;
499 		struct ip_moptions *imo;
500 
501 		INP_INFO_RLOCK(pcbinfo);
502 		pcblist = get_pcblist(proto);
503 		last = NULL;
504 		LIST_FOREACH(inp, pcblist, inp_list) {
505 			if (inp->inp_lport != uh->uh_dport)
506 				continue;
507 #ifdef INET6
508 			if ((inp->inp_vflag & INP_IPV4) == 0)
509 				continue;
510 #endif
511 			if (inp->inp_laddr.s_addr != INADDR_ANY &&
512 			    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
513 				continue;
514 			if (inp->inp_faddr.s_addr != INADDR_ANY &&
515 			    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
516 				continue;
517 			if (inp->inp_fport != 0 &&
518 			    inp->inp_fport != uh->uh_sport)
519 				continue;
520 
521 			INP_RLOCK(inp);
522 
523 			/*
524 			 * XXXRW: Because we weren't holding either the inpcb
525 			 * or the hash lock when we checked for a match
526 			 * before, we should probably recheck now that the
527 			 * inpcb lock is held.
528 			 */
529 
530 			/*
531 			 * Handle socket delivery policy for any-source
532 			 * and source-specific multicast. [RFC3678]
533 			 */
534 			imo = inp->inp_moptions;
535 			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
536 				struct sockaddr_in	 group;
537 				int			 blocked;
538 				if (imo == NULL) {
539 					INP_RUNLOCK(inp);
540 					continue;
541 				}
542 				bzero(&group, sizeof(struct sockaddr_in));
543 				group.sin_len = sizeof(struct sockaddr_in);
544 				group.sin_family = AF_INET;
545 				group.sin_addr = ip->ip_dst;
546 
547 				blocked = imo_multi_filter(imo, ifp,
548 					(struct sockaddr *)&group,
549 					(struct sockaddr *)&udp_in);
550 				if (blocked != MCAST_PASS) {
551 					if (blocked == MCAST_NOTGMEMBER)
552 						IPSTAT_INC(ips_notmember);
553 					if (blocked == MCAST_NOTSMEMBER ||
554 					    blocked == MCAST_MUTED)
555 						UDPSTAT_INC(udps_filtermcast);
556 					INP_RUNLOCK(inp);
557 					continue;
558 				}
559 			}
560 			if (last != NULL) {
561 				struct mbuf *n;
562 
563 				n = m_copy(m, 0, M_COPYALL);
564 				udp_append(last, ip, n, iphlen, &udp_in);
565 				INP_RUNLOCK(last);
566 			}
567 			last = inp;
568 			/*
569 			 * Don't look for additional matches if this one does
570 			 * not have either the SO_REUSEPORT or SO_REUSEADDR
571 			 * socket options set.  This heuristic avoids
572 			 * searching through all pcbs in the common case of a
573 			 * non-shared port.  It assumes that an application
574 			 * will never clear these options after setting them.
575 			 */
576 			if ((last->inp_socket->so_options &
577 			    (SO_REUSEPORT|SO_REUSEADDR)) == 0)
578 				break;
579 		}
580 
581 		if (last == NULL) {
582 			/*
583 			 * No matching pcb found; discard datagram.  (No need
584 			 * to send an ICMP Port Unreachable for a broadcast
585 			 * or multicast datgram.)
586 			 */
587 			UDPSTAT_INC(udps_noportbcast);
588 			if (inp)
589 				INP_RUNLOCK(inp);
590 			INP_INFO_RUNLOCK(pcbinfo);
591 			goto badunlocked;
592 		}
593 		udp_append(last, ip, m, iphlen, &udp_in);
594 		INP_RUNLOCK(last);
595 		INP_INFO_RUNLOCK(pcbinfo);
596 		return (IPPROTO_DONE);
597 	}
598 
599 	/*
600 	 * Locate pcb for datagram.
601 	 */
602 
603 	/*
604 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
605 	 */
606 	if ((m->m_flags & M_IP_NEXTHOP) &&
607 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
608 		struct sockaddr_in *next_hop;
609 
610 		next_hop = (struct sockaddr_in *)(fwd_tag + 1);
611 
612 		/*
613 		 * Transparently forwarded. Pretend to be the destination.
614 		 * Already got one like this?
615 		 */
616 		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
617 		    ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
618 		if (!inp) {
619 			/*
620 			 * It's new.  Try to find the ambushing socket.
621 			 * Because we've rewritten the destination address,
622 			 * any hardware-generated hash is ignored.
623 			 */
624 			inp = in_pcblookup(pcbinfo, ip->ip_src,
625 			    uh->uh_sport, next_hop->sin_addr,
626 			    next_hop->sin_port ? htons(next_hop->sin_port) :
627 			    uh->uh_dport, INPLOOKUP_WILDCARD |
628 			    INPLOOKUP_RLOCKPCB, ifp);
629 		}
630 		/* Remove the tag from the packet. We don't need it anymore. */
631 		m_tag_delete(m, fwd_tag);
632 		m->m_flags &= ~M_IP_NEXTHOP;
633 	} else
634 		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
635 		    ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
636 		    INPLOOKUP_RLOCKPCB, ifp, m);
637 	if (inp == NULL) {
638 		if (udp_log_in_vain) {
639 			char buf[4*sizeof "123"];
640 
641 			strcpy(buf, inet_ntoa(ip->ip_dst));
642 			log(LOG_INFO,
643 			    "Connection attempt to UDP %s:%d from %s:%d\n",
644 			    buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
645 			    ntohs(uh->uh_sport));
646 		}
647 		UDPSTAT_INC(udps_noport);
648 		if (m->m_flags & (M_BCAST | M_MCAST)) {
649 			UDPSTAT_INC(udps_noportbcast);
650 			goto badunlocked;
651 		}
652 		if (V_udp_blackhole)
653 			goto badunlocked;
654 		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
655 			goto badunlocked;
656 		*ip = save_ip;
657 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
658 		return (IPPROTO_DONE);
659 	}
660 
661 	/*
662 	 * Check the minimum TTL for socket.
663 	 */
664 	INP_RLOCK_ASSERT(inp);
665 	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
666 		INP_RUNLOCK(inp);
667 		m_freem(m);
668 		return (IPPROTO_DONE);
669 	}
670 	if (cscov_partial) {
671 		struct udpcb *up;
672 
673 		up = intoudpcb(inp);
674 		if (up->u_rxcslen > len) {
675 			INP_RUNLOCK(inp);
676 			m_freem(m);
677 			return (IPPROTO_DONE);
678 		}
679 	}
680 
681 	UDP_PROBE(receive, NULL, inp, ip, inp, uh);
682 	udp_append(inp, ip, m, iphlen, &udp_in);
683 	INP_RUNLOCK(inp);
684 	return (IPPROTO_DONE);
685 
686 badunlocked:
687 	m_freem(m);
688 	return (IPPROTO_DONE);
689 }
690 #endif /* INET */
691 
692 /*
693  * Notify a udp user of an asynchronous error; just wake up so that they can
694  * collect error status.
695  */
696 struct inpcb *
697 udp_notify(struct inpcb *inp, int errno)
698 {
699 
700 	/*
701 	 * While udp_ctlinput() always calls udp_notify() with a read lock
702 	 * when invoking it directly, in_pcbnotifyall() currently uses write
703 	 * locks due to sharing code with TCP.  For now, accept either a read
704 	 * or a write lock, but a read lock is sufficient.
705 	 */
706 	INP_LOCK_ASSERT(inp);
707 
708 	inp->inp_socket->so_error = errno;
709 	sorwakeup(inp->inp_socket);
710 	sowwakeup(inp->inp_socket);
711 	return (inp);
712 }
713 
714 #ifdef INET
715 static void
716 udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip,
717     struct inpcbinfo *pcbinfo)
718 {
719 	struct ip *ip = vip;
720 	struct udphdr *uh;
721 	struct in_addr faddr;
722 	struct inpcb *inp;
723 
724 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
725 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
726 		return;
727 
728 	/*
729 	 * Redirects don't need to be handled up here.
730 	 */
731 	if (PRC_IS_REDIRECT(cmd))
732 		return;
733 
734 	/*
735 	 * Hostdead is ugly because it goes linearly through all PCBs.
736 	 *
737 	 * XXX: We never get this from ICMP, otherwise it makes an excellent
738 	 * DoS attack on machines with many connections.
739 	 */
740 	if (cmd == PRC_HOSTDEAD)
741 		ip = NULL;
742 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
743 		return;
744 	if (ip != NULL) {
745 		uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
746 		inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
747 		    ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL);
748 		if (inp != NULL) {
749 			INP_RLOCK_ASSERT(inp);
750 			if (inp->inp_socket != NULL) {
751 				udp_notify(inp, inetctlerrmap[cmd]);
752 			}
753 			INP_RUNLOCK(inp);
754 		}
755 	} else
756 		in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd],
757 		    udp_notify);
758 }
759 void
760 udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
761 {
762 
763 	return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo));
764 }
765 
766 void
767 udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip)
768 {
769 
770 	return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo));
771 }
772 #endif /* INET */
773 
774 static int
775 udp_pcblist(SYSCTL_HANDLER_ARGS)
776 {
777 	int error, i, n;
778 	struct inpcb *inp, **inp_list;
779 	inp_gen_t gencnt;
780 	struct xinpgen xig;
781 
782 	/*
783 	 * The process of preparing the PCB list is too time-consuming and
784 	 * resource-intensive to repeat twice on every request.
785 	 */
786 	if (req->oldptr == 0) {
787 		n = V_udbinfo.ipi_count;
788 		n += imax(n / 8, 10);
789 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
790 		return (0);
791 	}
792 
793 	if (req->newptr != 0)
794 		return (EPERM);
795 
796 	/*
797 	 * OK, now we're committed to doing something.
798 	 */
799 	INP_INFO_RLOCK(&V_udbinfo);
800 	gencnt = V_udbinfo.ipi_gencnt;
801 	n = V_udbinfo.ipi_count;
802 	INP_INFO_RUNLOCK(&V_udbinfo);
803 
804 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
805 		+ n * sizeof(struct xinpcb));
806 	if (error != 0)
807 		return (error);
808 
809 	xig.xig_len = sizeof xig;
810 	xig.xig_count = n;
811 	xig.xig_gen = gencnt;
812 	xig.xig_sogen = so_gencnt;
813 	error = SYSCTL_OUT(req, &xig, sizeof xig);
814 	if (error)
815 		return (error);
816 
817 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
818 	if (inp_list == 0)
819 		return (ENOMEM);
820 
821 	INP_INFO_RLOCK(&V_udbinfo);
822 	for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n;
823 	     inp = LIST_NEXT(inp, inp_list)) {
824 		INP_WLOCK(inp);
825 		if (inp->inp_gencnt <= gencnt &&
826 		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
827 			in_pcbref(inp);
828 			inp_list[i++] = inp;
829 		}
830 		INP_WUNLOCK(inp);
831 	}
832 	INP_INFO_RUNLOCK(&V_udbinfo);
833 	n = i;
834 
835 	error = 0;
836 	for (i = 0; i < n; i++) {
837 		inp = inp_list[i];
838 		INP_RLOCK(inp);
839 		if (inp->inp_gencnt <= gencnt) {
840 			struct xinpcb xi;
841 
842 			bzero(&xi, sizeof(xi));
843 			xi.xi_len = sizeof xi;
844 			/* XXX should avoid extra copy */
845 			bcopy(inp, &xi.xi_inp, sizeof *inp);
846 			if (inp->inp_socket)
847 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
848 			xi.xi_inp.inp_gencnt = inp->inp_gencnt;
849 			INP_RUNLOCK(inp);
850 			error = SYSCTL_OUT(req, &xi, sizeof xi);
851 		} else
852 			INP_RUNLOCK(inp);
853 	}
854 	INP_INFO_WLOCK(&V_udbinfo);
855 	for (i = 0; i < n; i++) {
856 		inp = inp_list[i];
857 		INP_RLOCK(inp);
858 		if (!in_pcbrele_rlocked(inp))
859 			INP_RUNLOCK(inp);
860 	}
861 	INP_INFO_WUNLOCK(&V_udbinfo);
862 
863 	if (!error) {
864 		/*
865 		 * Give the user an updated idea of our state.  If the
866 		 * generation differs from what we told her before, she knows
867 		 * that something happened while we were processing this
868 		 * request, and it might be necessary to retry.
869 		 */
870 		INP_INFO_RLOCK(&V_udbinfo);
871 		xig.xig_gen = V_udbinfo.ipi_gencnt;
872 		xig.xig_sogen = so_gencnt;
873 		xig.xig_count = V_udbinfo.ipi_count;
874 		INP_INFO_RUNLOCK(&V_udbinfo);
875 		error = SYSCTL_OUT(req, &xig, sizeof xig);
876 	}
877 	free(inp_list, M_TEMP);
878 	return (error);
879 }
880 
881 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
882     CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
883     udp_pcblist, "S,xinpcb", "List of active UDP sockets");
884 
885 #ifdef INET
886 static int
887 udp_getcred(SYSCTL_HANDLER_ARGS)
888 {
889 	struct xucred xuc;
890 	struct sockaddr_in addrs[2];
891 	struct inpcb *inp;
892 	int error;
893 
894 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
895 	if (error)
896 		return (error);
897 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
898 	if (error)
899 		return (error);
900 	inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
901 	    addrs[0].sin_addr, addrs[0].sin_port,
902 	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
903 	if (inp != NULL) {
904 		INP_RLOCK_ASSERT(inp);
905 		if (inp->inp_socket == NULL)
906 			error = ENOENT;
907 		if (error == 0)
908 			error = cr_canseeinpcb(req->td->td_ucred, inp);
909 		if (error == 0)
910 			cru2x(inp->inp_cred, &xuc);
911 		INP_RUNLOCK(inp);
912 	} else
913 		error = ENOENT;
914 	if (error == 0)
915 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
916 	return (error);
917 }
918 
919 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
920     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
921     udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
922 #endif /* INET */
923 
924 int
925 udp_ctloutput(struct socket *so, struct sockopt *sopt)
926 {
927 	struct inpcb *inp;
928 	struct udpcb *up;
929 	int isudplite, error, optval;
930 
931 	error = 0;
932 	isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0;
933 	inp = sotoinpcb(so);
934 	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
935 	INP_WLOCK(inp);
936 	if (sopt->sopt_level != so->so_proto->pr_protocol) {
937 #ifdef INET6
938 		if (INP_CHECK_SOCKAF(so, AF_INET6)) {
939 			INP_WUNLOCK(inp);
940 			error = ip6_ctloutput(so, sopt);
941 		}
942 #endif
943 #if defined(INET) && defined(INET6)
944 		else
945 #endif
946 #ifdef INET
947 		{
948 			INP_WUNLOCK(inp);
949 			error = ip_ctloutput(so, sopt);
950 		}
951 #endif
952 		return (error);
953 	}
954 
955 	switch (sopt->sopt_dir) {
956 	case SOPT_SET:
957 		switch (sopt->sopt_name) {
958 		case UDP_ENCAP:
959 			INP_WUNLOCK(inp);
960 			error = sooptcopyin(sopt, &optval, sizeof optval,
961 					    sizeof optval);
962 			if (error)
963 				break;
964 			inp = sotoinpcb(so);
965 			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
966 			INP_WLOCK(inp);
967 #ifdef IPSEC_NAT_T
968 			up = intoudpcb(inp);
969 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
970 #endif
971 			switch (optval) {
972 			case 0:
973 				/* Clear all UDP encap. */
974 #ifdef IPSEC_NAT_T
975 				up->u_flags &= ~UF_ESPINUDP_ALL;
976 #endif
977 				break;
978 #ifdef IPSEC_NAT_T
979 			case UDP_ENCAP_ESPINUDP:
980 			case UDP_ENCAP_ESPINUDP_NON_IKE:
981 				up->u_flags &= ~UF_ESPINUDP_ALL;
982 				if (optval == UDP_ENCAP_ESPINUDP)
983 					up->u_flags |= UF_ESPINUDP;
984 				else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE)
985 					up->u_flags |= UF_ESPINUDP_NON_IKE;
986 				break;
987 #endif
988 			default:
989 				error = EINVAL;
990 				break;
991 			}
992 			INP_WUNLOCK(inp);
993 			break;
994 		case UDPLITE_SEND_CSCOV:
995 		case UDPLITE_RECV_CSCOV:
996 			if (!isudplite) {
997 				INP_WUNLOCK(inp);
998 				error = ENOPROTOOPT;
999 				break;
1000 			}
1001 			INP_WUNLOCK(inp);
1002 			error = sooptcopyin(sopt, &optval, sizeof(optval),
1003 			    sizeof(optval));
1004 			if (error != 0)
1005 				break;
1006 			inp = sotoinpcb(so);
1007 			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
1008 			INP_WLOCK(inp);
1009 			up = intoudpcb(inp);
1010 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
1011 			if (optval != 0 && optval < 8) {
1012 				INP_WUNLOCK(inp);
1013 				error = EINVAL;
1014 				break;
1015 			}
1016 			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
1017 				up->u_txcslen = optval;
1018 			else
1019 				up->u_rxcslen = optval;
1020 			INP_WUNLOCK(inp);
1021 			break;
1022 		default:
1023 			INP_WUNLOCK(inp);
1024 			error = ENOPROTOOPT;
1025 			break;
1026 		}
1027 		break;
1028 	case SOPT_GET:
1029 		switch (sopt->sopt_name) {
1030 #ifdef IPSEC_NAT_T
1031 		case UDP_ENCAP:
1032 			up = intoudpcb(inp);
1033 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
1034 			optval = up->u_flags & UF_ESPINUDP_ALL;
1035 			INP_WUNLOCK(inp);
1036 			error = sooptcopyout(sopt, &optval, sizeof optval);
1037 			break;
1038 #endif
1039 		case UDPLITE_SEND_CSCOV:
1040 		case UDPLITE_RECV_CSCOV:
1041 			if (!isudplite) {
1042 				INP_WUNLOCK(inp);
1043 				error = ENOPROTOOPT;
1044 				break;
1045 			}
1046 			up = intoudpcb(inp);
1047 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
1048 			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
1049 				optval = up->u_txcslen;
1050 			else
1051 				optval = up->u_rxcslen;
1052 			INP_WUNLOCK(inp);
1053 			error = sooptcopyout(sopt, &optval, sizeof(optval));
1054 			break;
1055 		default:
1056 			INP_WUNLOCK(inp);
1057 			error = ENOPROTOOPT;
1058 			break;
1059 		}
1060 		break;
1061 	}
1062 	return (error);
1063 }
1064 
1065 #ifdef INET
1066 #define	UH_WLOCKED	2
1067 #define	UH_RLOCKED	1
1068 #define	UH_UNLOCKED	0
1069 static int
1070 udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
1071     struct mbuf *control, struct thread *td)
1072 {
1073 	struct udpiphdr *ui;
1074 	int len = m->m_pkthdr.len;
1075 	struct in_addr faddr, laddr;
1076 	struct cmsghdr *cm;
1077 	struct inpcbinfo *pcbinfo;
1078 	struct sockaddr_in *sin, src;
1079 	int cscov_partial = 0;
1080 	int error = 0;
1081 	int ipflags;
1082 	u_short fport, lport;
1083 	int unlock_udbinfo;
1084 	u_char tos;
1085 	uint8_t pr;
1086 	uint16_t cscov = 0;
1087 
1088 	/*
1089 	 * udp_output() may need to temporarily bind or connect the current
1090 	 * inpcb.  As such, we don't know up front whether we will need the
1091 	 * pcbinfo lock or not.  Do any work to decide what is needed up
1092 	 * front before acquiring any locks.
1093 	 */
1094 	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
1095 		if (control)
1096 			m_freem(control);
1097 		m_freem(m);
1098 		return (EMSGSIZE);
1099 	}
1100 
1101 	src.sin_family = 0;
1102 	INP_RLOCK(inp);
1103 	tos = inp->inp_ip_tos;
1104 	if (control != NULL) {
1105 		/*
1106 		 * XXX: Currently, we assume all the optional information is
1107 		 * stored in a single mbuf.
1108 		 */
1109 		if (control->m_next) {
1110 			INP_RUNLOCK(inp);
1111 			m_freem(control);
1112 			m_freem(m);
1113 			return (EINVAL);
1114 		}
1115 		for (; control->m_len > 0;
1116 		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
1117 		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
1118 			cm = mtod(control, struct cmsghdr *);
1119 			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
1120 			    || cm->cmsg_len > control->m_len) {
1121 				error = EINVAL;
1122 				break;
1123 			}
1124 			if (cm->cmsg_level != IPPROTO_IP)
1125 				continue;
1126 
1127 			switch (cm->cmsg_type) {
1128 			case IP_SENDSRCADDR:
1129 				if (cm->cmsg_len !=
1130 				    CMSG_LEN(sizeof(struct in_addr))) {
1131 					error = EINVAL;
1132 					break;
1133 				}
1134 				bzero(&src, sizeof(src));
1135 				src.sin_family = AF_INET;
1136 				src.sin_len = sizeof(src);
1137 				src.sin_port = inp->inp_lport;
1138 				src.sin_addr =
1139 				    *(struct in_addr *)CMSG_DATA(cm);
1140 				break;
1141 
1142 			case IP_TOS:
1143 				if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
1144 					error = EINVAL;
1145 					break;
1146 				}
1147 				tos = *(u_char *)CMSG_DATA(cm);
1148 				break;
1149 
1150 			default:
1151 				error = ENOPROTOOPT;
1152 				break;
1153 			}
1154 			if (error)
1155 				break;
1156 		}
1157 		m_freem(control);
1158 	}
1159 	if (error) {
1160 		INP_RUNLOCK(inp);
1161 		m_freem(m);
1162 		return (error);
1163 	}
1164 
1165 	/*
1166 	 * Depending on whether or not the application has bound or connected
1167 	 * the socket, we may have to do varying levels of work.  The optimal
1168 	 * case is for a connected UDP socket, as a global lock isn't
1169 	 * required at all.
1170 	 *
1171 	 * In order to decide which we need, we require stability of the
1172 	 * inpcb binding, which we ensure by acquiring a read lock on the
1173 	 * inpcb.  This doesn't strictly follow the lock order, so we play
1174 	 * the trylock and retry game; note that we may end up with more
1175 	 * conservative locks than required the second time around, so later
1176 	 * assertions have to accept that.  Further analysis of the number of
1177 	 * misses under contention is required.
1178 	 *
1179 	 * XXXRW: Check that hash locking update here is correct.
1180 	 */
1181 	pr = inp->inp_socket->so_proto->pr_protocol;
1182 	pcbinfo = get_inpcbinfo(pr);
1183 	sin = (struct sockaddr_in *)addr;
1184 	if (sin != NULL &&
1185 	    (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
1186 		INP_RUNLOCK(inp);
1187 		INP_WLOCK(inp);
1188 		INP_HASH_WLOCK(pcbinfo);
1189 		unlock_udbinfo = UH_WLOCKED;
1190 	} else if ((sin != NULL && (
1191 	    (sin->sin_addr.s_addr == INADDR_ANY) ||
1192 	    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
1193 	    (inp->inp_laddr.s_addr == INADDR_ANY) ||
1194 	    (inp->inp_lport == 0))) ||
1195 	    (src.sin_family == AF_INET)) {
1196 		INP_HASH_RLOCK(pcbinfo);
1197 		unlock_udbinfo = UH_RLOCKED;
1198 	} else
1199 		unlock_udbinfo = UH_UNLOCKED;
1200 
1201 	/*
1202 	 * If the IP_SENDSRCADDR control message was specified, override the
1203 	 * source address for this datagram.  Its use is invalidated if the
1204 	 * address thus specified is incomplete or clobbers other inpcbs.
1205 	 */
1206 	laddr = inp->inp_laddr;
1207 	lport = inp->inp_lport;
1208 	if (src.sin_family == AF_INET) {
1209 		INP_HASH_LOCK_ASSERT(pcbinfo);
1210 		if ((lport == 0) ||
1211 		    (laddr.s_addr == INADDR_ANY &&
1212 		     src.sin_addr.s_addr == INADDR_ANY)) {
1213 			error = EINVAL;
1214 			goto release;
1215 		}
1216 		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
1217 		    &laddr.s_addr, &lport, td->td_ucred);
1218 		if (error)
1219 			goto release;
1220 	}
1221 
1222 	/*
1223 	 * If a UDP socket has been connected, then a local address/port will
1224 	 * have been selected and bound.
1225 	 *
1226 	 * If a UDP socket has not been connected to, then an explicit
1227 	 * destination address must be used, in which case a local
1228 	 * address/port may not have been selected and bound.
1229 	 */
1230 	if (sin != NULL) {
1231 		INP_LOCK_ASSERT(inp);
1232 		if (inp->inp_faddr.s_addr != INADDR_ANY) {
1233 			error = EISCONN;
1234 			goto release;
1235 		}
1236 
1237 		/*
1238 		 * Jail may rewrite the destination address, so let it do
1239 		 * that before we use it.
1240 		 */
1241 		error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1242 		if (error)
1243 			goto release;
1244 
1245 		/*
1246 		 * If a local address or port hasn't yet been selected, or if
1247 		 * the destination address needs to be rewritten due to using
1248 		 * a special INADDR_ constant, invoke in_pcbconnect_setup()
1249 		 * to do the heavy lifting.  Once a port is selected, we
1250 		 * commit the binding back to the socket; we also commit the
1251 		 * binding of the address if in jail.
1252 		 *
1253 		 * If we already have a valid binding and we're not
1254 		 * requesting a destination address rewrite, use a fast path.
1255 		 */
1256 		if (inp->inp_laddr.s_addr == INADDR_ANY ||
1257 		    inp->inp_lport == 0 ||
1258 		    sin->sin_addr.s_addr == INADDR_ANY ||
1259 		    sin->sin_addr.s_addr == INADDR_BROADCAST) {
1260 			INP_HASH_LOCK_ASSERT(pcbinfo);
1261 			error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
1262 			    &lport, &faddr.s_addr, &fport, NULL,
1263 			    td->td_ucred);
1264 			if (error)
1265 				goto release;
1266 
1267 			/*
1268 			 * XXXRW: Why not commit the port if the address is
1269 			 * !INADDR_ANY?
1270 			 */
1271 			/* Commit the local port if newly assigned. */
1272 			if (inp->inp_laddr.s_addr == INADDR_ANY &&
1273 			    inp->inp_lport == 0) {
1274 				INP_WLOCK_ASSERT(inp);
1275 				INP_HASH_WLOCK_ASSERT(pcbinfo);
1276 				/*
1277 				 * Remember addr if jailed, to prevent
1278 				 * rebinding.
1279 				 */
1280 				if (prison_flag(td->td_ucred, PR_IP4))
1281 					inp->inp_laddr = laddr;
1282 				inp->inp_lport = lport;
1283 				if (in_pcbinshash(inp) != 0) {
1284 					inp->inp_lport = 0;
1285 					error = EAGAIN;
1286 					goto release;
1287 				}
1288 				inp->inp_flags |= INP_ANONPORT;
1289 			}
1290 		} else {
1291 			faddr = sin->sin_addr;
1292 			fport = sin->sin_port;
1293 		}
1294 	} else {
1295 		INP_LOCK_ASSERT(inp);
1296 		faddr = inp->inp_faddr;
1297 		fport = inp->inp_fport;
1298 		if (faddr.s_addr == INADDR_ANY) {
1299 			error = ENOTCONN;
1300 			goto release;
1301 		}
1302 	}
1303 
1304 	/*
1305 	 * Calculate data length and get a mbuf for UDP, IP, and possible
1306 	 * link-layer headers.  Immediate slide the data pointer back forward
1307 	 * since we won't use that space at this layer.
1308 	 */
1309 	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
1310 	if (m == NULL) {
1311 		error = ENOBUFS;
1312 		goto release;
1313 	}
1314 	m->m_data += max_linkhdr;
1315 	m->m_len -= max_linkhdr;
1316 	m->m_pkthdr.len -= max_linkhdr;
1317 
1318 	/*
1319 	 * Fill in mbuf with extended UDP header and addresses and length put
1320 	 * into network format.
1321 	 */
1322 	ui = mtod(m, struct udpiphdr *);
1323 	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
1324 	ui->ui_pr = pr;
1325 	ui->ui_src = laddr;
1326 	ui->ui_dst = faddr;
1327 	ui->ui_sport = lport;
1328 	ui->ui_dport = fport;
1329 	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
1330 	if (pr == IPPROTO_UDPLITE) {
1331 		struct udpcb *up;
1332 		uint16_t plen;
1333 
1334 		up = intoudpcb(inp);
1335 		cscov = up->u_txcslen;
1336 		plen = (u_short)len + sizeof(struct udphdr);
1337 		if (cscov >= plen)
1338 			cscov = 0;
1339 		ui->ui_len = htons(plen);
1340 		ui->ui_ulen = htons(cscov);
1341 		/*
1342 		 * For UDP-Lite, checksum coverage length of zero means
1343 		 * the entire UDPLite packet is covered by the checksum.
1344 		 */
1345 		cscov_partial = (cscov == 0) ? 0 : 1;
1346 	} else
1347 		ui->ui_v = IPVERSION << 4;
1348 
1349 	/*
1350 	 * Set the Don't Fragment bit in the IP header.
1351 	 */
1352 	if (inp->inp_flags & INP_DONTFRAG) {
1353 		struct ip *ip;
1354 
1355 		ip = (struct ip *)&ui->ui_i;
1356 		ip->ip_off |= htons(IP_DF);
1357 	}
1358 
1359 	ipflags = 0;
1360 	if (inp->inp_socket->so_options & SO_DONTROUTE)
1361 		ipflags |= IP_ROUTETOIF;
1362 	if (inp->inp_socket->so_options & SO_BROADCAST)
1363 		ipflags |= IP_ALLOWBROADCAST;
1364 	if (inp->inp_flags & INP_ONESBCAST)
1365 		ipflags |= IP_SENDONES;
1366 
1367 #ifdef MAC
1368 	mac_inpcb_create_mbuf(inp, m);
1369 #endif
1370 
1371 	/*
1372 	 * Set up checksum and output datagram.
1373 	 */
1374 	ui->ui_sum = 0;
1375 	if (pr == IPPROTO_UDPLITE) {
1376 		if (inp->inp_flags & INP_ONESBCAST)
1377 			faddr.s_addr = INADDR_BROADCAST;
1378 		if (cscov_partial) {
1379 			if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0)
1380 				ui->ui_sum = 0xffff;
1381 		} else {
1382 			if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0)
1383 				ui->ui_sum = 0xffff;
1384 		}
1385 	} else if (V_udp_cksum) {
1386 		if (inp->inp_flags & INP_ONESBCAST)
1387 			faddr.s_addr = INADDR_BROADCAST;
1388 		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
1389 		    htons((u_short)len + sizeof(struct udphdr) + pr));
1390 		m->m_pkthdr.csum_flags = CSUM_UDP;
1391 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
1392 	}
1393 	((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
1394 	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
1395 	((struct ip *)ui)->ip_tos = tos;		/* XXX */
1396 	UDPSTAT_INC(udps_opackets);
1397 
1398 	if (unlock_udbinfo == UH_WLOCKED)
1399 		INP_HASH_WUNLOCK(pcbinfo);
1400 	else if (unlock_udbinfo == UH_RLOCKED)
1401 		INP_HASH_RUNLOCK(pcbinfo);
1402 	UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
1403 	error = ip_output(m, inp->inp_options, NULL, ipflags,
1404 	    inp->inp_moptions, inp);
1405 	if (unlock_udbinfo == UH_WLOCKED)
1406 		INP_WUNLOCK(inp);
1407 	else
1408 		INP_RUNLOCK(inp);
1409 	return (error);
1410 
1411 release:
1412 	if (unlock_udbinfo == UH_WLOCKED) {
1413 		INP_HASH_WUNLOCK(pcbinfo);
1414 		INP_WUNLOCK(inp);
1415 	} else if (unlock_udbinfo == UH_RLOCKED) {
1416 		INP_HASH_RUNLOCK(pcbinfo);
1417 		INP_RUNLOCK(inp);
1418 	} else
1419 		INP_RUNLOCK(inp);
1420 	m_freem(m);
1421 	return (error);
1422 }
1423 
1424 
1425 #if defined(IPSEC) && defined(IPSEC_NAT_T)
1426 /*
1427  * Potentially decap ESP in UDP frame.  Check for an ESP header
1428  * and optional marker; if present, strip the UDP header and
1429  * push the result through IPSec.
1430  *
1431  * Returns mbuf to be processed (potentially re-allocated) or
1432  * NULL if consumed and/or processed.
1433  */
1434 static struct mbuf *
1435 udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
1436 {
1437 	size_t minlen, payload, skip, iphlen;
1438 	caddr_t data;
1439 	struct udpcb *up;
1440 	struct m_tag *tag;
1441 	struct udphdr *udphdr;
1442 	struct ip *ip;
1443 
1444 	INP_RLOCK_ASSERT(inp);
1445 
1446 	/*
1447 	 * Pull up data so the longest case is contiguous:
1448 	 *    IP/UDP hdr + non ESP marker + ESP hdr.
1449 	 */
1450 	minlen = off + sizeof(uint64_t) + sizeof(struct esp);
1451 	if (minlen > m->m_pkthdr.len)
1452 		minlen = m->m_pkthdr.len;
1453 	if ((m = m_pullup(m, minlen)) == NULL) {
1454 		IPSECSTAT_INC(ips_in_inval);
1455 		return (NULL);		/* Bypass caller processing. */
1456 	}
1457 	data = mtod(m, caddr_t);	/* Points to ip header. */
1458 	payload = m->m_len - off;	/* Size of payload. */
1459 
1460 	if (payload == 1 && data[off] == '\xff')
1461 		return (m);		/* NB: keepalive packet, no decap. */
1462 
1463 	up = intoudpcb(inp);
1464 	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
1465 	KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0,
1466 	    ("u_flags 0x%x", up->u_flags));
1467 
1468 	/*
1469 	 * Check that the payload is large enough to hold an
1470 	 * ESP header and compute the amount of data to remove.
1471 	 *
1472 	 * NB: the caller has already done a pullup for us.
1473 	 * XXX can we assume alignment and eliminate bcopys?
1474 	 */
1475 	if (up->u_flags & UF_ESPINUDP_NON_IKE) {
1476 		/*
1477 		 * draft-ietf-ipsec-nat-t-ike-0[01].txt and
1478 		 * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring
1479 		 * possible AH mode non-IKE marker+non-ESP marker
1480 		 * from draft-ietf-ipsec-udp-encaps-00.txt.
1481 		 */
1482 		uint64_t marker;
1483 
1484 		if (payload <= sizeof(uint64_t) + sizeof(struct esp))
1485 			return (m);	/* NB: no decap. */
1486 		bcopy(data + off, &marker, sizeof(uint64_t));
1487 		if (marker != 0)	/* Non-IKE marker. */
1488 			return (m);	/* NB: no decap. */
1489 		skip = sizeof(uint64_t) + sizeof(struct udphdr);
1490 	} else {
1491 		uint32_t spi;
1492 
1493 		if (payload <= sizeof(struct esp)) {
1494 			IPSECSTAT_INC(ips_in_inval);
1495 			m_freem(m);
1496 			return (NULL);	/* Discard. */
1497 		}
1498 		bcopy(data + off, &spi, sizeof(uint32_t));
1499 		if (spi == 0)		/* Non-ESP marker. */
1500 			return (m);	/* NB: no decap. */
1501 		skip = sizeof(struct udphdr);
1502 	}
1503 
1504 	/*
1505 	 * Setup a PACKET_TAG_IPSEC_NAT_T_PORT tag to remember
1506 	 * the UDP ports. This is required if we want to select
1507 	 * the right SPD for multiple hosts behind same NAT.
1508 	 *
1509 	 * NB: ports are maintained in network byte order everywhere
1510 	 *     in the NAT-T code.
1511 	 */
1512 	tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS,
1513 		2 * sizeof(uint16_t), M_NOWAIT);
1514 	if (tag == NULL) {
1515 		IPSECSTAT_INC(ips_in_nomem);
1516 		m_freem(m);
1517 		return (NULL);		/* Discard. */
1518 	}
1519 	iphlen = off - sizeof(struct udphdr);
1520 	udphdr = (struct udphdr *)(data + iphlen);
1521 	((uint16_t *)(tag + 1))[0] = udphdr->uh_sport;
1522 	((uint16_t *)(tag + 1))[1] = udphdr->uh_dport;
1523 	m_tag_prepend(m, tag);
1524 
1525 	/*
1526 	 * Remove the UDP header (and possibly the non ESP marker)
1527 	 * IP header length is iphlen
1528 	 * Before:
1529 	 *   <--- off --->
1530 	 *   +----+------+-----+
1531 	 *   | IP |  UDP | ESP |
1532 	 *   +----+------+-----+
1533 	 *        <-skip->
1534 	 * After:
1535 	 *          +----+-----+
1536 	 *          | IP | ESP |
1537 	 *          +----+-----+
1538 	 *   <-skip->
1539 	 */
1540 	ovbcopy(data, data + skip, iphlen);
1541 	m_adj(m, skip);
1542 
1543 	ip = mtod(m, struct ip *);
1544 	ip->ip_len = htons(ntohs(ip->ip_len) - skip);
1545 	ip->ip_p = IPPROTO_ESP;
1546 
1547 	/*
1548 	 * We cannot yet update the cksums so clear any
1549 	 * h/w cksum flags as they are no longer valid.
1550 	 */
1551 	if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)
1552 		m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
1553 
1554 	(void) ipsec4_common_input(m, iphlen, ip->ip_p);
1555 	return (NULL);			/* NB: consumed, bypass processing. */
1556 }
1557 #endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */
1558 
1559 static void
1560 udp_abort(struct socket *so)
1561 {
1562 	struct inpcb *inp;
1563 	struct inpcbinfo *pcbinfo;
1564 
1565 	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
1566 	inp = sotoinpcb(so);
1567 	KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
1568 	INP_WLOCK(inp);
1569 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1570 		INP_HASH_WLOCK(pcbinfo);
1571 		in_pcbdisconnect(inp);
1572 		inp->inp_laddr.s_addr = INADDR_ANY;
1573 		INP_HASH_WUNLOCK(pcbinfo);
1574 		soisdisconnected(so);
1575 	}
1576 	INP_WUNLOCK(inp);
1577 }
1578 
1579 static int
1580 udp_attach(struct socket *so, int proto, struct thread *td)
1581 {
1582 	struct inpcb *inp;
1583 	struct inpcbinfo *pcbinfo;
1584 	int error;
1585 
1586 	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
1587 	inp = sotoinpcb(so);
1588 	KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
1589 	error = soreserve(so, udp_sendspace, udp_recvspace);
1590 	if (error)
1591 		return (error);
1592 	INP_INFO_WLOCK(pcbinfo);
1593 	error = in_pcballoc(so, pcbinfo);
1594 	if (error) {
1595 		INP_INFO_WUNLOCK(pcbinfo);
1596 		return (error);
1597 	}
1598 
1599 	inp = sotoinpcb(so);
1600 	inp->inp_vflag |= INP_IPV4;
1601 	inp->inp_ip_ttl = V_ip_defttl;
1602 
1603 	error = udp_newudpcb(inp);
1604 	if (error) {
1605 		in_pcbdetach(inp);
1606 		in_pcbfree(inp);
1607 		INP_INFO_WUNLOCK(pcbinfo);
1608 		return (error);
1609 	}
1610 
1611 	INP_WUNLOCK(inp);
1612 	INP_INFO_WUNLOCK(pcbinfo);
1613 	return (0);
1614 }
1615 #endif /* INET */
1616 
1617 int
1618 udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f)
1619 {
1620 	struct inpcb *inp;
1621 	struct udpcb *up;
1622 
1623 	KASSERT(so->so_type == SOCK_DGRAM,
1624 	    ("udp_set_kernel_tunneling: !dgram"));
1625 	inp = sotoinpcb(so);
1626 	KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
1627 	INP_WLOCK(inp);
1628 	up = intoudpcb(inp);
1629 	if (up->u_tun_func != NULL) {
1630 		INP_WUNLOCK(inp);
1631 		return (EBUSY);
1632 	}
1633 	up->u_tun_func = f;
1634 	INP_WUNLOCK(inp);
1635 	return (0);
1636 }
1637 
1638 #ifdef INET
1639 static int
1640 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
1641 {
1642 	struct inpcb *inp;
1643 	struct inpcbinfo *pcbinfo;
1644 	int error;
1645 
1646 	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
1647 	inp = sotoinpcb(so);
1648 	KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
1649 	INP_WLOCK(inp);
1650 	INP_HASH_WLOCK(pcbinfo);
1651 	error = in_pcbbind(inp, nam, td->td_ucred);
1652 	INP_HASH_WUNLOCK(pcbinfo);
1653 	INP_WUNLOCK(inp);
1654 	return (error);
1655 }
1656 
1657 static void
1658 udp_close(struct socket *so)
1659 {
1660 	struct inpcb *inp;
1661 	struct inpcbinfo *pcbinfo;
1662 
1663 	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
1664 	inp = sotoinpcb(so);
1665 	KASSERT(inp != NULL, ("udp_close: inp == NULL"));
1666 	INP_WLOCK(inp);
1667 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1668 		INP_HASH_WLOCK(pcbinfo);
1669 		in_pcbdisconnect(inp);
1670 		inp->inp_laddr.s_addr = INADDR_ANY;
1671 		INP_HASH_WUNLOCK(pcbinfo);
1672 		soisdisconnected(so);
1673 	}
1674 	INP_WUNLOCK(inp);
1675 }
1676 
1677 static int
1678 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1679 {
1680 	struct inpcb *inp;
1681 	struct inpcbinfo *pcbinfo;
1682 	struct sockaddr_in *sin;
1683 	int error;
1684 
1685 	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
1686 	inp = sotoinpcb(so);
1687 	KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
1688 	INP_WLOCK(inp);
1689 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1690 		INP_WUNLOCK(inp);
1691 		return (EISCONN);
1692 	}
1693 	sin = (struct sockaddr_in *)nam;
1694 	error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1695 	if (error != 0) {
1696 		INP_WUNLOCK(inp);
1697 		return (error);
1698 	}
1699 	INP_HASH_WLOCK(pcbinfo);
1700 	error = in_pcbconnect(inp, nam, td->td_ucred);
1701 	INP_HASH_WUNLOCK(pcbinfo);
1702 	if (error == 0)
1703 		soisconnected(so);
1704 	INP_WUNLOCK(inp);
1705 	return (error);
1706 }
1707 
1708 static void
1709 udp_detach(struct socket *so)
1710 {
1711 	struct inpcb *inp;
1712 	struct inpcbinfo *pcbinfo;
1713 	struct udpcb *up;
1714 
1715 	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
1716 	inp = sotoinpcb(so);
1717 	KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
1718 	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
1719 	    ("udp_detach: not disconnected"));
1720 	INP_INFO_WLOCK(pcbinfo);
1721 	INP_WLOCK(inp);
1722 	up = intoudpcb(inp);
1723 	KASSERT(up != NULL, ("%s: up == NULL", __func__));
1724 	inp->inp_ppcb = NULL;
1725 	in_pcbdetach(inp);
1726 	in_pcbfree(inp);
1727 	INP_INFO_WUNLOCK(pcbinfo);
1728 	udp_discardcb(up);
1729 }
1730 
1731 static int
1732 udp_disconnect(struct socket *so)
1733 {
1734 	struct inpcb *inp;
1735 	struct inpcbinfo *pcbinfo;
1736 
1737 	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
1738 	inp = sotoinpcb(so);
1739 	KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
1740 	INP_WLOCK(inp);
1741 	if (inp->inp_faddr.s_addr == INADDR_ANY) {
1742 		INP_WUNLOCK(inp);
1743 		return (ENOTCONN);
1744 	}
1745 	INP_HASH_WLOCK(pcbinfo);
1746 	in_pcbdisconnect(inp);
1747 	inp->inp_laddr.s_addr = INADDR_ANY;
1748 	INP_HASH_WUNLOCK(pcbinfo);
1749 	SOCK_LOCK(so);
1750 	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
1751 	SOCK_UNLOCK(so);
1752 	INP_WUNLOCK(inp);
1753 	return (0);
1754 }
1755 
1756 static int
1757 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
1758     struct mbuf *control, struct thread *td)
1759 {
1760 	struct inpcb *inp;
1761 
1762 	inp = sotoinpcb(so);
1763 	KASSERT(inp != NULL, ("udp_send: inp == NULL"));
1764 	return (udp_output(inp, m, addr, control, td));
1765 }
1766 #endif /* INET */
1767 
1768 int
1769 udp_shutdown(struct socket *so)
1770 {
1771 	struct inpcb *inp;
1772 
1773 	inp = sotoinpcb(so);
1774 	KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
1775 	INP_WLOCK(inp);
1776 	socantsendmore(so);
1777 	INP_WUNLOCK(inp);
1778 	return (0);
1779 }
1780 
1781 #ifdef INET
1782 struct pr_usrreqs udp_usrreqs = {
1783 	.pru_abort =		udp_abort,
1784 	.pru_attach =		udp_attach,
1785 	.pru_bind =		udp_bind,
1786 	.pru_connect =		udp_connect,
1787 	.pru_control =		in_control,
1788 	.pru_detach =		udp_detach,
1789 	.pru_disconnect =	udp_disconnect,
1790 	.pru_peeraddr =		in_getpeeraddr,
1791 	.pru_send =		udp_send,
1792 	.pru_soreceive =	soreceive_dgram,
1793 	.pru_sosend =		sosend_dgram,
1794 	.pru_shutdown =		udp_shutdown,
1795 	.pru_sockaddr =		in_getsockaddr,
1796 	.pru_sosetlabel =	in_pcbsosetlabel,
1797 	.pru_close =		udp_close,
1798 };
1799 #endif /* INET */
1800