xref: /freebsd/sys/netinet/udp_usrreq.c (revision 955c8cbb4960e6cf3602de144b1b9154a5092968)
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3  *	The Regents of the University of California.
4  * Copyright (c) 2008 Robert N. M. Watson
5  * Copyright (c) 2010-2011 Juniper Networks, Inc.
6  * All rights reserved.
7  *
8  * Portions of this software were developed by Robert N. M. Watson under
9  * contract to Juniper Networks, Inc.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
36  */
37 
38 #include <sys/cdefs.h>
39 __FBSDID("$FreeBSD$");
40 
41 #include "opt_ipfw.h"
42 #include "opt_inet.h"
43 #include "opt_inet6.h"
44 #include "opt_ipsec.h"
45 
46 #include <sys/param.h>
47 #include <sys/domain.h>
48 #include <sys/eventhandler.h>
49 #include <sys/jail.h>
50 #include <sys/kernel.h>
51 #include <sys/lock.h>
52 #include <sys/malloc.h>
53 #include <sys/mbuf.h>
54 #include <sys/priv.h>
55 #include <sys/proc.h>
56 #include <sys/protosw.h>
57 #include <sys/signalvar.h>
58 #include <sys/socket.h>
59 #include <sys/socketvar.h>
60 #include <sys/sx.h>
61 #include <sys/sysctl.h>
62 #include <sys/syslog.h>
63 #include <sys/systm.h>
64 
65 #include <vm/uma.h>
66 
67 #include <net/if.h>
68 #include <net/route.h>
69 
70 #include <netinet/in.h>
71 #include <netinet/in_pcb.h>
72 #include <netinet/in_systm.h>
73 #include <netinet/in_var.h>
74 #include <netinet/ip.h>
75 #ifdef INET6
76 #include <netinet/ip6.h>
77 #endif
78 #include <netinet/ip_icmp.h>
79 #include <netinet/icmp_var.h>
80 #include <netinet/ip_var.h>
81 #include <netinet/ip_options.h>
82 #ifdef INET6
83 #include <netinet6/ip6_var.h>
84 #endif
85 #include <netinet/udp.h>
86 #include <netinet/udp_var.h>
87 
88 #ifdef IPSEC
89 #include <netipsec/ipsec.h>
90 #include <netipsec/esp.h>
91 #endif
92 
93 #include <machine/in_cksum.h>
94 
95 #include <security/mac/mac_framework.h>
96 
97 /*
98  * UDP protocol implementation.
99  * Per RFC 768, August, 1980.
100  */
101 
102 /*
103  * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
104  * removes the only data integrity mechanism for packets and malformed
105  * packets that would otherwise be discarded due to bad checksums, and may
106  * cause problems (especially for NFS data blocks).
107  */
108 VNET_DEFINE(int, udp_cksum) = 1;
109 SYSCTL_VNET_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW,
110     &VNET_NAME(udp_cksum), 0, "compute udp checksum");
111 
112 int	udp_log_in_vain = 0;
113 SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
114     &udp_log_in_vain, 0, "Log all incoming UDP packets");
115 
116 VNET_DEFINE(int, udp_blackhole) = 0;
117 SYSCTL_VNET_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW,
118     &VNET_NAME(udp_blackhole), 0,
119     "Do not send port unreachables for refused connects");
120 
121 u_long	udp_sendspace = 9216;		/* really max datagram size */
122 					/* 40 1K datagrams */
123 SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
124     &udp_sendspace, 0, "Maximum outgoing UDP datagram size");
125 
126 u_long	udp_recvspace = 40 * (1024 +
127 #ifdef INET6
128 				      sizeof(struct sockaddr_in6)
129 #else
130 				      sizeof(struct sockaddr_in)
131 #endif
132 				      );
133 
134 SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
135     &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
136 
137 VNET_DEFINE(struct inpcbhead, udb);		/* from udp_var.h */
138 VNET_DEFINE(struct inpcbinfo, udbinfo);
139 static VNET_DEFINE(uma_zone_t, udpcb_zone);
140 #define	V_udpcb_zone			VNET(udpcb_zone)
141 
142 #ifndef UDBHASHSIZE
143 #define	UDBHASHSIZE	128
144 #endif
145 
146 VNET_DEFINE(struct udpstat, udpstat);		/* from udp_var.h */
147 SYSCTL_VNET_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW,
148     &VNET_NAME(udpstat), udpstat,
149     "UDP statistics (struct udpstat, netinet/udp_var.h)");
150 
151 #ifdef INET
152 static void	udp_detach(struct socket *so);
153 static int	udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
154 		    struct mbuf *, struct thread *);
155 #endif
156 
157 #ifdef IPSEC
158 #ifdef IPSEC_NAT_T
159 #define	UF_ESPINUDP_ALL	(UF_ESPINUDP_NON_IKE|UF_ESPINUDP)
160 #ifdef INET
161 static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int);
162 #endif
163 #endif /* IPSEC_NAT_T */
164 #endif /* IPSEC */
165 
166 static void
167 udp_zone_change(void *tag)
168 {
169 
170 	uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
171 	uma_zone_set_max(V_udpcb_zone, maxsockets);
172 }
173 
/*
 * Item init callback handed to in_pcbinfo_init() by udp_init(): initializes
 * the lock of the inpcb stored at 'mem'.  'size' and 'flags' are not used.
 * Always returns 0.
 */
static int
udp_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *pcb = mem;

	INP_LOCK_INIT(pcb, "inp", "udpinp");
	return (0);
}
183 
184 void
185 udp_init(void)
186 {
187 
188 	in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
189 	    "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE,
190 	    IPI_HASHFIELDS_2TUPLE);
191 	V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
192 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
193 	uma_zone_set_max(V_udpcb_zone, maxsockets);
194 	uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
195 	EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
196 	    EVENTHANDLER_PRI_ANY);
197 }
198 
199 /*
200  * Kernel module interface for updating udpstat.  The argument is an index
201  * into udpstat treated as an array of u_long.  While this encodes the
202  * general layout of udpstat into the caller, it doesn't encode its location,
203  * so that future changes to add, for example, per-CPU stats support won't
204  * cause binary compatibility problems for kernel modules.
205  */
206 void
207 kmod_udpstat_inc(int statnum)
208 {
209 
210 	(*((u_long *)&V_udpstat + statnum))++;
211 }
212 
213 int
214 udp_newudpcb(struct inpcb *inp)
215 {
216 	struct udpcb *up;
217 
218 	up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
219 	if (up == NULL)
220 		return (ENOBUFS);
221 	inp->inp_ppcb = up;
222 	return (0);
223 }
224 
225 void
226 udp_discardcb(struct udpcb *up)
227 {
228 
229 	uma_zfree(V_udpcb_zone, up);
230 }
231 
232 #ifdef VIMAGE
233 void
234 udp_destroy(void)
235 {
236 
237 	in_pcbinfo_destroy(&V_udbinfo);
238 	uma_zdestroy(V_udpcb_zone);
239 }
240 #endif
241 
242 #ifdef INET
243 /*
244  * Subroutine of udp_input(), which appends the provided mbuf chain to the
245  * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
246  * contains the source address.  If the socket ends up being an IPv6 socket,
247  * udp_append() will convert to a sockaddr_in6 before passing the address
248  * into the socket code.
249  */
250 static void
251 udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
252     struct sockaddr_in *udp_in)
253 {
254 	struct sockaddr *append_sa;
255 	struct socket *so;
256 	struct mbuf *opts = 0;
257 #ifdef INET6
258 	struct sockaddr_in6 udp_in6;
259 #endif
260 	struct udpcb *up;
261 
262 	INP_LOCK_ASSERT(inp);
263 
264 	/*
265 	 * Engage the tunneling protocol.
266 	 */
267 	up = intoudpcb(inp);
268 	if (up->u_tun_func != NULL) {
269 		(*up->u_tun_func)(n, off, inp);
270 		return;
271 	}
272 
273 	if (n == NULL)
274 		return;
275 
276 	off += sizeof(struct udphdr);
277 
278 #ifdef IPSEC
279 	/* Check AH/ESP integrity. */
280 	if (ipsec4_in_reject(n, inp)) {
281 		m_freem(n);
282 		V_ipsec4stat.in_polvio++;
283 		return;
284 	}
285 #ifdef IPSEC_NAT_T
286 	up = intoudpcb(inp);
287 	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
288 	if (up->u_flags & UF_ESPINUDP_ALL) {	/* IPSec UDP encaps. */
289 		n = udp4_espdecap(inp, n, off);
290 		if (n == NULL)				/* Consumed. */
291 			return;
292 	}
293 #endif /* IPSEC_NAT_T */
294 #endif /* IPSEC */
295 #ifdef MAC
296 	if (mac_inpcb_check_deliver(inp, n) != 0) {
297 		m_freem(n);
298 		return;
299 	}
300 #endif /* MAC */
301 	if (inp->inp_flags & INP_CONTROLOPTS ||
302 	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
303 #ifdef INET6
304 		if (inp->inp_vflag & INP_IPV6)
305 			(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
306 		else
307 #endif /* INET6 */
308 			ip_savecontrol(inp, &opts, ip, n);
309 	}
310 #ifdef INET6
311 	if (inp->inp_vflag & INP_IPV6) {
312 		bzero(&udp_in6, sizeof(udp_in6));
313 		udp_in6.sin6_len = sizeof(udp_in6);
314 		udp_in6.sin6_family = AF_INET6;
315 		in6_sin_2_v4mapsin6(udp_in, &udp_in6);
316 		append_sa = (struct sockaddr *)&udp_in6;
317 	} else
318 #endif /* INET6 */
319 		append_sa = (struct sockaddr *)udp_in;
320 	m_adj(n, off);
321 
322 	so = inp->inp_socket;
323 	SOCKBUF_LOCK(&so->so_rcv);
324 	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
325 		SOCKBUF_UNLOCK(&so->so_rcv);
326 		m_freem(n);
327 		if (opts)
328 			m_freem(opts);
329 		UDPSTAT_INC(udps_fullsock);
330 	} else
331 		sorwakeup_locked(so);
332 }
333 
334 void
335 udp_input(struct mbuf *m, int off)
336 {
337 	int iphlen = off;
338 	struct ip *ip;
339 	struct udphdr *uh;
340 	struct ifnet *ifp;
341 	struct inpcb *inp;
342 	uint16_t len, ip_len;
343 	struct ip save_ip;
344 	struct sockaddr_in udp_in;
345 	struct m_tag *fwd_tag;
346 
347 	ifp = m->m_pkthdr.rcvif;
348 	UDPSTAT_INC(udps_ipackets);
349 
350 	/*
351 	 * Strip IP options, if any; should skip this, make available to
352 	 * user, and use on returned packets, but we don't yet have a way to
353 	 * check the checksum with options still present.
354 	 */
355 	if (iphlen > sizeof (struct ip)) {
356 		ip_stripoptions(m);
357 		iphlen = sizeof(struct ip);
358 	}
359 
360 	/*
361 	 * Get IP and UDP header together in first mbuf.
362 	 */
363 	ip = mtod(m, struct ip *);
364 	if (m->m_len < iphlen + sizeof(struct udphdr)) {
365 		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) {
366 			UDPSTAT_INC(udps_hdrops);
367 			return;
368 		}
369 		ip = mtod(m, struct ip *);
370 	}
371 	uh = (struct udphdr *)((caddr_t)ip + iphlen);
372 
373 	/*
374 	 * Destination port of 0 is illegal, based on RFC768.
375 	 */
376 	if (uh->uh_dport == 0)
377 		goto badunlocked;
378 
379 	/*
380 	 * Construct sockaddr format source address.  Stuff source address
381 	 * and datagram in user buffer.
382 	 */
383 	bzero(&udp_in, sizeof(udp_in));
384 	udp_in.sin_len = sizeof(udp_in);
385 	udp_in.sin_family = AF_INET;
386 	udp_in.sin_port = uh->uh_sport;
387 	udp_in.sin_addr = ip->ip_src;
388 
389 	/*
390 	 * Make mbuf data length reflect UDP length.  If not enough data to
391 	 * reflect UDP length, drop.
392 	 */
393 	len = ntohs((u_short)uh->uh_ulen);
394 	ip_len = ntohs(ip->ip_len) - iphlen;
395 	if (ip_len != len) {
396 		if (len > ip_len || len < sizeof(struct udphdr)) {
397 			UDPSTAT_INC(udps_badlen);
398 			goto badunlocked;
399 		}
400 		m_adj(m, len - ip_len);
401 	}
402 
403 	/*
404 	 * Save a copy of the IP header in case we want restore it for
405 	 * sending an ICMP error message in response.
406 	 */
407 	if (!V_udp_blackhole)
408 		save_ip = *ip;
409 	else
410 		memset(&save_ip, 0, sizeof(save_ip));
411 
412 	/*
413 	 * Checksum extended UDP header and data.
414 	 */
415 	if (uh->uh_sum) {
416 		u_short uh_sum;
417 
418 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
419 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
420 				uh_sum = m->m_pkthdr.csum_data;
421 			else
422 				uh_sum = in_pseudo(ip->ip_src.s_addr,
423 				    ip->ip_dst.s_addr, htonl((u_short)len +
424 				    m->m_pkthdr.csum_data + IPPROTO_UDP));
425 			uh_sum ^= 0xffff;
426 		} else {
427 			char b[9];
428 
429 			bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
430 			bzero(((struct ipovly *)ip)->ih_x1, 9);
431 			((struct ipovly *)ip)->ih_len = uh->uh_ulen;
432 			uh_sum = in_cksum(m, len + sizeof (struct ip));
433 			bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
434 		}
435 		if (uh_sum) {
436 			UDPSTAT_INC(udps_badsum);
437 			m_freem(m);
438 			return;
439 		}
440 	} else
441 		UDPSTAT_INC(udps_nosum);
442 
443 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
444 	    in_broadcast(ip->ip_dst, ifp)) {
445 		struct inpcb *last;
446 		struct ip_moptions *imo;
447 
448 		INP_INFO_RLOCK(&V_udbinfo);
449 		last = NULL;
450 		LIST_FOREACH(inp, &V_udb, inp_list) {
451 			if (inp->inp_lport != uh->uh_dport)
452 				continue;
453 #ifdef INET6
454 			if ((inp->inp_vflag & INP_IPV4) == 0)
455 				continue;
456 #endif
457 			if (inp->inp_laddr.s_addr != INADDR_ANY &&
458 			    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
459 				continue;
460 			if (inp->inp_faddr.s_addr != INADDR_ANY &&
461 			    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
462 				continue;
463 			if (inp->inp_fport != 0 &&
464 			    inp->inp_fport != uh->uh_sport)
465 				continue;
466 
467 			INP_RLOCK(inp);
468 
469 			/*
470 			 * XXXRW: Because we weren't holding either the inpcb
471 			 * or the hash lock when we checked for a match
472 			 * before, we should probably recheck now that the
473 			 * inpcb lock is held.
474 			 */
475 
476 			/*
477 			 * Handle socket delivery policy for any-source
478 			 * and source-specific multicast. [RFC3678]
479 			 */
480 			imo = inp->inp_moptions;
481 			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
482 				struct sockaddr_in	 group;
483 				int			 blocked;
484 				if (imo == NULL) {
485 					INP_RUNLOCK(inp);
486 					continue;
487 				}
488 				bzero(&group, sizeof(struct sockaddr_in));
489 				group.sin_len = sizeof(struct sockaddr_in);
490 				group.sin_family = AF_INET;
491 				group.sin_addr = ip->ip_dst;
492 
493 				blocked = imo_multi_filter(imo, ifp,
494 					(struct sockaddr *)&group,
495 					(struct sockaddr *)&udp_in);
496 				if (blocked != MCAST_PASS) {
497 					if (blocked == MCAST_NOTGMEMBER)
498 						IPSTAT_INC(ips_notmember);
499 					if (blocked == MCAST_NOTSMEMBER ||
500 					    blocked == MCAST_MUTED)
501 						UDPSTAT_INC(udps_filtermcast);
502 					INP_RUNLOCK(inp);
503 					continue;
504 				}
505 			}
506 			if (last != NULL) {
507 				struct mbuf *n;
508 
509 				n = m_copy(m, 0, M_COPYALL);
510 				udp_append(last, ip, n, iphlen, &udp_in);
511 				INP_RUNLOCK(last);
512 			}
513 			last = inp;
514 			/*
515 			 * Don't look for additional matches if this one does
516 			 * not have either the SO_REUSEPORT or SO_REUSEADDR
517 			 * socket options set.  This heuristic avoids
518 			 * searching through all pcbs in the common case of a
519 			 * non-shared port.  It assumes that an application
520 			 * will never clear these options after setting them.
521 			 */
522 			if ((last->inp_socket->so_options &
523 			    (SO_REUSEPORT|SO_REUSEADDR)) == 0)
524 				break;
525 		}
526 
527 		if (last == NULL) {
528 			/*
529 			 * No matching pcb found; discard datagram.  (No need
530 			 * to send an ICMP Port Unreachable for a broadcast
531 			 * or multicast datgram.)
532 			 */
533 			UDPSTAT_INC(udps_noportbcast);
534 			if (inp)
535 				INP_RUNLOCK(inp);
536 			INP_INFO_RUNLOCK(&V_udbinfo);
537 			goto badunlocked;
538 		}
539 		udp_append(last, ip, m, iphlen, &udp_in);
540 		INP_RUNLOCK(last);
541 		INP_INFO_RUNLOCK(&V_udbinfo);
542 		return;
543 	}
544 
545 	/*
546 	 * Locate pcb for datagram.
547 	 */
548 
549 	/*
550 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
551 	 */
552 	if ((m->m_flags & M_IP_NEXTHOP) &&
553 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
554 		struct sockaddr_in *next_hop;
555 
556 		next_hop = (struct sockaddr_in *)(fwd_tag + 1);
557 
558 		/*
559 		 * Transparently forwarded. Pretend to be the destination.
560 		 * Already got one like this?
561 		 */
562 		inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport,
563 		    ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
564 		if (!inp) {
565 			/*
566 			 * It's new.  Try to find the ambushing socket.
567 			 * Because we've rewritten the destination address,
568 			 * any hardware-generated hash is ignored.
569 			 */
570 			inp = in_pcblookup(&V_udbinfo, ip->ip_src,
571 			    uh->uh_sport, next_hop->sin_addr,
572 			    next_hop->sin_port ? htons(next_hop->sin_port) :
573 			    uh->uh_dport, INPLOOKUP_WILDCARD |
574 			    INPLOOKUP_RLOCKPCB, ifp);
575 		}
576 		/* Remove the tag from the packet. We don't need it anymore. */
577 		m_tag_delete(m, fwd_tag);
578 		m->m_flags &= ~M_IP_NEXTHOP;
579 	} else
580 		inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport,
581 		    ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
582 		    INPLOOKUP_RLOCKPCB, ifp, m);
583 	if (inp == NULL) {
584 		if (udp_log_in_vain) {
585 			char buf[4*sizeof "123"];
586 
587 			strcpy(buf, inet_ntoa(ip->ip_dst));
588 			log(LOG_INFO,
589 			    "Connection attempt to UDP %s:%d from %s:%d\n",
590 			    buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
591 			    ntohs(uh->uh_sport));
592 		}
593 		UDPSTAT_INC(udps_noport);
594 		if (m->m_flags & (M_BCAST | M_MCAST)) {
595 			UDPSTAT_INC(udps_noportbcast);
596 			goto badunlocked;
597 		}
598 		if (V_udp_blackhole)
599 			goto badunlocked;
600 		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
601 			goto badunlocked;
602 		*ip = save_ip;
603 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
604 		return;
605 	}
606 
607 	/*
608 	 * Check the minimum TTL for socket.
609 	 */
610 	INP_RLOCK_ASSERT(inp);
611 	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
612 		INP_RUNLOCK(inp);
613 		m_freem(m);
614 		return;
615 	}
616 	udp_append(inp, ip, m, iphlen, &udp_in);
617 	INP_RUNLOCK(inp);
618 	return;
619 
620 badunlocked:
621 	m_freem(m);
622 }
623 #endif /* INET */
624 
625 /*
626  * Notify a udp user of an asynchronous error; just wake up so that they can
627  * collect error status.
628  */
629 struct inpcb *
630 udp_notify(struct inpcb *inp, int errno)
631 {
632 
633 	/*
634 	 * While udp_ctlinput() always calls udp_notify() with a read lock
635 	 * when invoking it directly, in_pcbnotifyall() currently uses write
636 	 * locks due to sharing code with TCP.  For now, accept either a read
637 	 * or a write lock, but a read lock is sufficient.
638 	 */
639 	INP_LOCK_ASSERT(inp);
640 
641 	inp->inp_socket->so_error = errno;
642 	sorwakeup(inp->inp_socket);
643 	sowwakeup(inp->inp_socket);
644 	return (inp);
645 }
646 
647 #ifdef INET
648 void
649 udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
650 {
651 	struct ip *ip = vip;
652 	struct udphdr *uh;
653 	struct in_addr faddr;
654 	struct inpcb *inp;
655 
656 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
657 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
658 		return;
659 
660 	/*
661 	 * Redirects don't need to be handled up here.
662 	 */
663 	if (PRC_IS_REDIRECT(cmd))
664 		return;
665 
666 	/*
667 	 * Hostdead is ugly because it goes linearly through all PCBs.
668 	 *
669 	 * XXX: We never get this from ICMP, otherwise it makes an excellent
670 	 * DoS attack on machines with many connections.
671 	 */
672 	if (cmd == PRC_HOSTDEAD)
673 		ip = NULL;
674 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
675 		return;
676 	if (ip != NULL) {
677 		uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
678 		inp = in_pcblookup(&V_udbinfo, faddr, uh->uh_dport,
679 		    ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL);
680 		if (inp != NULL) {
681 			INP_RLOCK_ASSERT(inp);
682 			if (inp->inp_socket != NULL) {
683 				udp_notify(inp, inetctlerrmap[cmd]);
684 			}
685 			INP_RUNLOCK(inp);
686 		}
687 	} else
688 		in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd],
689 		    udp_notify);
690 }
691 #endif /* INET */
692 
693 static int
694 udp_pcblist(SYSCTL_HANDLER_ARGS)
695 {
696 	int error, i, n;
697 	struct inpcb *inp, **inp_list;
698 	inp_gen_t gencnt;
699 	struct xinpgen xig;
700 
701 	/*
702 	 * The process of preparing the PCB list is too time-consuming and
703 	 * resource-intensive to repeat twice on every request.
704 	 */
705 	if (req->oldptr == 0) {
706 		n = V_udbinfo.ipi_count;
707 		n += imax(n / 8, 10);
708 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
709 		return (0);
710 	}
711 
712 	if (req->newptr != 0)
713 		return (EPERM);
714 
715 	/*
716 	 * OK, now we're committed to doing something.
717 	 */
718 	INP_INFO_RLOCK(&V_udbinfo);
719 	gencnt = V_udbinfo.ipi_gencnt;
720 	n = V_udbinfo.ipi_count;
721 	INP_INFO_RUNLOCK(&V_udbinfo);
722 
723 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
724 		+ n * sizeof(struct xinpcb));
725 	if (error != 0)
726 		return (error);
727 
728 	xig.xig_len = sizeof xig;
729 	xig.xig_count = n;
730 	xig.xig_gen = gencnt;
731 	xig.xig_sogen = so_gencnt;
732 	error = SYSCTL_OUT(req, &xig, sizeof xig);
733 	if (error)
734 		return (error);
735 
736 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
737 	if (inp_list == 0)
738 		return (ENOMEM);
739 
740 	INP_INFO_RLOCK(&V_udbinfo);
741 	for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n;
742 	     inp = LIST_NEXT(inp, inp_list)) {
743 		INP_WLOCK(inp);
744 		if (inp->inp_gencnt <= gencnt &&
745 		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
746 			in_pcbref(inp);
747 			inp_list[i++] = inp;
748 		}
749 		INP_WUNLOCK(inp);
750 	}
751 	INP_INFO_RUNLOCK(&V_udbinfo);
752 	n = i;
753 
754 	error = 0;
755 	for (i = 0; i < n; i++) {
756 		inp = inp_list[i];
757 		INP_RLOCK(inp);
758 		if (inp->inp_gencnt <= gencnt) {
759 			struct xinpcb xi;
760 
761 			bzero(&xi, sizeof(xi));
762 			xi.xi_len = sizeof xi;
763 			/* XXX should avoid extra copy */
764 			bcopy(inp, &xi.xi_inp, sizeof *inp);
765 			if (inp->inp_socket)
766 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
767 			xi.xi_inp.inp_gencnt = inp->inp_gencnt;
768 			INP_RUNLOCK(inp);
769 			error = SYSCTL_OUT(req, &xi, sizeof xi);
770 		} else
771 			INP_RUNLOCK(inp);
772 	}
773 	INP_INFO_WLOCK(&V_udbinfo);
774 	for (i = 0; i < n; i++) {
775 		inp = inp_list[i];
776 		INP_RLOCK(inp);
777 		if (!in_pcbrele_rlocked(inp))
778 			INP_RUNLOCK(inp);
779 	}
780 	INP_INFO_WUNLOCK(&V_udbinfo);
781 
782 	if (!error) {
783 		/*
784 		 * Give the user an updated idea of our state.  If the
785 		 * generation differs from what we told her before, she knows
786 		 * that something happened while we were processing this
787 		 * request, and it might be necessary to retry.
788 		 */
789 		INP_INFO_RLOCK(&V_udbinfo);
790 		xig.xig_gen = V_udbinfo.ipi_gencnt;
791 		xig.xig_sogen = so_gencnt;
792 		xig.xig_count = V_udbinfo.ipi_count;
793 		INP_INFO_RUNLOCK(&V_udbinfo);
794 		error = SYSCTL_OUT(req, &xig, sizeof xig);
795 	}
796 	free(inp_list, M_TEMP);
797 	return (error);
798 }
799 
800 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
801     CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
802     udp_pcblist, "S,xinpcb", "List of active UDP sockets");
803 
804 #ifdef INET
805 static int
806 udp_getcred(SYSCTL_HANDLER_ARGS)
807 {
808 	struct xucred xuc;
809 	struct sockaddr_in addrs[2];
810 	struct inpcb *inp;
811 	int error;
812 
813 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
814 	if (error)
815 		return (error);
816 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
817 	if (error)
818 		return (error);
819 	inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
820 	    addrs[0].sin_addr, addrs[0].sin_port,
821 	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
822 	if (inp != NULL) {
823 		INP_RLOCK_ASSERT(inp);
824 		if (inp->inp_socket == NULL)
825 			error = ENOENT;
826 		if (error == 0)
827 			error = cr_canseeinpcb(req->td->td_ucred, inp);
828 		if (error == 0)
829 			cru2x(inp->inp_cred, &xuc);
830 		INP_RUNLOCK(inp);
831 	} else
832 		error = ENOENT;
833 	if (error == 0)
834 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
835 	return (error);
836 }
837 
838 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
839     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
840     udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
841 #endif /* INET */
842 
843 int
844 udp_ctloutput(struct socket *so, struct sockopt *sopt)
845 {
846 	int error = 0, optval;
847 	struct inpcb *inp;
848 #ifdef IPSEC_NAT_T
849 	struct udpcb *up;
850 #endif
851 
852 	inp = sotoinpcb(so);
853 	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
854 	INP_WLOCK(inp);
855 	if (sopt->sopt_level != IPPROTO_UDP) {
856 #ifdef INET6
857 		if (INP_CHECK_SOCKAF(so, AF_INET6)) {
858 			INP_WUNLOCK(inp);
859 			error = ip6_ctloutput(so, sopt);
860 		}
861 #endif
862 #if defined(INET) && defined(INET6)
863 		else
864 #endif
865 #ifdef INET
866 		{
867 			INP_WUNLOCK(inp);
868 			error = ip_ctloutput(so, sopt);
869 		}
870 #endif
871 		return (error);
872 	}
873 
874 	switch (sopt->sopt_dir) {
875 	case SOPT_SET:
876 		switch (sopt->sopt_name) {
877 		case UDP_ENCAP:
878 			INP_WUNLOCK(inp);
879 			error = sooptcopyin(sopt, &optval, sizeof optval,
880 					    sizeof optval);
881 			if (error)
882 				break;
883 			inp = sotoinpcb(so);
884 			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
885 			INP_WLOCK(inp);
886 #ifdef IPSEC_NAT_T
887 			up = intoudpcb(inp);
888 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
889 #endif
890 			switch (optval) {
891 			case 0:
892 				/* Clear all UDP encap. */
893 #ifdef IPSEC_NAT_T
894 				up->u_flags &= ~UF_ESPINUDP_ALL;
895 #endif
896 				break;
897 #ifdef IPSEC_NAT_T
898 			case UDP_ENCAP_ESPINUDP:
899 			case UDP_ENCAP_ESPINUDP_NON_IKE:
900 				up->u_flags &= ~UF_ESPINUDP_ALL;
901 				if (optval == UDP_ENCAP_ESPINUDP)
902 					up->u_flags |= UF_ESPINUDP;
903 				else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE)
904 					up->u_flags |= UF_ESPINUDP_NON_IKE;
905 				break;
906 #endif
907 			default:
908 				error = EINVAL;
909 				break;
910 			}
911 			INP_WUNLOCK(inp);
912 			break;
913 		default:
914 			INP_WUNLOCK(inp);
915 			error = ENOPROTOOPT;
916 			break;
917 		}
918 		break;
919 	case SOPT_GET:
920 		switch (sopt->sopt_name) {
921 #ifdef IPSEC_NAT_T
922 		case UDP_ENCAP:
923 			up = intoudpcb(inp);
924 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
925 			optval = up->u_flags & UF_ESPINUDP_ALL;
926 			INP_WUNLOCK(inp);
927 			error = sooptcopyout(sopt, &optval, sizeof optval);
928 			break;
929 #endif
930 		default:
931 			INP_WUNLOCK(inp);
932 			error = ENOPROTOOPT;
933 			break;
934 		}
935 		break;
936 	}
937 	return (error);
938 }
939 
940 #ifdef INET
941 #define	UH_WLOCKED	2
942 #define	UH_RLOCKED	1
943 #define	UH_UNLOCKED	0
944 static int
945 udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
946     struct mbuf *control, struct thread *td)
947 {
948 	struct udpiphdr *ui;
949 	int len = m->m_pkthdr.len;
950 	struct in_addr faddr, laddr;
951 	struct cmsghdr *cm;
952 	struct sockaddr_in *sin, src;
953 	int error = 0;
954 	int ipflags;
955 	u_short fport, lport;
956 	int unlock_udbinfo;
957 	u_char tos;
958 
959 	/*
960 	 * udp_output() may need to temporarily bind or connect the current
961 	 * inpcb.  As such, we don't know up front whether we will need the
962 	 * pcbinfo lock or not.  Do any work to decide what is needed up
963 	 * front before acquiring any locks.
964 	 */
965 	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
966 		if (control)
967 			m_freem(control);
968 		m_freem(m);
969 		return (EMSGSIZE);
970 	}
971 
972 	src.sin_family = 0;
973 	INP_RLOCK(inp);
974 	tos = inp->inp_ip_tos;
975 	if (control != NULL) {
976 		/*
977 		 * XXX: Currently, we assume all the optional information is
978 		 * stored in a single mbuf.
979 		 */
980 		if (control->m_next) {
981 			INP_RUNLOCK(inp);
982 			m_freem(control);
983 			m_freem(m);
984 			return (EINVAL);
985 		}
986 		for (; control->m_len > 0;
987 		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
988 		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
989 			cm = mtod(control, struct cmsghdr *);
990 			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
991 			    || cm->cmsg_len > control->m_len) {
992 				error = EINVAL;
993 				break;
994 			}
995 			if (cm->cmsg_level != IPPROTO_IP)
996 				continue;
997 
998 			switch (cm->cmsg_type) {
999 			case IP_SENDSRCADDR:
1000 				if (cm->cmsg_len !=
1001 				    CMSG_LEN(sizeof(struct in_addr))) {
1002 					error = EINVAL;
1003 					break;
1004 				}
1005 				bzero(&src, sizeof(src));
1006 				src.sin_family = AF_INET;
1007 				src.sin_len = sizeof(src);
1008 				src.sin_port = inp->inp_lport;
1009 				src.sin_addr =
1010 				    *(struct in_addr *)CMSG_DATA(cm);
1011 				break;
1012 
1013 			case IP_TOS:
1014 				if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
1015 					error = EINVAL;
1016 					break;
1017 				}
1018 				tos = *(u_char *)CMSG_DATA(cm);
1019 				break;
1020 
1021 			default:
1022 				error = ENOPROTOOPT;
1023 				break;
1024 			}
1025 			if (error)
1026 				break;
1027 		}
1028 		m_freem(control);
1029 	}
1030 	if (error) {
1031 		INP_RUNLOCK(inp);
1032 		m_freem(m);
1033 		return (error);
1034 	}
1035 
1036 	/*
1037 	 * Depending on whether or not the application has bound or connected
1038 	 * the socket, we may have to do varying levels of work.  The optimal
1039 	 * case is for a connected UDP socket, as a global lock isn't
1040 	 * required at all.
1041 	 *
1042 	 * In order to decide which we need, we require stability of the
1043 	 * inpcb binding, which we ensure by acquiring a read lock on the
1044 	 * inpcb.  This doesn't strictly follow the lock order, so we play
1045 	 * the trylock and retry game; note that we may end up with more
1046 	 * conservative locks than required the second time around, so later
1047 	 * assertions have to accept that.  Further analysis of the number of
1048 	 * misses under contention is required.
1049 	 *
1050 	 * XXXRW: Check that hash locking update here is correct.
1051 	 */
1052 	sin = (struct sockaddr_in *)addr;
1053 	if (sin != NULL &&
1054 	    (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
1055 		INP_RUNLOCK(inp);
1056 		INP_WLOCK(inp);
1057 		INP_HASH_WLOCK(&V_udbinfo);
1058 		unlock_udbinfo = UH_WLOCKED;
1059 	} else if ((sin != NULL && (
1060 	    (sin->sin_addr.s_addr == INADDR_ANY) ||
1061 	    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
1062 	    (inp->inp_laddr.s_addr == INADDR_ANY) ||
1063 	    (inp->inp_lport == 0))) ||
1064 	    (src.sin_family == AF_INET)) {
1065 		INP_HASH_RLOCK(&V_udbinfo);
1066 		unlock_udbinfo = UH_RLOCKED;
1067 	} else
1068 		unlock_udbinfo = UH_UNLOCKED;
1069 
1070 	/*
1071 	 * If the IP_SENDSRCADDR control message was specified, override the
1072 	 * source address for this datagram.  Its use is invalidated if the
1073 	 * address thus specified is incomplete or clobbers other inpcbs.
1074 	 */
1075 	laddr = inp->inp_laddr;
1076 	lport = inp->inp_lport;
1077 	if (src.sin_family == AF_INET) {
1078 		INP_HASH_LOCK_ASSERT(&V_udbinfo);
1079 		if ((lport == 0) ||
1080 		    (laddr.s_addr == INADDR_ANY &&
1081 		     src.sin_addr.s_addr == INADDR_ANY)) {
1082 			error = EINVAL;
1083 			goto release;
1084 		}
1085 		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
1086 		    &laddr.s_addr, &lport, td->td_ucred);
1087 		if (error)
1088 			goto release;
1089 	}
1090 
1091 	/*
1092 	 * If a UDP socket has been connected, then a local address/port will
1093 	 * have been selected and bound.
1094 	 *
1095 	 * If a UDP socket has not been connected to, then an explicit
1096 	 * destination address must be used, in which case a local
1097 	 * address/port may not have been selected and bound.
1098 	 */
1099 	if (sin != NULL) {
1100 		INP_LOCK_ASSERT(inp);
1101 		if (inp->inp_faddr.s_addr != INADDR_ANY) {
1102 			error = EISCONN;
1103 			goto release;
1104 		}
1105 
1106 		/*
1107 		 * Jail may rewrite the destination address, so let it do
1108 		 * that before we use it.
1109 		 */
1110 		error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1111 		if (error)
1112 			goto release;
1113 
1114 		/*
1115 		 * If a local address or port hasn't yet been selected, or if
1116 		 * the destination address needs to be rewritten due to using
1117 		 * a special INADDR_ constant, invoke in_pcbconnect_setup()
1118 		 * to do the heavy lifting.  Once a port is selected, we
1119 		 * commit the binding back to the socket; we also commit the
1120 		 * binding of the address if in jail.
1121 		 *
1122 		 * If we already have a valid binding and we're not
1123 		 * requesting a destination address rewrite, use a fast path.
1124 		 */
1125 		if (inp->inp_laddr.s_addr == INADDR_ANY ||
1126 		    inp->inp_lport == 0 ||
1127 		    sin->sin_addr.s_addr == INADDR_ANY ||
1128 		    sin->sin_addr.s_addr == INADDR_BROADCAST) {
1129 			INP_HASH_LOCK_ASSERT(&V_udbinfo);
1130 			error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
1131 			    &lport, &faddr.s_addr, &fport, NULL,
1132 			    td->td_ucred);
1133 			if (error)
1134 				goto release;
1135 
1136 			/*
1137 			 * XXXRW: Why not commit the port if the address is
1138 			 * !INADDR_ANY?
1139 			 */
1140 			/* Commit the local port if newly assigned. */
1141 			if (inp->inp_laddr.s_addr == INADDR_ANY &&
1142 			    inp->inp_lport == 0) {
1143 				INP_WLOCK_ASSERT(inp);
1144 				INP_HASH_WLOCK_ASSERT(&V_udbinfo);
1145 				/*
1146 				 * Remember addr if jailed, to prevent
1147 				 * rebinding.
1148 				 */
1149 				if (prison_flag(td->td_ucred, PR_IP4))
1150 					inp->inp_laddr = laddr;
1151 				inp->inp_lport = lport;
1152 				if (in_pcbinshash(inp) != 0) {
1153 					inp->inp_lport = 0;
1154 					error = EAGAIN;
1155 					goto release;
1156 				}
1157 				inp->inp_flags |= INP_ANONPORT;
1158 			}
1159 		} else {
1160 			faddr = sin->sin_addr;
1161 			fport = sin->sin_port;
1162 		}
1163 	} else {
1164 		INP_LOCK_ASSERT(inp);
1165 		faddr = inp->inp_faddr;
1166 		fport = inp->inp_fport;
1167 		if (faddr.s_addr == INADDR_ANY) {
1168 			error = ENOTCONN;
1169 			goto release;
1170 		}
1171 	}
1172 
1173 	/*
1174 	 * Calculate data length and get a mbuf for UDP, IP, and possible
1175 	 * link-layer headers.  Immediately slide the data pointer back forward
1176 	 * since we won't use that space at this layer.
1177 	 */
1178 	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
1179 	if (m == NULL) {
1180 		error = ENOBUFS;
1181 		goto release;
1182 	}
1183 	m->m_data += max_linkhdr;
1184 	m->m_len -= max_linkhdr;
1185 	m->m_pkthdr.len -= max_linkhdr;
1186 
1187 	/*
1188 	 * Fill in mbuf with extended UDP header and addresses and length put
1189 	 * into network format.
1190 	 */
1191 	ui = mtod(m, struct udpiphdr *);
1192 	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
1193 	ui->ui_pr = IPPROTO_UDP;
1194 	ui->ui_src = laddr;
1195 	ui->ui_dst = faddr;
1196 	ui->ui_sport = lport;
1197 	ui->ui_dport = fport;
1198 	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
1199 
1200 	/*
1201 	 * Set the Don't Fragment bit in the IP header.
1202 	 */
1203 	if (inp->inp_flags & INP_DONTFRAG) {
1204 		struct ip *ip;
1205 
1206 		ip = (struct ip *)&ui->ui_i;
1207 		ip->ip_off |= htons(IP_DF);
1208 	}
1209 
1210 	ipflags = 0;
1211 	if (inp->inp_socket->so_options & SO_DONTROUTE)
1212 		ipflags |= IP_ROUTETOIF;
1213 	if (inp->inp_socket->so_options & SO_BROADCAST)
1214 		ipflags |= IP_ALLOWBROADCAST;
1215 	if (inp->inp_flags & INP_ONESBCAST)
1216 		ipflags |= IP_SENDONES;
1217 
1218 #ifdef MAC
1219 	mac_inpcb_create_mbuf(inp, m);
1220 #endif
1221 
1222 	/*
1223 	 * Set up checksum and output datagram.
1224 	 */
1225 	if (V_udp_cksum) {
1226 		if (inp->inp_flags & INP_ONESBCAST)
1227 			faddr.s_addr = INADDR_BROADCAST;
1228 		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
1229 		    htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP));
1230 		m->m_pkthdr.csum_flags = CSUM_UDP;
1231 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
1232 	} else
1233 		ui->ui_sum = 0;
1234 	((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
1235 	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
1236 	((struct ip *)ui)->ip_tos = tos;		/* XXX */
1237 	UDPSTAT_INC(udps_opackets);
1238 
1239 	if (unlock_udbinfo == UH_WLOCKED)
1240 		INP_HASH_WUNLOCK(&V_udbinfo);
1241 	else if (unlock_udbinfo == UH_RLOCKED)
1242 		INP_HASH_RUNLOCK(&V_udbinfo);
1243 	error = ip_output(m, inp->inp_options, NULL, ipflags,
1244 	    inp->inp_moptions, inp);
1245 	if (unlock_udbinfo == UH_WLOCKED)
1246 		INP_WUNLOCK(inp);
1247 	else
1248 		INP_RUNLOCK(inp);
1249 	return (error);
1250 
1251 release:
1252 	if (unlock_udbinfo == UH_WLOCKED) {
1253 		INP_HASH_WUNLOCK(&V_udbinfo);
1254 		INP_WUNLOCK(inp);
1255 	} else if (unlock_udbinfo == UH_RLOCKED) {
1256 		INP_HASH_RUNLOCK(&V_udbinfo);
1257 		INP_RUNLOCK(inp);
1258 	} else
1259 		INP_RUNLOCK(inp);
1260 	m_freem(m);
1261 	return (error);
1262 }
1263 
1264 
1265 #if defined(IPSEC) && defined(IPSEC_NAT_T)
1266 /*
1267  * Potentially decap ESP in UDP frame.  Check for an ESP header
1268  * and optional marker; if present, strip the UDP header and
1269  * push the result through IPSec.
1270  *
1271  * Returns mbuf to be processed (potentially re-allocated) or
1272  * NULL if consumed and/or processed.
1273  */
1274 static struct mbuf *
1275 udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
1276 {
1277 	size_t minlen, payload, skip, iphlen;
1278 	caddr_t data;
1279 	struct udpcb *up;
1280 	struct m_tag *tag;
1281 	struct udphdr *udphdr;
1282 	struct ip *ip;
1283 
1284 	INP_RLOCK_ASSERT(inp);
1285 
1286 	/*
1287 	 * Pull up data so the longest case is contiguous:
1288 	 *    IP/UDP hdr + non ESP marker + ESP hdr.
1289 	 */
1290 	minlen = off + sizeof(uint64_t) + sizeof(struct esp);
1291 	if (minlen > m->m_pkthdr.len)
1292 		minlen = m->m_pkthdr.len;
1293 	if ((m = m_pullup(m, minlen)) == NULL) {
1294 		V_ipsec4stat.in_inval++;
1295 		return (NULL);		/* Bypass caller processing. */
1296 	}
1297 	data = mtod(m, caddr_t);	/* Points to ip header. */
1298 	payload = m->m_len - off;	/* Size of payload. */
1299 
1300 	if (payload == 1 && data[off] == '\xff')
1301 		return (m);		/* NB: keepalive packet, no decap. */
1302 
1303 	up = intoudpcb(inp);
1304 	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
1305 	KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0,
1306 	    ("u_flags 0x%x", up->u_flags));
1307 
1308 	/*
1309 	 * Check that the payload is large enough to hold an
1310 	 * ESP header and compute the amount of data to remove.
1311 	 *
1312 	 * NB: the caller has already done a pullup for us.
1313 	 * XXX can we assume alignment and eliminate bcopys?
1314 	 */
1315 	if (up->u_flags & UF_ESPINUDP_NON_IKE) {
1316 		/*
1317 		 * draft-ietf-ipsec-nat-t-ike-0[01].txt and
1318 		 * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring
1319 		 * possible AH mode non-IKE marker+non-ESP marker
1320 		 * from draft-ietf-ipsec-udp-encaps-00.txt.
1321 		 */
1322 		uint64_t marker;
1323 
1324 		if (payload <= sizeof(uint64_t) + sizeof(struct esp))
1325 			return (m);	/* NB: no decap. */
1326 		bcopy(data + off, &marker, sizeof(uint64_t));
1327 		if (marker != 0)	/* Non-IKE marker. */
1328 			return (m);	/* NB: no decap. */
1329 		skip = sizeof(uint64_t) + sizeof(struct udphdr);
1330 	} else {
1331 		uint32_t spi;
1332 
1333 		if (payload <= sizeof(struct esp)) {
1334 			V_ipsec4stat.in_inval++;
1335 			m_freem(m);
1336 			return (NULL);	/* Discard. */
1337 		}
1338 		bcopy(data + off, &spi, sizeof(uint32_t));
1339 		if (spi == 0)		/* Non-ESP marker. */
1340 			return (m);	/* NB: no decap. */
1341 		skip = sizeof(struct udphdr);
1342 	}
1343 
1344 	/*
1345 	 * Setup a PACKET_TAG_IPSEC_NAT_T_PORT tag to remember
1346 	 * the UDP ports. This is required if we want to select
1347 	 * the right SPD for multiple hosts behind same NAT.
1348 	 *
1349 	 * NB: ports are maintained in network byte order everywhere
1350 	 *     in the NAT-T code.
1351 	 */
1352 	tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS,
1353 		2 * sizeof(uint16_t), M_NOWAIT);
1354 	if (tag == NULL) {
1355 		V_ipsec4stat.in_nomem++;
1356 		m_freem(m);
1357 		return (NULL);		/* Discard. */
1358 	}
1359 	iphlen = off - sizeof(struct udphdr);
1360 	udphdr = (struct udphdr *)(data + iphlen);
1361 	((uint16_t *)(tag + 1))[0] = udphdr->uh_sport;
1362 	((uint16_t *)(tag + 1))[1] = udphdr->uh_dport;
1363 	m_tag_prepend(m, tag);
1364 
1365 	/*
1366 	 * Remove the UDP header (and possibly the non ESP marker)
1367 	 * IP header length is iphlen
1368 	 * Before:
1369 	 *   <--- off --->
1370 	 *   +----+------+-----+
1371 	 *   | IP |  UDP | ESP |
1372 	 *   +----+------+-----+
1373 	 *        <-skip->
1374 	 * After:
1375 	 *          +----+-----+
1376 	 *          | IP | ESP |
1377 	 *          +----+-----+
1378 	 *   <-skip->
1379 	 */
1380 	ovbcopy(data, data + skip, iphlen);
1381 	m_adj(m, skip);
1382 
1383 	ip = mtod(m, struct ip *);
1384 	ip->ip_len = htons(ntohs(ip->ip_len) - skip);
1385 	ip->ip_p = IPPROTO_ESP;
1386 
1387 	/*
1388 	 * We cannot yet update the cksums so clear any
1389 	 * h/w cksum flags as they are no longer valid.
1390 	 */
1391 	if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)
1392 		m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
1393 
1394 	(void) ipsec4_common_input(m, iphlen, ip->ip_p);
1395 	return (NULL);			/* NB: consumed, bypass processing. */
1396 }
1397 #endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */
1398 
1399 static void
1400 udp_abort(struct socket *so)
1401 {
1402 	struct inpcb *inp;
1403 
1404 	inp = sotoinpcb(so);
1405 	KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
1406 	INP_WLOCK(inp);
1407 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1408 		INP_HASH_WLOCK(&V_udbinfo);
1409 		in_pcbdisconnect(inp);
1410 		inp->inp_laddr.s_addr = INADDR_ANY;
1411 		INP_HASH_WUNLOCK(&V_udbinfo);
1412 		soisdisconnected(so);
1413 	}
1414 	INP_WUNLOCK(inp);
1415 }
1416 
1417 static int
1418 udp_attach(struct socket *so, int proto, struct thread *td)
1419 {
1420 	struct inpcb *inp;
1421 	int error;
1422 
1423 	inp = sotoinpcb(so);
1424 	KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
1425 	error = soreserve(so, udp_sendspace, udp_recvspace);
1426 	if (error)
1427 		return (error);
1428 	INP_INFO_WLOCK(&V_udbinfo);
1429 	error = in_pcballoc(so, &V_udbinfo);
1430 	if (error) {
1431 		INP_INFO_WUNLOCK(&V_udbinfo);
1432 		return (error);
1433 	}
1434 
1435 	inp = sotoinpcb(so);
1436 	inp->inp_vflag |= INP_IPV4;
1437 	inp->inp_ip_ttl = V_ip_defttl;
1438 
1439 	error = udp_newudpcb(inp);
1440 	if (error) {
1441 		in_pcbdetach(inp);
1442 		in_pcbfree(inp);
1443 		INP_INFO_WUNLOCK(&V_udbinfo);
1444 		return (error);
1445 	}
1446 
1447 	INP_WUNLOCK(inp);
1448 	INP_INFO_WUNLOCK(&V_udbinfo);
1449 	return (0);
1450 }
1451 #endif /* INET */
1452 
1453 int
1454 udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f)
1455 {
1456 	struct inpcb *inp;
1457 	struct udpcb *up;
1458 
1459 	KASSERT(so->so_type == SOCK_DGRAM,
1460 	    ("udp_set_kernel_tunneling: !dgram"));
1461 	inp = sotoinpcb(so);
1462 	KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
1463 	INP_WLOCK(inp);
1464 	up = intoudpcb(inp);
1465 	if (up->u_tun_func != NULL) {
1466 		INP_WUNLOCK(inp);
1467 		return (EBUSY);
1468 	}
1469 	up->u_tun_func = f;
1470 	INP_WUNLOCK(inp);
1471 	return (0);
1472 }
1473 
1474 #ifdef INET
1475 static int
1476 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
1477 {
1478 	struct inpcb *inp;
1479 	int error;
1480 
1481 	inp = sotoinpcb(so);
1482 	KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
1483 	INP_WLOCK(inp);
1484 	INP_HASH_WLOCK(&V_udbinfo);
1485 	error = in_pcbbind(inp, nam, td->td_ucred);
1486 	INP_HASH_WUNLOCK(&V_udbinfo);
1487 	INP_WUNLOCK(inp);
1488 	return (error);
1489 }
1490 
1491 static void
1492 udp_close(struct socket *so)
1493 {
1494 	struct inpcb *inp;
1495 
1496 	inp = sotoinpcb(so);
1497 	KASSERT(inp != NULL, ("udp_close: inp == NULL"));
1498 	INP_WLOCK(inp);
1499 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1500 		INP_HASH_WLOCK(&V_udbinfo);
1501 		in_pcbdisconnect(inp);
1502 		inp->inp_laddr.s_addr = INADDR_ANY;
1503 		INP_HASH_WUNLOCK(&V_udbinfo);
1504 		soisdisconnected(so);
1505 	}
1506 	INP_WUNLOCK(inp);
1507 }
1508 
1509 static int
1510 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1511 {
1512 	struct inpcb *inp;
1513 	int error;
1514 	struct sockaddr_in *sin;
1515 
1516 	inp = sotoinpcb(so);
1517 	KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
1518 	INP_WLOCK(inp);
1519 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1520 		INP_WUNLOCK(inp);
1521 		return (EISCONN);
1522 	}
1523 	sin = (struct sockaddr_in *)nam;
1524 	error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1525 	if (error != 0) {
1526 		INP_WUNLOCK(inp);
1527 		return (error);
1528 	}
1529 	INP_HASH_WLOCK(&V_udbinfo);
1530 	error = in_pcbconnect(inp, nam, td->td_ucred);
1531 	INP_HASH_WUNLOCK(&V_udbinfo);
1532 	if (error == 0)
1533 		soisconnected(so);
1534 	INP_WUNLOCK(inp);
1535 	return (error);
1536 }
1537 
1538 static void
1539 udp_detach(struct socket *so)
1540 {
1541 	struct inpcb *inp;
1542 	struct udpcb *up;
1543 
1544 	inp = sotoinpcb(so);
1545 	KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
1546 	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
1547 	    ("udp_detach: not disconnected"));
1548 	INP_INFO_WLOCK(&V_udbinfo);
1549 	INP_WLOCK(inp);
1550 	up = intoudpcb(inp);
1551 	KASSERT(up != NULL, ("%s: up == NULL", __func__));
1552 	inp->inp_ppcb = NULL;
1553 	in_pcbdetach(inp);
1554 	in_pcbfree(inp);
1555 	INP_INFO_WUNLOCK(&V_udbinfo);
1556 	udp_discardcb(up);
1557 }
1558 
1559 static int
1560 udp_disconnect(struct socket *so)
1561 {
1562 	struct inpcb *inp;
1563 
1564 	inp = sotoinpcb(so);
1565 	KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
1566 	INP_WLOCK(inp);
1567 	if (inp->inp_faddr.s_addr == INADDR_ANY) {
1568 		INP_WUNLOCK(inp);
1569 		return (ENOTCONN);
1570 	}
1571 	INP_HASH_WLOCK(&V_udbinfo);
1572 	in_pcbdisconnect(inp);
1573 	inp->inp_laddr.s_addr = INADDR_ANY;
1574 	INP_HASH_WUNLOCK(&V_udbinfo);
1575 	SOCK_LOCK(so);
1576 	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
1577 	SOCK_UNLOCK(so);
1578 	INP_WUNLOCK(inp);
1579 	return (0);
1580 }
1581 
1582 static int
1583 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
1584     struct mbuf *control, struct thread *td)
1585 {
1586 	struct inpcb *inp;
1587 
1588 	inp = sotoinpcb(so);
1589 	KASSERT(inp != NULL, ("udp_send: inp == NULL"));
1590 	return (udp_output(inp, m, addr, control, td));
1591 }
1592 #endif /* INET */
1593 
1594 int
1595 udp_shutdown(struct socket *so)
1596 {
1597 	struct inpcb *inp;
1598 
1599 	inp = sotoinpcb(so);
1600 	KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
1601 	INP_WLOCK(inp);
1602 	socantsendmore(so);
1603 	INP_WUNLOCK(inp);
1604 	return (0);
1605 }
1606 
1607 #ifdef INET
1608 struct pr_usrreqs udp_usrreqs = {
1609 	.pru_abort =		udp_abort,
1610 	.pru_attach =		udp_attach,
1611 	.pru_bind =		udp_bind,
1612 	.pru_connect =		udp_connect,
1613 	.pru_control =		in_control,
1614 	.pru_detach =		udp_detach,
1615 	.pru_disconnect =	udp_disconnect,
1616 	.pru_peeraddr =		in_getpeeraddr,
1617 	.pru_send =		udp_send,
1618 	.pru_soreceive =	soreceive_dgram,
1619 	.pru_sosend =		sosend_dgram,
1620 	.pru_shutdown =		udp_shutdown,
1621 	.pru_sockaddr =		in_getsockaddr,
1622 	.pru_sosetlabel =	in_pcbsosetlabel,
1623 	.pru_close =		udp_close,
1624 };
1625 #endif /* INET */
1626