xref: /freebsd/sys/netinet/udp_usrreq.c (revision 69718b786d3943ea9a99eeeb5f5f6162f11c78b7)
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3  *	The Regents of the University of California.
4  * Copyright (c) 2008 Robert N. M. Watson
5  * Copyright (c) 2010-2011 Juniper Networks, Inc.
6  * Copyright (c) 2014 Kevin Lo
7  * All rights reserved.
8  *
9  * Portions of this software were developed by Robert N. M. Watson under
10  * contract to Juniper Networks, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
37  */
38 
39 #include <sys/cdefs.h>
40 __FBSDID("$FreeBSD$");
41 
42 #include "opt_inet.h"
43 #include "opt_inet6.h"
44 #include "opt_ipsec.h"
45 #include "opt_rss.h"
46 
47 #include <sys/param.h>
48 #include <sys/domain.h>
49 #include <sys/eventhandler.h>
50 #include <sys/jail.h>
51 #include <sys/kernel.h>
52 #include <sys/lock.h>
53 #include <sys/malloc.h>
54 #include <sys/mbuf.h>
55 #include <sys/priv.h>
56 #include <sys/proc.h>
57 #include <sys/protosw.h>
58 #include <sys/sdt.h>
59 #include <sys/signalvar.h>
60 #include <sys/socket.h>
61 #include <sys/socketvar.h>
62 #include <sys/sx.h>
63 #include <sys/sysctl.h>
64 #include <sys/syslog.h>
65 #include <sys/systm.h>
66 
67 #include <vm/uma.h>
68 
69 #include <net/if.h>
70 #include <net/if_var.h>
71 #include <net/route.h>
72 #include <net/rss_config.h>
73 
74 #include <netinet/in.h>
75 #include <netinet/in_kdtrace.h>
76 #include <netinet/in_pcb.h>
77 #include <netinet/in_systm.h>
78 #include <netinet/in_var.h>
79 #include <netinet/ip.h>
80 #ifdef INET6
81 #include <netinet/ip6.h>
82 #endif
83 #include <netinet/ip_icmp.h>
84 #include <netinet/icmp_var.h>
85 #include <netinet/ip_var.h>
86 #include <netinet/ip_options.h>
87 #ifdef INET6
88 #include <netinet6/ip6_var.h>
89 #endif
90 #include <netinet/udp.h>
91 #include <netinet/udp_var.h>
92 #include <netinet/udplite.h>
93 #include <netinet/in_rss.h>
94 
95 #ifdef IPSEC
96 #include <netipsec/ipsec.h>
97 #include <netipsec/esp.h>
98 #endif
99 
100 #include <machine/in_cksum.h>
101 
102 #include <security/mac/mac_framework.h>
103 
104 /*
105  * UDP and UDP-Lite protocols implementation.
106  * Per RFC 768, August, 1980.
107  * Per RFC 3828, July, 2004.
108  */
109 
110 /*
111  * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
112  * removes the only data integrity mechanism for packets and malformed
113  * packets that would otherwise be discarded due to bad checksums, and may
114  * cause problems (especially for NFS data blocks).
115  */
116 VNET_DEFINE(int, udp_cksum) = 1;
117 SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW,
118     &VNET_NAME(udp_cksum), 0, "compute udp checksum");
119 
120 int	udp_log_in_vain = 0;
121 SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
122     &udp_log_in_vain, 0, "Log all incoming UDP packets");
123 
124 VNET_DEFINE(int, udp_blackhole) = 0;
125 SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
126     &VNET_NAME(udp_blackhole), 0,
127     "Do not send port unreachables for refused connects");
128 
129 static VNET_DEFINE(int, udp_require_l2_bcast) = 0;
130 #define	V_udp_require_l2_bcast		VNET(udp_require_l2_bcast)
131 SYSCTL_INT(_net_inet_udp, OID_AUTO, require_l2_bcast, CTLFLAG_VNET | CTLFLAG_RW,
132     &VNET_NAME(udp_require_l2_bcast), 0,
133     "Only treat packets sent to an L2 broadcast address as broadcast packets");
134 
135 u_long	udp_sendspace = 9216;		/* really max datagram size */
136 SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
137     &udp_sendspace, 0, "Maximum outgoing UDP datagram size");
138 
139 u_long	udp_recvspace = 40 * (1024 +
140 #ifdef INET6
141 				      sizeof(struct sockaddr_in6)
142 #else
143 				      sizeof(struct sockaddr_in)
144 #endif
145 				      );	/* 40 1K datagrams */
146 
147 SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
148     &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
149 
150 VNET_DEFINE(struct inpcbhead, udb);		/* from udp_var.h */
151 VNET_DEFINE(struct inpcbinfo, udbinfo);
152 VNET_DEFINE(struct inpcbhead, ulitecb);
153 VNET_DEFINE(struct inpcbinfo, ulitecbinfo);
154 static VNET_DEFINE(uma_zone_t, udpcb_zone);
155 #define	V_udpcb_zone			VNET(udpcb_zone)
156 
157 #ifndef UDBHASHSIZE
158 #define	UDBHASHSIZE	128
159 #endif
160 
161 VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat);		/* from udp_var.h */
162 VNET_PCPUSTAT_SYSINIT(udpstat);
163 SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat,
164     udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");
165 
166 #ifdef VIMAGE
167 VNET_PCPUSTAT_SYSUNINIT(udpstat);
168 #endif /* VIMAGE */
169 #ifdef INET
170 static void	udp_detach(struct socket *so);
171 static int	udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
172 		    struct mbuf *, struct thread *);
173 #endif
174 
175 #ifdef IPSEC
176 #ifdef IPSEC_NAT_T
177 #define	UF_ESPINUDP_ALL	(UF_ESPINUDP_NON_IKE|UF_ESPINUDP)
178 #ifdef INET
179 static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int);
180 #endif
181 #endif /* IPSEC_NAT_T */
182 #endif /* IPSEC */
183 
184 static void
185 udp_zone_change(void *tag)
186 {
187 
188 	uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
189 	uma_zone_set_max(V_udpcb_zone, maxsockets);
190 }
191 
/*
 * Init routine passed to in_pcbinfo_init() for UDP inpcbs: initializes the
 * lock of the inpcb located at mem.  The size and flags arguments are not
 * used.  Always returns 0.
 */
static int
udp_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *const pcb = mem;

	INP_LOCK_INIT(pcb, "inp", "udpinp");
	return (0);
}
201 
/*
 * Init routine passed to in_pcbinfo_init() for UDP-Lite inpcbs: initializes
 * the lock of the inpcb located at mem.  The size and flags arguments are
 * not used.  Always returns 0.
 */
static int
udplite_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *const pcb = mem;

	INP_LOCK_INIT(pcb, "inp", "udpliteinp");
	return (0);
}
211 
212 void
213 udp_init(void)
214 {
215 
216 	/*
217 	 * For now default to 2-tuple UDP hashing - until the fragment
218 	 * reassembly code can also update the flowid.
219 	 *
220 	 * Once we can calculate the flowid that way and re-establish
221 	 * a 4-tuple, flip this to 4-tuple.
222 	 */
223 	in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
224 	    "udp_inpcb", udp_inpcb_init, NULL, 0,
225 	    IPI_HASHFIELDS_2TUPLE);
226 	V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
227 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
228 	uma_zone_set_max(V_udpcb_zone, maxsockets);
229 	uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
230 	EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
231 	    EVENTHANDLER_PRI_ANY);
232 }
233 
234 void
235 udplite_init(void)
236 {
237 
238 	in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE,
239 	    UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init, NULL,
240 	    0, IPI_HASHFIELDS_2TUPLE);
241 }
242 
243 /*
244  * Kernel module interface for updating udpstat.  The argument is an index
245  * into udpstat treated as an array of u_long.  While this encodes the
246  * general layout of udpstat into the caller, it doesn't encode its location,
247  * so that future changes to add, for example, per-CPU stats support won't
248  * cause binary compatibility problems for kernel modules.
249  */
250 void
251 kmod_udpstat_inc(int statnum)
252 {
253 
254 	counter_u64_add(VNET(udpstat)[statnum], 1);
255 }
256 
257 int
258 udp_newudpcb(struct inpcb *inp)
259 {
260 	struct udpcb *up;
261 
262 	up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
263 	if (up == NULL)
264 		return (ENOBUFS);
265 	inp->inp_ppcb = up;
266 	return (0);
267 }
268 
269 void
270 udp_discardcb(struct udpcb *up)
271 {
272 
273 	uma_zfree(V_udpcb_zone, up);
274 }
275 
#ifdef VIMAGE
/*
 * VNET teardown for UDP: destroys the UDP pcbinfo and then the udpcb zone.
 * The argument is unused.
 */
static void
udp_destroy(void *unused __unused)
{

	in_pcbinfo_destroy(&V_udbinfo);
	uma_zdestroy(V_udpcb_zone);
}
VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy,
    NULL);

/*
 * VNET teardown for UDP-Lite: destroys the UDP-Lite pcbinfo.  The argument
 * is unused.
 */
static void
udplite_destroy(void *unused __unused)
{

	in_pcbinfo_destroy(&V_ulitecbinfo);
}
VNET_SYSUNINIT(udplite, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
    udplite_destroy, NULL);
#endif /* VIMAGE */
295 
296 #ifdef INET
297 /*
298  * Subroutine of udp_input(), which appends the provided mbuf chain to the
299  * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
300  * contains the source address.  If the socket ends up being an IPv6 socket,
301  * udp_append() will convert to a sockaddr_in6 before passing the address
302  * into the socket code.
303  *
304  * In the normal case udp_append() will return 0, indicating that you
305  * must unlock the inp. However if a tunneling protocol is in place we increment
306  * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we
307  * then decrement the reference count. If the inp_rele returns 1, indicating the
308  * inp is gone, we return that to the caller to tell them *not* to unlock
309  * the inp. In the case of multi-cast this will cause the distribution
310  * to stop (though most tunneling protocols known currently do *not* use
311  * multicast).
312  */
313 static int
314 udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
315     struct sockaddr_in *udp_in)
316 {
317 	struct sockaddr *append_sa;
318 	struct socket *so;
319 	struct mbuf *opts = NULL;
320 #ifdef INET6
321 	struct sockaddr_in6 udp_in6;
322 #endif
323 	struct udpcb *up;
324 
325 	INP_LOCK_ASSERT(inp);
326 
327 	/*
328 	 * Engage the tunneling protocol.
329 	 */
330 	up = intoudpcb(inp);
331 	if (up->u_tun_func != NULL) {
332 		in_pcbref(inp);
333 		INP_RUNLOCK(inp);
334 		(*up->u_tun_func)(n, off, inp, (struct sockaddr *)udp_in,
335 		    up->u_tun_ctx);
336 		INP_RLOCK(inp);
337 		return (in_pcbrele_rlocked(inp));
338 	}
339 
340 	off += sizeof(struct udphdr);
341 
342 #ifdef IPSEC
343 	/* Check AH/ESP integrity. */
344 	if (ipsec4_in_reject(n, inp)) {
345 		m_freem(n);
346 		return (0);
347 	}
348 #ifdef IPSEC_NAT_T
349 	up = intoudpcb(inp);
350 	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
351 	if (up->u_flags & UF_ESPINUDP_ALL) {	/* IPSec UDP encaps. */
352 		n = udp4_espdecap(inp, n, off);
353 		if (n == NULL)				/* Consumed. */
354 			return (0);
355 	}
356 #endif /* IPSEC_NAT_T */
357 #endif /* IPSEC */
358 #ifdef MAC
359 	if (mac_inpcb_check_deliver(inp, n) != 0) {
360 		m_freem(n);
361 		return (0);
362 	}
363 #endif /* MAC */
364 	if (inp->inp_flags & INP_CONTROLOPTS ||
365 	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
366 #ifdef INET6
367 		if (inp->inp_vflag & INP_IPV6)
368 			(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
369 		else
370 #endif /* INET6 */
371 			ip_savecontrol(inp, &opts, ip, n);
372 	}
373 #ifdef INET6
374 	if (inp->inp_vflag & INP_IPV6) {
375 		bzero(&udp_in6, sizeof(udp_in6));
376 		udp_in6.sin6_len = sizeof(udp_in6);
377 		udp_in6.sin6_family = AF_INET6;
378 		in6_sin_2_v4mapsin6(udp_in, &udp_in6);
379 		append_sa = (struct sockaddr *)&udp_in6;
380 	} else
381 #endif /* INET6 */
382 		append_sa = (struct sockaddr *)udp_in;
383 	m_adj(n, off);
384 
385 	so = inp->inp_socket;
386 	SOCKBUF_LOCK(&so->so_rcv);
387 	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
388 		SOCKBUF_UNLOCK(&so->so_rcv);
389 		m_freem(n);
390 		if (opts)
391 			m_freem(opts);
392 		UDPSTAT_INC(udps_fullsock);
393 	} else
394 		sorwakeup_locked(so);
395 	return (0);
396 }
397 
398 int
399 udp_input(struct mbuf **mp, int *offp, int proto)
400 {
401 	struct ip *ip;
402 	struct udphdr *uh;
403 	struct ifnet *ifp;
404 	struct inpcb *inp;
405 	uint16_t len, ip_len;
406 	struct inpcbinfo *pcbinfo;
407 	struct ip save_ip;
408 	struct sockaddr_in udp_in;
409 	struct mbuf *m;
410 	struct m_tag *fwd_tag;
411 	int cscov_partial, iphlen;
412 
413 	m = *mp;
414 	iphlen = *offp;
415 	ifp = m->m_pkthdr.rcvif;
416 	*mp = NULL;
417 	UDPSTAT_INC(udps_ipackets);
418 
419 	/*
420 	 * Strip IP options, if any; should skip this, make available to
421 	 * user, and use on returned packets, but we don't yet have a way to
422 	 * check the checksum with options still present.
423 	 */
424 	if (iphlen > sizeof (struct ip)) {
425 		ip_stripoptions(m);
426 		iphlen = sizeof(struct ip);
427 	}
428 
429 	/*
430 	 * Get IP and UDP header together in first mbuf.
431 	 */
432 	ip = mtod(m, struct ip *);
433 	if (m->m_len < iphlen + sizeof(struct udphdr)) {
434 		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) {
435 			UDPSTAT_INC(udps_hdrops);
436 			return (IPPROTO_DONE);
437 		}
438 		ip = mtod(m, struct ip *);
439 	}
440 	uh = (struct udphdr *)((caddr_t)ip + iphlen);
441 	cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0;
442 
443 	/*
444 	 * Destination port of 0 is illegal, based on RFC768.
445 	 */
446 	if (uh->uh_dport == 0)
447 		goto badunlocked;
448 
449 	/*
450 	 * Construct sockaddr format source address.  Stuff source address
451 	 * and datagram in user buffer.
452 	 */
453 	bzero(&udp_in, sizeof(udp_in));
454 	udp_in.sin_len = sizeof(udp_in);
455 	udp_in.sin_family = AF_INET;
456 	udp_in.sin_port = uh->uh_sport;
457 	udp_in.sin_addr = ip->ip_src;
458 
459 	/*
460 	 * Make mbuf data length reflect UDP length.  If not enough data to
461 	 * reflect UDP length, drop.
462 	 */
463 	len = ntohs((u_short)uh->uh_ulen);
464 	ip_len = ntohs(ip->ip_len) - iphlen;
465 	if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) {
466 		/* Zero means checksum over the complete packet. */
467 		if (len == 0)
468 			len = ip_len;
469 		cscov_partial = 0;
470 	}
471 	if (ip_len != len) {
472 		if (len > ip_len || len < sizeof(struct udphdr)) {
473 			UDPSTAT_INC(udps_badlen);
474 			goto badunlocked;
475 		}
476 		if (proto == IPPROTO_UDP)
477 			m_adj(m, len - ip_len);
478 	}
479 
480 	/*
481 	 * Save a copy of the IP header in case we want restore it for
482 	 * sending an ICMP error message in response.
483 	 */
484 	if (!V_udp_blackhole)
485 		save_ip = *ip;
486 	else
487 		memset(&save_ip, 0, sizeof(save_ip));
488 
489 	/*
490 	 * Checksum extended UDP header and data.
491 	 */
492 	if (uh->uh_sum) {
493 		u_short uh_sum;
494 
495 		if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
496 		    !cscov_partial) {
497 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
498 				uh_sum = m->m_pkthdr.csum_data;
499 			else
500 				uh_sum = in_pseudo(ip->ip_src.s_addr,
501 				    ip->ip_dst.s_addr, htonl((u_short)len +
502 				    m->m_pkthdr.csum_data + proto));
503 			uh_sum ^= 0xffff;
504 		} else {
505 			char b[9];
506 
507 			bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
508 			bzero(((struct ipovly *)ip)->ih_x1, 9);
509 			((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ?
510 			    uh->uh_ulen : htons(ip_len);
511 			uh_sum = in_cksum(m, len + sizeof (struct ip));
512 			bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
513 		}
514 		if (uh_sum) {
515 			UDPSTAT_INC(udps_badsum);
516 			m_freem(m);
517 			return (IPPROTO_DONE);
518 		}
519 	} else {
520 		if (proto == IPPROTO_UDP) {
521 			UDPSTAT_INC(udps_nosum);
522 		} else {
523 			/* UDPLite requires a checksum */
524 			/* XXX: What is the right UDPLite MIB counter here? */
525 			m_freem(m);
526 			return (IPPROTO_DONE);
527 		}
528 	}
529 
530 	pcbinfo = udp_get_inpcbinfo(proto);
531 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
532 	    ((!V_udp_require_l2_bcast || m->m_flags & M_BCAST) &&
533 	    in_broadcast(ip->ip_dst, ifp))) {
534 		struct inpcb *last;
535 		struct inpcbhead *pcblist;
536 		struct ip_moptions *imo;
537 
538 		INP_INFO_RLOCK(pcbinfo);
539 		pcblist = udp_get_pcblist(proto);
540 		last = NULL;
541 		LIST_FOREACH(inp, pcblist, inp_list) {
542 			if (inp->inp_lport != uh->uh_dport)
543 				continue;
544 #ifdef INET6
545 			if ((inp->inp_vflag & INP_IPV4) == 0)
546 				continue;
547 #endif
548 			if (inp->inp_laddr.s_addr != INADDR_ANY &&
549 			    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
550 				continue;
551 			if (inp->inp_faddr.s_addr != INADDR_ANY &&
552 			    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
553 				continue;
554 			if (inp->inp_fport != 0 &&
555 			    inp->inp_fport != uh->uh_sport)
556 				continue;
557 
558 			INP_RLOCK(inp);
559 
560 			/*
561 			 * XXXRW: Because we weren't holding either the inpcb
562 			 * or the hash lock when we checked for a match
563 			 * before, we should probably recheck now that the
564 			 * inpcb lock is held.
565 			 */
566 
567 			/*
568 			 * Handle socket delivery policy for any-source
569 			 * and source-specific multicast. [RFC3678]
570 			 */
571 			imo = inp->inp_moptions;
572 			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
573 				struct sockaddr_in	 group;
574 				int			 blocked;
575 				if (imo == NULL) {
576 					INP_RUNLOCK(inp);
577 					continue;
578 				}
579 				bzero(&group, sizeof(struct sockaddr_in));
580 				group.sin_len = sizeof(struct sockaddr_in);
581 				group.sin_family = AF_INET;
582 				group.sin_addr = ip->ip_dst;
583 
584 				blocked = imo_multi_filter(imo, ifp,
585 					(struct sockaddr *)&group,
586 					(struct sockaddr *)&udp_in);
587 				if (blocked != MCAST_PASS) {
588 					if (blocked == MCAST_NOTGMEMBER)
589 						IPSTAT_INC(ips_notmember);
590 					if (blocked == MCAST_NOTSMEMBER ||
591 					    blocked == MCAST_MUTED)
592 						UDPSTAT_INC(udps_filtermcast);
593 					INP_RUNLOCK(inp);
594 					continue;
595 				}
596 			}
597 			if (last != NULL) {
598 				struct mbuf *n;
599 
600 				if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) !=
601 				    NULL) {
602 					UDP_PROBE(receive, NULL, last, ip,
603 					    last, uh);
604 					if (udp_append(last, ip, n, iphlen,
605 						&udp_in)) {
606 						goto inp_lost;
607 					}
608 				}
609 				INP_RUNLOCK(last);
610 			}
611 			last = inp;
612 			/*
613 			 * Don't look for additional matches if this one does
614 			 * not have either the SO_REUSEPORT or SO_REUSEADDR
615 			 * socket options set.  This heuristic avoids
616 			 * searching through all pcbs in the common case of a
617 			 * non-shared port.  It assumes that an application
618 			 * will never clear these options after setting them.
619 			 */
620 			if ((last->inp_socket->so_options &
621 			    (SO_REUSEPORT|SO_REUSEADDR)) == 0)
622 				break;
623 		}
624 
625 		if (last == NULL) {
626 			/*
627 			 * No matching pcb found; discard datagram.  (No need
628 			 * to send an ICMP Port Unreachable for a broadcast
629 			 * or multicast datgram.)
630 			 */
631 			UDPSTAT_INC(udps_noportbcast);
632 			if (inp)
633 				INP_RUNLOCK(inp);
634 			INP_INFO_RUNLOCK(pcbinfo);
635 			goto badunlocked;
636 		}
637 		UDP_PROBE(receive, NULL, last, ip, last, uh);
638 		if (udp_append(last, ip, m, iphlen, &udp_in) == 0)
639 			INP_RUNLOCK(last);
640 	inp_lost:
641 		INP_INFO_RUNLOCK(pcbinfo);
642 		return (IPPROTO_DONE);
643 	}
644 
645 	/*
646 	 * Locate pcb for datagram.
647 	 */
648 
649 	/*
650 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
651 	 */
652 	if ((m->m_flags & M_IP_NEXTHOP) &&
653 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
654 		struct sockaddr_in *next_hop;
655 
656 		next_hop = (struct sockaddr_in *)(fwd_tag + 1);
657 
658 		/*
659 		 * Transparently forwarded. Pretend to be the destination.
660 		 * Already got one like this?
661 		 */
662 		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
663 		    ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
664 		if (!inp) {
665 			/*
666 			 * It's new.  Try to find the ambushing socket.
667 			 * Because we've rewritten the destination address,
668 			 * any hardware-generated hash is ignored.
669 			 */
670 			inp = in_pcblookup(pcbinfo, ip->ip_src,
671 			    uh->uh_sport, next_hop->sin_addr,
672 			    next_hop->sin_port ? htons(next_hop->sin_port) :
673 			    uh->uh_dport, INPLOOKUP_WILDCARD |
674 			    INPLOOKUP_RLOCKPCB, ifp);
675 		}
676 		/* Remove the tag from the packet. We don't need it anymore. */
677 		m_tag_delete(m, fwd_tag);
678 		m->m_flags &= ~M_IP_NEXTHOP;
679 	} else
680 		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
681 		    ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
682 		    INPLOOKUP_RLOCKPCB, ifp, m);
683 	if (inp == NULL) {
684 		if (udp_log_in_vain) {
685 			char buf[4*sizeof "123"];
686 
687 			strcpy(buf, inet_ntoa(ip->ip_dst));
688 			log(LOG_INFO,
689 			    "Connection attempt to UDP %s:%d from %s:%d\n",
690 			    buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
691 			    ntohs(uh->uh_sport));
692 		}
693 		UDPSTAT_INC(udps_noport);
694 		if (m->m_flags & (M_BCAST | M_MCAST)) {
695 			UDPSTAT_INC(udps_noportbcast);
696 			goto badunlocked;
697 		}
698 		if (V_udp_blackhole)
699 			goto badunlocked;
700 		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
701 			goto badunlocked;
702 		*ip = save_ip;
703 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
704 		return (IPPROTO_DONE);
705 	}
706 
707 	/*
708 	 * Check the minimum TTL for socket.
709 	 */
710 	INP_RLOCK_ASSERT(inp);
711 	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
712 		INP_RUNLOCK(inp);
713 		m_freem(m);
714 		return (IPPROTO_DONE);
715 	}
716 	if (cscov_partial) {
717 		struct udpcb *up;
718 
719 		up = intoudpcb(inp);
720 		if (up->u_rxcslen == 0 || up->u_rxcslen > len) {
721 			INP_RUNLOCK(inp);
722 			m_freem(m);
723 			return (IPPROTO_DONE);
724 		}
725 	}
726 
727 	UDP_PROBE(receive, NULL, inp, ip, inp, uh);
728 	if (udp_append(inp, ip, m, iphlen, &udp_in) == 0)
729 		INP_RUNLOCK(inp);
730 	return (IPPROTO_DONE);
731 
732 badunlocked:
733 	m_freem(m);
734 	return (IPPROTO_DONE);
735 }
736 #endif /* INET */
737 
738 /*
739  * Notify a udp user of an asynchronous error; just wake up so that they can
740  * collect error status.
741  */
742 struct inpcb *
743 udp_notify(struct inpcb *inp, int errno)
744 {
745 
746 	/*
747 	 * While udp_ctlinput() always calls udp_notify() with a read lock
748 	 * when invoking it directly, in_pcbnotifyall() currently uses write
749 	 * locks due to sharing code with TCP.  For now, accept either a read
750 	 * or a write lock, but a read lock is sufficient.
751 	 */
752 	INP_LOCK_ASSERT(inp);
753 	if ((errno == EHOSTUNREACH || errno == ENETUNREACH ||
754 	     errno == EHOSTDOWN) && inp->inp_route.ro_rt) {
755 		RTFREE(inp->inp_route.ro_rt);
756 		inp->inp_route.ro_rt = (struct rtentry *)NULL;
757 	}
758 
759 	inp->inp_socket->so_error = errno;
760 	sorwakeup(inp->inp_socket);
761 	sowwakeup(inp->inp_socket);
762 	return (inp);
763 }
764 
765 #ifdef INET
766 static void
767 udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip,
768     struct inpcbinfo *pcbinfo)
769 {
770 	struct ip *ip = vip;
771 	struct udphdr *uh;
772 	struct in_addr faddr;
773 	struct inpcb *inp;
774 
775 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
776 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
777 		return;
778 
779 	if (PRC_IS_REDIRECT(cmd)) {
780 		/* signal EHOSTDOWN, as it flushes the cached route */
781 		in_pcbnotifyall(&V_udbinfo, faddr, EHOSTDOWN, udp_notify);
782 		return;
783 	}
784 
785 	/*
786 	 * Hostdead is ugly because it goes linearly through all PCBs.
787 	 *
788 	 * XXX: We never get this from ICMP, otherwise it makes an excellent
789 	 * DoS attack on machines with many connections.
790 	 */
791 	if (cmd == PRC_HOSTDEAD)
792 		ip = NULL;
793 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
794 		return;
795 	if (ip != NULL) {
796 		uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
797 		inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
798 		    ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL);
799 		if (inp != NULL) {
800 			INP_RLOCK_ASSERT(inp);
801 			if (inp->inp_socket != NULL) {
802 				udp_notify(inp, inetctlerrmap[cmd]);
803 			}
804 			INP_RUNLOCK(inp);
805 		} else {
806 			inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
807 					   ip->ip_src, uh->uh_sport,
808 					   INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
809 			if (inp != NULL) {
810 				struct udpcb *up;
811 
812 				up = intoudpcb(inp);
813 				if (up->u_icmp_func != NULL) {
814 					INP_RUNLOCK(inp);
815 					(*up->u_icmp_func)(cmd, sa, vip, up->u_tun_ctx);
816 				} else {
817 					INP_RUNLOCK(inp);
818 				}
819 			}
820 		}
821 	} else
822 		in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd],
823 		    udp_notify);
824 }
825 void
826 udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
827 {
828 
829 	return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo));
830 }
831 
832 void
833 udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip)
834 {
835 
836 	return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo));
837 }
838 #endif /* INET */
839 
840 static int
841 udp_pcblist(SYSCTL_HANDLER_ARGS)
842 {
843 	int error, i, n;
844 	struct inpcb *inp, **inp_list;
845 	inp_gen_t gencnt;
846 	struct xinpgen xig;
847 
848 	/*
849 	 * The process of preparing the PCB list is too time-consuming and
850 	 * resource-intensive to repeat twice on every request.
851 	 */
852 	if (req->oldptr == 0) {
853 		n = V_udbinfo.ipi_count;
854 		n += imax(n / 8, 10);
855 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
856 		return (0);
857 	}
858 
859 	if (req->newptr != 0)
860 		return (EPERM);
861 
862 	/*
863 	 * OK, now we're committed to doing something.
864 	 */
865 	INP_INFO_RLOCK(&V_udbinfo);
866 	gencnt = V_udbinfo.ipi_gencnt;
867 	n = V_udbinfo.ipi_count;
868 	INP_INFO_RUNLOCK(&V_udbinfo);
869 
870 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
871 		+ n * sizeof(struct xinpcb));
872 	if (error != 0)
873 		return (error);
874 
875 	xig.xig_len = sizeof xig;
876 	xig.xig_count = n;
877 	xig.xig_gen = gencnt;
878 	xig.xig_sogen = so_gencnt;
879 	error = SYSCTL_OUT(req, &xig, sizeof xig);
880 	if (error)
881 		return (error);
882 
883 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
884 	if (inp_list == NULL)
885 		return (ENOMEM);
886 
887 	INP_INFO_RLOCK(&V_udbinfo);
888 	for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n;
889 	     inp = LIST_NEXT(inp, inp_list)) {
890 		INP_WLOCK(inp);
891 		if (inp->inp_gencnt <= gencnt &&
892 		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
893 			in_pcbref(inp);
894 			inp_list[i++] = inp;
895 		}
896 		INP_WUNLOCK(inp);
897 	}
898 	INP_INFO_RUNLOCK(&V_udbinfo);
899 	n = i;
900 
901 	error = 0;
902 	for (i = 0; i < n; i++) {
903 		inp = inp_list[i];
904 		INP_RLOCK(inp);
905 		if (inp->inp_gencnt <= gencnt) {
906 			struct xinpcb xi;
907 
908 			bzero(&xi, sizeof(xi));
909 			xi.xi_len = sizeof xi;
910 			/* XXX should avoid extra copy */
911 			bcopy(inp, &xi.xi_inp, sizeof *inp);
912 			if (inp->inp_socket)
913 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
914 			xi.xi_inp.inp_gencnt = inp->inp_gencnt;
915 			INP_RUNLOCK(inp);
916 			error = SYSCTL_OUT(req, &xi, sizeof xi);
917 		} else
918 			INP_RUNLOCK(inp);
919 	}
920 	INP_INFO_WLOCK(&V_udbinfo);
921 	for (i = 0; i < n; i++) {
922 		inp = inp_list[i];
923 		INP_RLOCK(inp);
924 		if (!in_pcbrele_rlocked(inp))
925 			INP_RUNLOCK(inp);
926 	}
927 	INP_INFO_WUNLOCK(&V_udbinfo);
928 
929 	if (!error) {
930 		/*
931 		 * Give the user an updated idea of our state.  If the
932 		 * generation differs from what we told her before, she knows
933 		 * that something happened while we were processing this
934 		 * request, and it might be necessary to retry.
935 		 */
936 		INP_INFO_RLOCK(&V_udbinfo);
937 		xig.xig_gen = V_udbinfo.ipi_gencnt;
938 		xig.xig_sogen = so_gencnt;
939 		xig.xig_count = V_udbinfo.ipi_count;
940 		INP_INFO_RUNLOCK(&V_udbinfo);
941 		error = SYSCTL_OUT(req, &xig, sizeof xig);
942 	}
943 	free(inp_list, M_TEMP);
944 	return (error);
945 }
946 
947 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
948     CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
949     udp_pcblist, "S,xinpcb", "List of active UDP sockets");
950 
951 #ifdef INET
952 static int
953 udp_getcred(SYSCTL_HANDLER_ARGS)
954 {
955 	struct xucred xuc;
956 	struct sockaddr_in addrs[2];
957 	struct inpcb *inp;
958 	int error;
959 
960 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
961 	if (error)
962 		return (error);
963 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
964 	if (error)
965 		return (error);
966 	inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
967 	    addrs[0].sin_addr, addrs[0].sin_port,
968 	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
969 	if (inp != NULL) {
970 		INP_RLOCK_ASSERT(inp);
971 		if (inp->inp_socket == NULL)
972 			error = ENOENT;
973 		if (error == 0)
974 			error = cr_canseeinpcb(req->td->td_ucred, inp);
975 		if (error == 0)
976 			cru2x(inp->inp_cred, &xuc);
977 		INP_RUNLOCK(inp);
978 	} else
979 		error = ENOENT;
980 	if (error == 0)
981 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
982 	return (error);
983 }
984 
985 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
986     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
987     udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
988 #endif /* INET */
989 
990 int
991 udp_ctloutput(struct socket *so, struct sockopt *sopt)
992 {
993 	struct inpcb *inp;
994 	struct udpcb *up;
995 	int isudplite, error, optval;
996 
997 	error = 0;
998 	isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0;
999 	inp = sotoinpcb(so);
1000 	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
1001 	INP_WLOCK(inp);
1002 	if (sopt->sopt_level != so->so_proto->pr_protocol) {
1003 #ifdef INET6
1004 		if (INP_CHECK_SOCKAF(so, AF_INET6)) {
1005 			INP_WUNLOCK(inp);
1006 			error = ip6_ctloutput(so, sopt);
1007 		}
1008 #endif
1009 #if defined(INET) && defined(INET6)
1010 		else
1011 #endif
1012 #ifdef INET
1013 		{
1014 			INP_WUNLOCK(inp);
1015 			error = ip_ctloutput(so, sopt);
1016 		}
1017 #endif
1018 		return (error);
1019 	}
1020 
1021 	switch (sopt->sopt_dir) {
1022 	case SOPT_SET:
1023 		switch (sopt->sopt_name) {
1024 		case UDP_ENCAP:
1025 			INP_WUNLOCK(inp);
1026 			error = sooptcopyin(sopt, &optval, sizeof optval,
1027 					    sizeof optval);
1028 			if (error)
1029 				break;
1030 			inp = sotoinpcb(so);
1031 			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
1032 			INP_WLOCK(inp);
1033 #ifdef IPSEC_NAT_T
1034 			up = intoudpcb(inp);
1035 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
1036 #endif
1037 			switch (optval) {
1038 			case 0:
1039 				/* Clear all UDP encap. */
1040 #ifdef IPSEC_NAT_T
1041 				up->u_flags &= ~UF_ESPINUDP_ALL;
1042 #endif
1043 				break;
1044 #ifdef IPSEC_NAT_T
1045 			case UDP_ENCAP_ESPINUDP:
1046 			case UDP_ENCAP_ESPINUDP_NON_IKE:
1047 				up->u_flags &= ~UF_ESPINUDP_ALL;
1048 				if (optval == UDP_ENCAP_ESPINUDP)
1049 					up->u_flags |= UF_ESPINUDP;
1050 				else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE)
1051 					up->u_flags |= UF_ESPINUDP_NON_IKE;
1052 				break;
1053 #endif
1054 			default:
1055 				error = EINVAL;
1056 				break;
1057 			}
1058 			INP_WUNLOCK(inp);
1059 			break;
1060 		case UDPLITE_SEND_CSCOV:
1061 		case UDPLITE_RECV_CSCOV:
1062 			if (!isudplite) {
1063 				INP_WUNLOCK(inp);
1064 				error = ENOPROTOOPT;
1065 				break;
1066 			}
1067 			INP_WUNLOCK(inp);
1068 			error = sooptcopyin(sopt, &optval, sizeof(optval),
1069 			    sizeof(optval));
1070 			if (error != 0)
1071 				break;
1072 			inp = sotoinpcb(so);
1073 			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
1074 			INP_WLOCK(inp);
1075 			up = intoudpcb(inp);
1076 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
1077 			if ((optval != 0 && optval < 8) || (optval > 65535)) {
1078 				INP_WUNLOCK(inp);
1079 				error = EINVAL;
1080 				break;
1081 			}
1082 			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
1083 				up->u_txcslen = optval;
1084 			else
1085 				up->u_rxcslen = optval;
1086 			INP_WUNLOCK(inp);
1087 			break;
1088 		default:
1089 			INP_WUNLOCK(inp);
1090 			error = ENOPROTOOPT;
1091 			break;
1092 		}
1093 		break;
1094 	case SOPT_GET:
1095 		switch (sopt->sopt_name) {
1096 #ifdef IPSEC_NAT_T
1097 		case UDP_ENCAP:
1098 			up = intoudpcb(inp);
1099 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
1100 			optval = up->u_flags & UF_ESPINUDP_ALL;
1101 			INP_WUNLOCK(inp);
1102 			error = sooptcopyout(sopt, &optval, sizeof optval);
1103 			break;
1104 #endif
1105 		case UDPLITE_SEND_CSCOV:
1106 		case UDPLITE_RECV_CSCOV:
1107 			if (!isudplite) {
1108 				INP_WUNLOCK(inp);
1109 				error = ENOPROTOOPT;
1110 				break;
1111 			}
1112 			up = intoudpcb(inp);
1113 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
1114 			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
1115 				optval = up->u_txcslen;
1116 			else
1117 				optval = up->u_rxcslen;
1118 			INP_WUNLOCK(inp);
1119 			error = sooptcopyout(sopt, &optval, sizeof(optval));
1120 			break;
1121 		default:
1122 			INP_WUNLOCK(inp);
1123 			error = ENOPROTOOPT;
1124 			break;
1125 		}
1126 		break;
1127 	}
1128 	return (error);
1129 }
1130 
1131 #ifdef INET
1132 #define	UH_WLOCKED	2
1133 #define	UH_RLOCKED	1
1134 #define	UH_UNLOCKED	0
1135 static int
1136 udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
1137     struct mbuf *control, struct thread *td)
1138 {
1139 	struct udpiphdr *ui;
1140 	int len = m->m_pkthdr.len;
1141 	struct in_addr faddr, laddr;
1142 	struct cmsghdr *cm;
1143 	struct inpcbinfo *pcbinfo;
1144 	struct sockaddr_in *sin, src;
1145 	int cscov_partial = 0;
1146 	int error = 0;
1147 	int ipflags;
1148 	u_short fport, lport;
1149 	int unlock_udbinfo, unlock_inp;
1150 	u_char tos;
1151 	uint8_t pr;
1152 	uint16_t cscov = 0;
1153 	uint32_t flowid = 0;
1154 	uint8_t flowtype = M_HASHTYPE_NONE;
1155 
1156 	/*
1157 	 * udp_output() may need to temporarily bind or connect the current
1158 	 * inpcb.  As such, we don't know up front whether we will need the
1159 	 * pcbinfo lock or not.  Do any work to decide what is needed up
1160 	 * front before acquiring any locks.
1161 	 */
1162 	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
1163 		if (control)
1164 			m_freem(control);
1165 		m_freem(m);
1166 		return (EMSGSIZE);
1167 	}
1168 
1169 	src.sin_family = 0;
1170 	sin = (struct sockaddr_in *)addr;
1171 	if (sin == NULL ||
1172 	    (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
1173 		INP_WLOCK(inp);
1174 		unlock_inp = UH_WLOCKED;
1175 	} else {
1176 		INP_RLOCK(inp);
1177 		unlock_inp = UH_RLOCKED;
1178 	}
1179 	tos = inp->inp_ip_tos;
1180 	if (control != NULL) {
1181 		/*
1182 		 * XXX: Currently, we assume all the optional information is
1183 		 * stored in a single mbuf.
1184 		 */
1185 		if (control->m_next) {
1186 			if (unlock_inp == UH_WLOCKED)
1187 				INP_WUNLOCK(inp);
1188 			else
1189 				INP_RUNLOCK(inp);
1190 			m_freem(control);
1191 			m_freem(m);
1192 			return (EINVAL);
1193 		}
1194 		for (; control->m_len > 0;
1195 		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
1196 		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
1197 			cm = mtod(control, struct cmsghdr *);
1198 			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
1199 			    || cm->cmsg_len > control->m_len) {
1200 				error = EINVAL;
1201 				break;
1202 			}
1203 			if (cm->cmsg_level != IPPROTO_IP)
1204 				continue;
1205 
1206 			switch (cm->cmsg_type) {
1207 			case IP_SENDSRCADDR:
1208 				if (cm->cmsg_len !=
1209 				    CMSG_LEN(sizeof(struct in_addr))) {
1210 					error = EINVAL;
1211 					break;
1212 				}
1213 				bzero(&src, sizeof(src));
1214 				src.sin_family = AF_INET;
1215 				src.sin_len = sizeof(src);
1216 				src.sin_port = inp->inp_lport;
1217 				src.sin_addr =
1218 				    *(struct in_addr *)CMSG_DATA(cm);
1219 				break;
1220 
1221 			case IP_TOS:
1222 				if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
1223 					error = EINVAL;
1224 					break;
1225 				}
1226 				tos = *(u_char *)CMSG_DATA(cm);
1227 				break;
1228 
1229 			case IP_FLOWID:
1230 				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
1231 					error = EINVAL;
1232 					break;
1233 				}
1234 				flowid = *(uint32_t *) CMSG_DATA(cm);
1235 				break;
1236 
1237 			case IP_FLOWTYPE:
1238 				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
1239 					error = EINVAL;
1240 					break;
1241 				}
1242 				flowtype = *(uint32_t *) CMSG_DATA(cm);
1243 				break;
1244 
1245 #ifdef	RSS
1246 			case IP_RSSBUCKETID:
1247 				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
1248 					error = EINVAL;
1249 					break;
1250 				}
1251 				/* This is just a placeholder for now */
1252 				break;
1253 #endif	/* RSS */
1254 			default:
1255 				error = ENOPROTOOPT;
1256 				break;
1257 			}
1258 			if (error)
1259 				break;
1260 		}
1261 		m_freem(control);
1262 	}
1263 	if (error) {
1264 		if (unlock_inp == UH_WLOCKED)
1265 			INP_WUNLOCK(inp);
1266 		else
1267 			INP_RUNLOCK(inp);
1268 		m_freem(m);
1269 		return (error);
1270 	}
1271 
1272 	/*
1273 	 * Depending on whether or not the application has bound or connected
1274 	 * the socket, we may have to do varying levels of work.  The optimal
1275 	 * case is for a connected UDP socket, as a global lock isn't
1276 	 * required at all.
1277 	 *
1278 	 * In order to decide which we need, we require stability of the
1279 	 * inpcb binding, which we ensure by acquiring a read lock on the
1280 	 * inpcb.  This doesn't strictly follow the lock order, so we play
1281 	 * the trylock and retry game; note that we may end up with more
1282 	 * conservative locks than required the second time around, so later
1283 	 * assertions have to accept that.  Further analysis of the number of
1284 	 * misses under contention is required.
1285 	 *
1286 	 * XXXRW: Check that hash locking update here is correct.
1287 	 */
1288 	pr = inp->inp_socket->so_proto->pr_protocol;
1289 	pcbinfo = udp_get_inpcbinfo(pr);
1290 	sin = (struct sockaddr_in *)addr;
1291 	if (sin != NULL &&
1292 	    (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
1293 		INP_HASH_WLOCK(pcbinfo);
1294 		unlock_udbinfo = UH_WLOCKED;
1295 	} else if ((sin != NULL && (
1296 	    (sin->sin_addr.s_addr == INADDR_ANY) ||
1297 	    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
1298 	    (inp->inp_laddr.s_addr == INADDR_ANY) ||
1299 	    (inp->inp_lport == 0))) ||
1300 	    (src.sin_family == AF_INET)) {
1301 		INP_HASH_RLOCK(pcbinfo);
1302 		unlock_udbinfo = UH_RLOCKED;
1303 	} else
1304 		unlock_udbinfo = UH_UNLOCKED;
1305 
1306 	/*
1307 	 * If the IP_SENDSRCADDR control message was specified, override the
1308 	 * source address for this datagram.  Its use is invalidated if the
1309 	 * address thus specified is incomplete or clobbers other inpcbs.
1310 	 */
1311 	laddr = inp->inp_laddr;
1312 	lport = inp->inp_lport;
1313 	if (src.sin_family == AF_INET) {
1314 		INP_HASH_LOCK_ASSERT(pcbinfo);
1315 		if ((lport == 0) ||
1316 		    (laddr.s_addr == INADDR_ANY &&
1317 		     src.sin_addr.s_addr == INADDR_ANY)) {
1318 			error = EINVAL;
1319 			goto release;
1320 		}
1321 		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
1322 		    &laddr.s_addr, &lport, td->td_ucred);
1323 		if (error)
1324 			goto release;
1325 	}
1326 
1327 	/*
1328 	 * If a UDP socket has been connected, then a local address/port will
1329 	 * have been selected and bound.
1330 	 *
1331 	 * If a UDP socket has not been connected to, then an explicit
1332 	 * destination address must be used, in which case a local
1333 	 * address/port may not have been selected and bound.
1334 	 */
1335 	if (sin != NULL) {
1336 		INP_LOCK_ASSERT(inp);
1337 		if (inp->inp_faddr.s_addr != INADDR_ANY) {
1338 			error = EISCONN;
1339 			goto release;
1340 		}
1341 
1342 		/*
1343 		 * Jail may rewrite the destination address, so let it do
1344 		 * that before we use it.
1345 		 */
1346 		error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1347 		if (error)
1348 			goto release;
1349 
1350 		/*
1351 		 * If a local address or port hasn't yet been selected, or if
1352 		 * the destination address needs to be rewritten due to using
1353 		 * a special INADDR_ constant, invoke in_pcbconnect_setup()
1354 		 * to do the heavy lifting.  Once a port is selected, we
1355 		 * commit the binding back to the socket; we also commit the
1356 		 * binding of the address if in jail.
1357 		 *
1358 		 * If we already have a valid binding and we're not
1359 		 * requesting a destination address rewrite, use a fast path.
1360 		 */
1361 		if (inp->inp_laddr.s_addr == INADDR_ANY ||
1362 		    inp->inp_lport == 0 ||
1363 		    sin->sin_addr.s_addr == INADDR_ANY ||
1364 		    sin->sin_addr.s_addr == INADDR_BROADCAST) {
1365 			INP_HASH_LOCK_ASSERT(pcbinfo);
1366 			error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
1367 			    &lport, &faddr.s_addr, &fport, NULL,
1368 			    td->td_ucred);
1369 			if (error)
1370 				goto release;
1371 
1372 			/*
1373 			 * XXXRW: Why not commit the port if the address is
1374 			 * !INADDR_ANY?
1375 			 */
1376 			/* Commit the local port if newly assigned. */
1377 			if (inp->inp_laddr.s_addr == INADDR_ANY &&
1378 			    inp->inp_lport == 0) {
1379 				INP_WLOCK_ASSERT(inp);
1380 				INP_HASH_WLOCK_ASSERT(pcbinfo);
1381 				/*
1382 				 * Remember addr if jailed, to prevent
1383 				 * rebinding.
1384 				 */
1385 				if (prison_flag(td->td_ucred, PR_IP4))
1386 					inp->inp_laddr = laddr;
1387 				inp->inp_lport = lport;
1388 				if (in_pcbinshash(inp) != 0) {
1389 					inp->inp_lport = 0;
1390 					error = EAGAIN;
1391 					goto release;
1392 				}
1393 				inp->inp_flags |= INP_ANONPORT;
1394 			}
1395 		} else {
1396 			faddr = sin->sin_addr;
1397 			fport = sin->sin_port;
1398 		}
1399 	} else {
1400 		INP_LOCK_ASSERT(inp);
1401 		faddr = inp->inp_faddr;
1402 		fport = inp->inp_fport;
1403 		if (faddr.s_addr == INADDR_ANY) {
1404 			error = ENOTCONN;
1405 			goto release;
1406 		}
1407 	}
1408 
1409 	/*
1410 	 * Calculate data length and get a mbuf for UDP, IP, and possible
1411 	 * link-layer headers.  Immediate slide the data pointer back forward
1412 	 * since we won't use that space at this layer.
1413 	 */
1414 	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
1415 	if (m == NULL) {
1416 		error = ENOBUFS;
1417 		goto release;
1418 	}
1419 	m->m_data += max_linkhdr;
1420 	m->m_len -= max_linkhdr;
1421 	m->m_pkthdr.len -= max_linkhdr;
1422 
1423 	/*
1424 	 * Fill in mbuf with extended UDP header and addresses and length put
1425 	 * into network format.
1426 	 */
1427 	ui = mtod(m, struct udpiphdr *);
1428 	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
1429 	ui->ui_pr = pr;
1430 	ui->ui_src = laddr;
1431 	ui->ui_dst = faddr;
1432 	ui->ui_sport = lport;
1433 	ui->ui_dport = fport;
1434 	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
1435 	if (pr == IPPROTO_UDPLITE) {
1436 		struct udpcb *up;
1437 		uint16_t plen;
1438 
1439 		up = intoudpcb(inp);
1440 		cscov = up->u_txcslen;
1441 		plen = (u_short)len + sizeof(struct udphdr);
1442 		if (cscov >= plen)
1443 			cscov = 0;
1444 		ui->ui_len = htons(plen);
1445 		ui->ui_ulen = htons(cscov);
1446 		/*
1447 		 * For UDP-Lite, checksum coverage length of zero means
1448 		 * the entire UDPLite packet is covered by the checksum.
1449 		 */
1450 		cscov_partial = (cscov == 0) ? 0 : 1;
1451 	} else
1452 		ui->ui_v = IPVERSION << 4;
1453 
1454 	/*
1455 	 * Set the Don't Fragment bit in the IP header.
1456 	 */
1457 	if (inp->inp_flags & INP_DONTFRAG) {
1458 		struct ip *ip;
1459 
1460 		ip = (struct ip *)&ui->ui_i;
1461 		ip->ip_off |= htons(IP_DF);
1462 	}
1463 
1464 	ipflags = 0;
1465 	if (inp->inp_socket->so_options & SO_DONTROUTE)
1466 		ipflags |= IP_ROUTETOIF;
1467 	if (inp->inp_socket->so_options & SO_BROADCAST)
1468 		ipflags |= IP_ALLOWBROADCAST;
1469 	if (inp->inp_flags & INP_ONESBCAST)
1470 		ipflags |= IP_SENDONES;
1471 
1472 #ifdef MAC
1473 	mac_inpcb_create_mbuf(inp, m);
1474 #endif
1475 
1476 	/*
1477 	 * Set up checksum and output datagram.
1478 	 */
1479 	ui->ui_sum = 0;
1480 	if (pr == IPPROTO_UDPLITE) {
1481 		if (inp->inp_flags & INP_ONESBCAST)
1482 			faddr.s_addr = INADDR_BROADCAST;
1483 		if (cscov_partial) {
1484 			if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0)
1485 				ui->ui_sum = 0xffff;
1486 		} else {
1487 			if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0)
1488 				ui->ui_sum = 0xffff;
1489 		}
1490 	} else if (V_udp_cksum) {
1491 		if (inp->inp_flags & INP_ONESBCAST)
1492 			faddr.s_addr = INADDR_BROADCAST;
1493 		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
1494 		    htons((u_short)len + sizeof(struct udphdr) + pr));
1495 		m->m_pkthdr.csum_flags = CSUM_UDP;
1496 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
1497 	}
1498 	((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
1499 	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
1500 	((struct ip *)ui)->ip_tos = tos;		/* XXX */
1501 	UDPSTAT_INC(udps_opackets);
1502 
1503 	/*
1504 	 * Setup flowid / RSS information for outbound socket.
1505 	 *
1506 	 * Once the UDP code decides to set a flowid some other way,
1507 	 * this allows the flowid to be overridden by userland.
1508 	 */
1509 	if (flowtype != M_HASHTYPE_NONE) {
1510 		m->m_pkthdr.flowid = flowid;
1511 		M_HASHTYPE_SET(m, flowtype);
1512 #ifdef	RSS
1513 	} else {
1514 		uint32_t hash_val, hash_type;
1515 		/*
1516 		 * Calculate an appropriate RSS hash for UDP and
1517 		 * UDP Lite.
1518 		 *
1519 		 * The called function will take care of figuring out
1520 		 * whether a 2-tuple or 4-tuple hash is required based
1521 		 * on the currently configured scheme.
1522 		 *
1523 		 * Later later on connected socket values should be
1524 		 * cached in the inpcb and reused, rather than constantly
1525 		 * re-calculating it.
1526 		 *
1527 		 * UDP Lite is a different protocol number and will
1528 		 * likely end up being hashed as a 2-tuple until
1529 		 * RSS / NICs grow UDP Lite protocol awareness.
1530 		 */
1531 		if (rss_proto_software_hash_v4(faddr, laddr, fport, lport,
1532 		    pr, &hash_val, &hash_type) == 0) {
1533 			m->m_pkthdr.flowid = hash_val;
1534 			M_HASHTYPE_SET(m, hash_type);
1535 		}
1536 #endif
1537 	}
1538 
1539 #ifdef	RSS
1540 	/*
1541 	 * Don't override with the inp cached flowid value.
1542 	 *
1543 	 * Depending upon the kind of send being done, the inp
1544 	 * flowid/flowtype values may actually not be appropriate
1545 	 * for this particular socket send.
1546 	 *
1547 	 * We should either leave the flowid at zero (which is what is
1548 	 * currently done) or set it to some software generated
1549 	 * hash value based on the packet contents.
1550 	 */
1551 	ipflags |= IP_NODEFAULTFLOWID;
1552 #endif	/* RSS */
1553 
1554 	if (unlock_udbinfo == UH_WLOCKED)
1555 		INP_HASH_WUNLOCK(pcbinfo);
1556 	else if (unlock_udbinfo == UH_RLOCKED)
1557 		INP_HASH_RUNLOCK(pcbinfo);
1558 	UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
1559 	error = ip_output(m, inp->inp_options,
1560 	    (unlock_inp == UH_WLOCKED ? &inp->inp_route : NULL), ipflags,
1561 	    inp->inp_moptions, inp);
1562 	if (unlock_inp == UH_WLOCKED)
1563 		INP_WUNLOCK(inp);
1564 	else
1565 		INP_RUNLOCK(inp);
1566 	return (error);
1567 
1568 release:
1569 	if (unlock_udbinfo == UH_WLOCKED) {
1570 		KASSERT(unlock_inp == UH_WLOCKED,
1571 		    ("%s: excl udbinfo lock, shared inp lock", __func__));
1572 		INP_HASH_WUNLOCK(pcbinfo);
1573 		INP_WUNLOCK(inp);
1574 	} else if (unlock_udbinfo == UH_RLOCKED) {
1575 		KASSERT(unlock_inp == UH_RLOCKED,
1576 		    ("%s: shared udbinfo lock, excl inp lock", __func__));
1577 		INP_HASH_RUNLOCK(pcbinfo);
1578 		INP_RUNLOCK(inp);
1579 	} else if (unlock_inp == UH_WLOCKED)
1580 		INP_WUNLOCK(inp);
1581 	else
1582 		INP_RUNLOCK(inp);
1583 	m_freem(m);
1584 	return (error);
1585 }
1586 
1587 
1588 #if defined(IPSEC) && defined(IPSEC_NAT_T)
1589 /*
1590  * Potentially decap ESP in UDP frame.  Check for an ESP header
1591  * and optional marker; if present, strip the UDP header and
1592  * push the result through IPSec.
1593  *
1594  * Returns mbuf to be processed (potentially re-allocated) or
1595  * NULL if consumed and/or processed.
1596  */
1597 static struct mbuf *
1598 udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
1599 {
1600 	size_t minlen, payload, skip, iphlen;
1601 	caddr_t data;
1602 	struct udpcb *up;
1603 	struct m_tag *tag;
1604 	struct udphdr *udphdr;
1605 	struct ip *ip;
1606 
1607 	INP_RLOCK_ASSERT(inp);
1608 
1609 	/*
1610 	 * Pull up data so the longest case is contiguous:
1611 	 *    IP/UDP hdr + non ESP marker + ESP hdr.
1612 	 */
1613 	minlen = off + sizeof(uint64_t) + sizeof(struct esp);
1614 	if (minlen > m->m_pkthdr.len)
1615 		minlen = m->m_pkthdr.len;
1616 	if ((m = m_pullup(m, minlen)) == NULL) {
1617 		IPSECSTAT_INC(ips_in_inval);
1618 		return (NULL);		/* Bypass caller processing. */
1619 	}
1620 	data = mtod(m, caddr_t);	/* Points to ip header. */
1621 	payload = m->m_len - off;	/* Size of payload. */
1622 
1623 	if (payload == 1 && data[off] == '\xff')
1624 		return (m);		/* NB: keepalive packet, no decap. */
1625 
1626 	up = intoudpcb(inp);
1627 	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
1628 	KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0,
1629 	    ("u_flags 0x%x", up->u_flags));
1630 
1631 	/*
1632 	 * Check that the payload is large enough to hold an
1633 	 * ESP header and compute the amount of data to remove.
1634 	 *
1635 	 * NB: the caller has already done a pullup for us.
1636 	 * XXX can we assume alignment and eliminate bcopys?
1637 	 */
1638 	if (up->u_flags & UF_ESPINUDP_NON_IKE) {
1639 		/*
1640 		 * draft-ietf-ipsec-nat-t-ike-0[01].txt and
1641 		 * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring
1642 		 * possible AH mode non-IKE marker+non-ESP marker
1643 		 * from draft-ietf-ipsec-udp-encaps-00.txt.
1644 		 */
1645 		uint64_t marker;
1646 
1647 		if (payload <= sizeof(uint64_t) + sizeof(struct esp))
1648 			return (m);	/* NB: no decap. */
1649 		bcopy(data + off, &marker, sizeof(uint64_t));
1650 		if (marker != 0)	/* Non-IKE marker. */
1651 			return (m);	/* NB: no decap. */
1652 		skip = sizeof(uint64_t) + sizeof(struct udphdr);
1653 	} else {
1654 		uint32_t spi;
1655 
1656 		if (payload <= sizeof(struct esp)) {
1657 			IPSECSTAT_INC(ips_in_inval);
1658 			m_freem(m);
1659 			return (NULL);	/* Discard. */
1660 		}
1661 		bcopy(data + off, &spi, sizeof(uint32_t));
1662 		if (spi == 0)		/* Non-ESP marker. */
1663 			return (m);	/* NB: no decap. */
1664 		skip = sizeof(struct udphdr);
1665 	}
1666 
1667 	/*
1668 	 * Setup a PACKET_TAG_IPSEC_NAT_T_PORT tag to remember
1669 	 * the UDP ports. This is required if we want to select
1670 	 * the right SPD for multiple hosts behind same NAT.
1671 	 *
1672 	 * NB: ports are maintained in network byte order everywhere
1673 	 *     in the NAT-T code.
1674 	 */
1675 	tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS,
1676 		2 * sizeof(uint16_t), M_NOWAIT);
1677 	if (tag == NULL) {
1678 		IPSECSTAT_INC(ips_in_nomem);
1679 		m_freem(m);
1680 		return (NULL);		/* Discard. */
1681 	}
1682 	iphlen = off - sizeof(struct udphdr);
1683 	udphdr = (struct udphdr *)(data + iphlen);
1684 	((uint16_t *)(tag + 1))[0] = udphdr->uh_sport;
1685 	((uint16_t *)(tag + 1))[1] = udphdr->uh_dport;
1686 	m_tag_prepend(m, tag);
1687 
1688 	/*
1689 	 * Remove the UDP header (and possibly the non ESP marker)
1690 	 * IP header length is iphlen
1691 	 * Before:
1692 	 *   <--- off --->
1693 	 *   +----+------+-----+
1694 	 *   | IP |  UDP | ESP |
1695 	 *   +----+------+-----+
1696 	 *        <-skip->
1697 	 * After:
1698 	 *          +----+-----+
1699 	 *          | IP | ESP |
1700 	 *          +----+-----+
1701 	 *   <-skip->
1702 	 */
1703 	ovbcopy(data, data + skip, iphlen);
1704 	m_adj(m, skip);
1705 
1706 	ip = mtod(m, struct ip *);
1707 	ip->ip_len = htons(ntohs(ip->ip_len) - skip);
1708 	ip->ip_p = IPPROTO_ESP;
1709 
1710 	/*
1711 	 * We cannot yet update the cksums so clear any
1712 	 * h/w cksum flags as they are no longer valid.
1713 	 */
1714 	if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)
1715 		m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
1716 
1717 	(void) ipsec_common_input(m, iphlen, offsetof(struct ip, ip_p),
1718 				AF_INET, ip->ip_p);
1719 	return (NULL);			/* NB: consumed, bypass processing. */
1720 }
1721 #endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */
1722 
1723 static void
1724 udp_abort(struct socket *so)
1725 {
1726 	struct inpcb *inp;
1727 	struct inpcbinfo *pcbinfo;
1728 
1729 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1730 	inp = sotoinpcb(so);
1731 	KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
1732 	INP_WLOCK(inp);
1733 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1734 		INP_HASH_WLOCK(pcbinfo);
1735 		in_pcbdisconnect(inp);
1736 		inp->inp_laddr.s_addr = INADDR_ANY;
1737 		INP_HASH_WUNLOCK(pcbinfo);
1738 		soisdisconnected(so);
1739 	}
1740 	INP_WUNLOCK(inp);
1741 }
1742 
1743 static int
1744 udp_attach(struct socket *so, int proto, struct thread *td)
1745 {
1746 	struct inpcb *inp;
1747 	struct inpcbinfo *pcbinfo;
1748 	int error;
1749 
1750 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1751 	inp = sotoinpcb(so);
1752 	KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
1753 	error = soreserve(so, udp_sendspace, udp_recvspace);
1754 	if (error)
1755 		return (error);
1756 	INP_INFO_WLOCK(pcbinfo);
1757 	error = in_pcballoc(so, pcbinfo);
1758 	if (error) {
1759 		INP_INFO_WUNLOCK(pcbinfo);
1760 		return (error);
1761 	}
1762 
1763 	inp = sotoinpcb(so);
1764 	inp->inp_vflag |= INP_IPV4;
1765 	inp->inp_ip_ttl = V_ip_defttl;
1766 
1767 	error = udp_newudpcb(inp);
1768 	if (error) {
1769 		in_pcbdetach(inp);
1770 		in_pcbfree(inp);
1771 		INP_INFO_WUNLOCK(pcbinfo);
1772 		return (error);
1773 	}
1774 
1775 	INP_WUNLOCK(inp);
1776 	INP_INFO_WUNLOCK(pcbinfo);
1777 	return (0);
1778 }
1779 #endif /* INET */
1780 
1781 int
1782 udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx)
1783 {
1784 	struct inpcb *inp;
1785 	struct udpcb *up;
1786 
1787 	KASSERT(so->so_type == SOCK_DGRAM,
1788 	    ("udp_set_kernel_tunneling: !dgram"));
1789 	inp = sotoinpcb(so);
1790 	KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
1791 	INP_WLOCK(inp);
1792 	up = intoudpcb(inp);
1793 	if ((up->u_tun_func != NULL) ||
1794 	    (up->u_icmp_func != NULL)) {
1795 		INP_WUNLOCK(inp);
1796 		return (EBUSY);
1797 	}
1798 	up->u_tun_func = f;
1799 	up->u_icmp_func = i;
1800 	up->u_tun_ctx = ctx;
1801 	INP_WUNLOCK(inp);
1802 	return (0);
1803 }
1804 
1805 #ifdef INET
1806 static int
1807 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
1808 {
1809 	struct inpcb *inp;
1810 	struct inpcbinfo *pcbinfo;
1811 	int error;
1812 
1813 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1814 	inp = sotoinpcb(so);
1815 	KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
1816 	INP_WLOCK(inp);
1817 	INP_HASH_WLOCK(pcbinfo);
1818 	error = in_pcbbind(inp, nam, td->td_ucred);
1819 	INP_HASH_WUNLOCK(pcbinfo);
1820 	INP_WUNLOCK(inp);
1821 	return (error);
1822 }
1823 
1824 static void
1825 udp_close(struct socket *so)
1826 {
1827 	struct inpcb *inp;
1828 	struct inpcbinfo *pcbinfo;
1829 
1830 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1831 	inp = sotoinpcb(so);
1832 	KASSERT(inp != NULL, ("udp_close: inp == NULL"));
1833 	INP_WLOCK(inp);
1834 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1835 		INP_HASH_WLOCK(pcbinfo);
1836 		in_pcbdisconnect(inp);
1837 		inp->inp_laddr.s_addr = INADDR_ANY;
1838 		INP_HASH_WUNLOCK(pcbinfo);
1839 		soisdisconnected(so);
1840 	}
1841 	INP_WUNLOCK(inp);
1842 }
1843 
1844 static int
1845 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1846 {
1847 	struct inpcb *inp;
1848 	struct inpcbinfo *pcbinfo;
1849 	struct sockaddr_in *sin;
1850 	int error;
1851 
1852 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1853 	inp = sotoinpcb(so);
1854 	KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
1855 	INP_WLOCK(inp);
1856 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1857 		INP_WUNLOCK(inp);
1858 		return (EISCONN);
1859 	}
1860 	sin = (struct sockaddr_in *)nam;
1861 	error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1862 	if (error != 0) {
1863 		INP_WUNLOCK(inp);
1864 		return (error);
1865 	}
1866 	INP_HASH_WLOCK(pcbinfo);
1867 	error = in_pcbconnect(inp, nam, td->td_ucred);
1868 	INP_HASH_WUNLOCK(pcbinfo);
1869 	if (error == 0)
1870 		soisconnected(so);
1871 	INP_WUNLOCK(inp);
1872 	return (error);
1873 }
1874 
1875 static void
1876 udp_detach(struct socket *so)
1877 {
1878 	struct inpcb *inp;
1879 	struct inpcbinfo *pcbinfo;
1880 	struct udpcb *up;
1881 
1882 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1883 	inp = sotoinpcb(so);
1884 	KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
1885 	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
1886 	    ("udp_detach: not disconnected"));
1887 	INP_INFO_WLOCK(pcbinfo);
1888 	INP_WLOCK(inp);
1889 	up = intoudpcb(inp);
1890 	KASSERT(up != NULL, ("%s: up == NULL", __func__));
1891 	inp->inp_ppcb = NULL;
1892 	in_pcbdetach(inp);
1893 	in_pcbfree(inp);
1894 	INP_INFO_WUNLOCK(pcbinfo);
1895 	udp_discardcb(up);
1896 }
1897 
1898 static int
1899 udp_disconnect(struct socket *so)
1900 {
1901 	struct inpcb *inp;
1902 	struct inpcbinfo *pcbinfo;
1903 
1904 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1905 	inp = sotoinpcb(so);
1906 	KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
1907 	INP_WLOCK(inp);
1908 	if (inp->inp_faddr.s_addr == INADDR_ANY) {
1909 		INP_WUNLOCK(inp);
1910 		return (ENOTCONN);
1911 	}
1912 	INP_HASH_WLOCK(pcbinfo);
1913 	in_pcbdisconnect(inp);
1914 	inp->inp_laddr.s_addr = INADDR_ANY;
1915 	INP_HASH_WUNLOCK(pcbinfo);
1916 	SOCK_LOCK(so);
1917 	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
1918 	SOCK_UNLOCK(so);
1919 	INP_WUNLOCK(inp);
1920 	return (0);
1921 }
1922 
1923 static int
1924 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
1925     struct mbuf *control, struct thread *td)
1926 {
1927 	struct inpcb *inp;
1928 
1929 	inp = sotoinpcb(so);
1930 	KASSERT(inp != NULL, ("udp_send: inp == NULL"));
1931 	return (udp_output(inp, m, addr, control, td));
1932 }
1933 #endif /* INET */
1934 
1935 int
1936 udp_shutdown(struct socket *so)
1937 {
1938 	struct inpcb *inp;
1939 
1940 	inp = sotoinpcb(so);
1941 	KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
1942 	INP_WLOCK(inp);
1943 	socantsendmore(so);
1944 	INP_WUNLOCK(inp);
1945 	return (0);
1946 }
1947 
1948 #ifdef INET
1949 struct pr_usrreqs udp_usrreqs = {
1950 	.pru_abort =		udp_abort,
1951 	.pru_attach =		udp_attach,
1952 	.pru_bind =		udp_bind,
1953 	.pru_connect =		udp_connect,
1954 	.pru_control =		in_control,
1955 	.pru_detach =		udp_detach,
1956 	.pru_disconnect =	udp_disconnect,
1957 	.pru_peeraddr =		in_getpeeraddr,
1958 	.pru_send =		udp_send,
1959 	.pru_soreceive =	soreceive_dgram,
1960 	.pru_sosend =		sosend_dgram,
1961 	.pru_shutdown =		udp_shutdown,
1962 	.pru_sockaddr =		in_getsockaddr,
1963 	.pru_sosetlabel =	in_pcbsosetlabel,
1964 	.pru_close =		udp_close,
1965 };
1966 #endif /* INET */
1967