xref: /freebsd/sys/netinet/udp_usrreq.c (revision 454630c72556d45e401f29f56b3317c2fb0499a0)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
5  *	The Regents of the University of California.
6  * Copyright (c) 2008 Robert N. M. Watson
7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
8  * Copyright (c) 2014 Kevin Lo
9  * All rights reserved.
10  *
11  * Portions of this software were developed by Robert N. M. Watson under
12  * contract to Juniper Networks, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include "opt_inet.h"
45 #include "opt_inet6.h"
46 #include "opt_ipsec.h"
47 #include "opt_route.h"
48 #include "opt_rss.h"
49 
50 #include <sys/param.h>
51 #include <sys/domain.h>
52 #include <sys/eventhandler.h>
53 #include <sys/jail.h>
54 #include <sys/kernel.h>
55 #include <sys/lock.h>
56 #include <sys/malloc.h>
57 #include <sys/mbuf.h>
58 #include <sys/priv.h>
59 #include <sys/proc.h>
60 #include <sys/protosw.h>
61 #include <sys/sdt.h>
62 #include <sys/signalvar.h>
63 #include <sys/socket.h>
64 #include <sys/socketvar.h>
65 #include <sys/sx.h>
66 #include <sys/sysctl.h>
67 #include <sys/syslog.h>
68 #include <sys/systm.h>
69 
70 #include <vm/uma.h>
71 
72 #include <net/if.h>
73 #include <net/if_var.h>
74 #include <net/route.h>
75 #include <net/route/nhop.h>
76 #include <net/rss_config.h>
77 
78 #include <netinet/in.h>
79 #include <netinet/in_kdtrace.h>
80 #include <netinet/in_fib.h>
81 #include <netinet/in_pcb.h>
82 #include <netinet/in_systm.h>
83 #include <netinet/in_var.h>
84 #include <netinet/ip.h>
85 #ifdef INET6
86 #include <netinet/ip6.h>
87 #endif
88 #include <netinet/ip_icmp.h>
89 #include <netinet/icmp_var.h>
90 #include <netinet/ip_var.h>
91 #include <netinet/ip_options.h>
92 #ifdef INET6
93 #include <netinet6/ip6_var.h>
94 #endif
95 #include <netinet/udp.h>
96 #include <netinet/udp_var.h>
97 #include <netinet/udplite.h>
98 #include <netinet/in_rss.h>
99 
100 #include <netipsec/ipsec_support.h>
101 
102 #include <machine/in_cksum.h>
103 
104 #include <security/mac/mac_framework.h>
105 
106 /*
107  * UDP and UDP-Lite protocols implementation.
108  * Per RFC 768, August, 1980.
109  * Per RFC 3828, July, 2004.
110  */
111 
/*
 * BSD 4.2 defaulted the UDP checksum to be off.  Turning off UDP checksums
 * removes the only data integrity mechanism for packets: malformed packets
 * that would otherwise be discarded due to bad checksums are accepted, which
 * may cause problems (especially for NFS data blocks).
 */
VNET_DEFINE(int, udp_cksum) = 1;
SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_cksum), 0, "compute udp checksum");

/*
 * When non-zero, udp_input() logs every datagram for which no matching
 * pcb was found.
 */
VNET_DEFINE(int, udp_log_in_vain) = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_log_in_vain), 0, "Log all incoming UDP packets");

/*
 * When non-zero, udp_input() silently drops datagrams that match no pcb
 * instead of answering with an ICMP port unreachable.  Datagrams whose
 * source passes in_localip() are exempt unless udp_blackhole_local is
 * also set.
 */
VNET_DEFINE(int, udp_blackhole) = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_blackhole), 0,
    "Do not send port unreachables for refused connects");
VNET_DEFINE(bool, udp_blackhole_local) = false;
SYSCTL_BOOL(_net_inet_udp, OID_AUTO, blackhole_local, CTLFLAG_VNET |
    CTLFLAG_RW, &VNET_NAME(udp_blackhole_local), false,
    "Enforce net.inet.udp.blackhole for locally originated packets");

/* Exported as net.inet.udp.maxdgram. */
u_long	udp_sendspace = 9216;		/* really max datagram size */
SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
    &udp_sendspace, 0, "Maximum outgoing UDP datagram size");

/*
 * Exported as net.inet.udp.recvspace.  Each of the 40 slots is sized for
 * 1K of data plus one socket address structure (the IPv6 one when INET6
 * is configured).
 */
u_long	udp_recvspace = 40 * (1024 +
#ifdef INET6
				      sizeof(struct sockaddr_in6)
#else
				      sizeof(struct sockaddr_in)
#endif
				      );	/* 40 1K datagrams */

SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
    &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");

/*
 * pcbinfo for UDP and for UDP-Lite, both set up in udp_init(), plus the
 * UMA zone from which struct udpcb is allocated.
 */
VNET_DEFINE(struct inpcbinfo, udbinfo);
VNET_DEFINE(struct inpcbinfo, ulitecbinfo);
VNET_DEFINE_STATIC(uma_zone_t, udpcb_zone);
#define	V_udpcb_zone			VNET(udpcb_zone)

/* Hash size handed to in_pcbinfo_init() for both pcbinfo structures. */
#ifndef UDBHASHSIZE
#define	UDBHASHSIZE	128
#endif

VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat);		/* from udp_var.h */
VNET_PCPUSTAT_SYSINIT(udpstat);
SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat,
    udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");

#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(udpstat);
#endif /* VIMAGE */
#ifdef INET
/* Forward declarations for IPv4-only entry points defined later. */
static void	udp_detach(struct socket *so);
static int	udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
		    struct mbuf *, struct thread *, int);
#endif

/* Backing storage descriptors passed to in_pcbinfo_init() in udp_init(). */
INPCBSTORAGE_DEFINE(udpcbstor, "udpinp", "udp_inpcb", "udp", "udphash");
INPCBSTORAGE_DEFINE(udplitecbstor, "udpliteinp", "udplite_inpcb", "udplite",
    "udplitehash");
176 
177 static void
178 udp_init(void *arg __unused)
179 {
180 
181 	/*
182 	 * For now default to 2-tuple UDP hashing - until the fragment
183 	 * reassembly code can also update the flowid.
184 	 *
185 	 * Once we can calculate the flowid that way and re-establish
186 	 * a 4-tuple, flip this to 4-tuple.
187 	 */
188 	in_pcbinfo_init(&V_udbinfo, &udpcbstor, UDBHASHSIZE, UDBHASHSIZE);
189 	V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
190 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
191 	uma_zone_set_max(V_udpcb_zone, maxsockets);
192 	uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
193 
194 	/* Additional pcbinfo for UDP-Lite */
195 	in_pcbinfo_init(&V_ulitecbinfo, &udplitecbstor, UDBHASHSIZE,
196 	    UDBHASHSIZE);
197 }
198 VNET_SYSINIT(udp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, udp_init, NULL);
199 
200 /*
201  * Kernel module interface for updating udpstat.  The argument is an index
202  * into udpstat treated as an array of u_long.  While this encodes the
203  * general layout of udpstat into the caller, it doesn't encode its location,
204  * so that future changes to add, for example, per-CPU stats support won't
205  * cause binary compatibility problems for kernel modules.
206  */
207 void
208 kmod_udpstat_inc(int statnum)
209 {
210 
211 	counter_u64_add(VNET(udpstat)[statnum], 1);
212 }
213 
214 int
215 udp_newudpcb(struct inpcb *inp)
216 {
217 	struct udpcb *up;
218 
219 	up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
220 	if (up == NULL)
221 		return (ENOBUFS);
222 	inp->inp_ppcb = up;
223 	return (0);
224 }
225 
226 void
227 udp_discardcb(struct udpcb *up)
228 {
229 
230 	uma_zfree(V_udpcb_zone, up);
231 }
232 
233 #ifdef VIMAGE
234 static void
235 udp_destroy(void *unused __unused)
236 {
237 
238 	in_pcbinfo_destroy(&V_udbinfo);
239 	uma_zdestroy(V_udpcb_zone);
240 }
241 VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy, NULL);
242 
243 static void
244 udplite_destroy(void *unused __unused)
245 {
246 
247 	in_pcbinfo_destroy(&V_ulitecbinfo);
248 }
249 VNET_SYSUNINIT(udplite, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udplite_destroy,
250     NULL);
251 #endif
252 
253 #ifdef INET
254 /*
255  * Subroutine of udp_input(), which appends the provided mbuf chain to the
256  * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
257  * contains the source address.  If the socket ends up being an IPv6 socket,
258  * udp_append() will convert to a sockaddr_in6 before passing the address
259  * into the socket code.
260  *
261  * In the normal case udp_append() will return 0, indicating that you
262  * must unlock the inp. However if a tunneling protocol is in place we increment
263  * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we
264  * then decrement the reference count. If the inp_rele returns 1, indicating the
265  * inp is gone, we return that to the caller to tell them *not* to unlock
266  * the inp. In the case of multi-cast this will cause the distribution
267  * to stop (though most tunneling protocols known currently do *not* use
268  * multicast).
269  */
270 static int
271 udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
272     struct sockaddr_in *udp_in)
273 {
274 	struct sockaddr *append_sa;
275 	struct socket *so;
276 	struct mbuf *tmpopts, *opts = NULL;
277 #ifdef INET6
278 	struct sockaddr_in6 udp_in6;
279 #endif
280 	struct udpcb *up;
281 	bool filtered;
282 
283 	INP_LOCK_ASSERT(inp);
284 
285 	/*
286 	 * Engage the tunneling protocol.
287 	 */
288 	up = intoudpcb(inp);
289 	if (up->u_tun_func != NULL) {
290 		in_pcbref(inp);
291 		INP_RUNLOCK(inp);
292 		filtered = (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&udp_in[0],
293 		    up->u_tun_ctx);
294 		INP_RLOCK(inp);
295 		if (filtered)
296 			return (in_pcbrele_rlocked(inp));
297 	}
298 
299 	off += sizeof(struct udphdr);
300 
301 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
302 	/* Check AH/ESP integrity. */
303 	if (IPSEC_ENABLED(ipv4) &&
304 	    IPSEC_CHECK_POLICY(ipv4, n, inp) != 0) {
305 		m_freem(n);
306 		return (0);
307 	}
308 	if (up->u_flags & UF_ESPINUDP) {/* IPSec UDP encaps. */
309 		if (IPSEC_ENABLED(ipv4) &&
310 		    UDPENCAP_INPUT(n, off, AF_INET) != 0)
311 			return (0);	/* Consumed. */
312 	}
313 #endif /* IPSEC */
314 #ifdef MAC
315 	if (mac_inpcb_check_deliver(inp, n) != 0) {
316 		m_freem(n);
317 		return (0);
318 	}
319 #endif /* MAC */
320 	if (inp->inp_flags & INP_CONTROLOPTS ||
321 	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
322 #ifdef INET6
323 		if (inp->inp_vflag & INP_IPV6)
324 			(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
325 		else
326 #endif /* INET6 */
327 			ip_savecontrol(inp, &opts, ip, n);
328 	}
329 	if ((inp->inp_vflag & INP_IPV4) && (inp->inp_flags2 & INP_ORIGDSTADDR)) {
330 		tmpopts = sbcreatecontrol((caddr_t)&udp_in[1],
331 			sizeof(struct sockaddr_in), IP_ORIGDSTADDR, IPPROTO_IP);
332 		if (tmpopts) {
333 			if (opts) {
334 				tmpopts->m_next = opts;
335 				opts = tmpopts;
336 			} else
337 				opts = tmpopts;
338 		}
339 	}
340 #ifdef INET6
341 	if (inp->inp_vflag & INP_IPV6) {
342 		bzero(&udp_in6, sizeof(udp_in6));
343 		udp_in6.sin6_len = sizeof(udp_in6);
344 		udp_in6.sin6_family = AF_INET6;
345 		in6_sin_2_v4mapsin6(&udp_in[0], &udp_in6);
346 		append_sa = (struct sockaddr *)&udp_in6;
347 	} else
348 #endif /* INET6 */
349 		append_sa = (struct sockaddr *)&udp_in[0];
350 	m_adj(n, off);
351 
352 	so = inp->inp_socket;
353 	SOCKBUF_LOCK(&so->so_rcv);
354 	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
355 		soroverflow_locked(so);
356 		m_freem(n);
357 		if (opts)
358 			m_freem(opts);
359 		UDPSTAT_INC(udps_fullsock);
360 	} else
361 		sorwakeup_locked(so);
362 	return (0);
363 }
364 
365 static bool
366 udp_multi_match(const struct inpcb *inp, void *v)
367 {
368 	struct ip *ip = v;
369 	struct udphdr *uh = (struct udphdr *)(ip + 1);
370 
371 	if (inp->inp_lport != uh->uh_dport)
372 		return (false);
373 #ifdef INET6
374 	if ((inp->inp_vflag & INP_IPV4) == 0)
375 		return (false);
376 #endif
377 	if (inp->inp_laddr.s_addr != INADDR_ANY &&
378 	    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
379 		return (false);
380 	if (inp->inp_faddr.s_addr != INADDR_ANY &&
381 	    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
382 		return (false);
383 	if (inp->inp_fport != 0 &&
384 	    inp->inp_fport != uh->uh_sport)
385 		return (false);
386 
387 	return (true);
388 }
389 
390 static int
391 udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in)
392 {
393 	struct ip *ip = mtod(m, struct ip *);
394 	struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto),
395 	    INPLOOKUP_RLOCKPCB, udp_multi_match, ip);
396 #ifdef KDTRACE_HOOKS
397 	struct udphdr *uh = (struct udphdr *)(ip + 1);
398 #endif
399 	struct inpcb *inp;
400 	struct mbuf *n;
401 	int appends = 0;
402 
403 	MPASS(ip->ip_hl == sizeof(struct ip) >> 2);
404 
405 	while ((inp = inp_next(&inpi)) != NULL) {
406 		/*
407 		 * XXXRW: Because we weren't holding either the inpcb
408 		 * or the hash lock when we checked for a match
409 		 * before, we should probably recheck now that the
410 		 * inpcb lock is held.
411 		 */
412 		/*
413 		 * Handle socket delivery policy for any-source
414 		 * and source-specific multicast. [RFC3678]
415 		 */
416 		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
417 			struct ip_moptions	*imo;
418 			struct sockaddr_in	 group;
419 			int			 blocked;
420 
421 			imo = inp->inp_moptions;
422 			if (imo == NULL)
423 				continue;
424 			bzero(&group, sizeof(struct sockaddr_in));
425 			group.sin_len = sizeof(struct sockaddr_in);
426 			group.sin_family = AF_INET;
427 			group.sin_addr = ip->ip_dst;
428 
429 			blocked = imo_multi_filter(imo, m->m_pkthdr.rcvif,
430 				(struct sockaddr *)&group,
431 				(struct sockaddr *)&udp_in[0]);
432 			if (blocked != MCAST_PASS) {
433 				if (blocked == MCAST_NOTGMEMBER)
434 					IPSTAT_INC(ips_notmember);
435 				if (blocked == MCAST_NOTSMEMBER ||
436 				    blocked == MCAST_MUTED)
437 					UDPSTAT_INC(udps_filtermcast);
438 				continue;
439 			}
440 		}
441 		if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
442 			if (proto == IPPROTO_UDPLITE)
443 				UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
444 			else
445 				UDP_PROBE(receive, NULL, inp, ip, inp, uh);
446 			if (udp_append(inp, ip, n, sizeof(struct ip), udp_in)) {
447 				INP_RUNLOCK(inp);
448 				break;
449 			} else
450 				appends++;
451 		}
452 		/*
453 		 * Don't look for additional matches if this one does
454 		 * not have either the SO_REUSEPORT or SO_REUSEADDR
455 		 * socket options set.  This heuristic avoids
456 		 * searching through all pcbs in the common case of a
457 		 * non-shared port.  It assumes that an application
458 		 * will never clear these options after setting them.
459 		 */
460 		if ((inp->inp_socket->so_options &
461 		    (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) {
462 			INP_RUNLOCK(inp);
463 			break;
464 		}
465 	}
466 
467 	if (appends == 0) {
468 		/*
469 		 * No matching pcb found; discard datagram.  (No need
470 		 * to send an ICMP Port Unreachable for a broadcast
471 		 * or multicast datgram.)
472 		 */
473 		UDPSTAT_INC(udps_noport);
474 		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)))
475 			UDPSTAT_INC(udps_noportmcast);
476 		else
477 			UDPSTAT_INC(udps_noportbcast);
478 	}
479 	m_freem(m);
480 
481 	return (IPPROTO_DONE);
482 }
483 
/*
 * Input routine for IPv4 UDP and UDP-Lite datagrams.
 *
 * mp    - in: the received mbuf chain; *mp is cleared on entry and the
 *         chain is consumed on every path.
 * offp  - in: length of the IP header, including options.
 * proto - IPPROTO_UDP or IPPROTO_UDPLITE; selects checksum coverage rules,
 *         probes and the pcbinfo used for lookup.
 *
 * Validates lengths and the checksum, then hands the datagram either to
 * udp_multi_input() (multicast/broadcast destinations) or to the single
 * matching pcb via udp_append().  Always returns IPPROTO_DONE.
 */
int
udp_input(struct mbuf **mp, int *offp, int proto)
{
	struct ip *ip;
	struct udphdr *uh;
	struct ifnet *ifp;
	struct inpcb *inp;
	uint16_t len, ip_len;
	struct inpcbinfo *pcbinfo;
	struct sockaddr_in udp_in[2];
	struct mbuf *m;
	struct m_tag *fwd_tag;
	int cscov_partial, iphlen;

	m = *mp;
	iphlen = *offp;
	ifp = m->m_pkthdr.rcvif;
	*mp = NULL;
	UDPSTAT_INC(udps_ipackets);

	/*
	 * Strip IP options, if any; should skip this, make available to
	 * user, and use on returned packets, but we don't yet have a way to
	 * check the checksum with options still present.
	 */
	if (iphlen > sizeof (struct ip)) {
		ip_stripoptions(m);
		iphlen = sizeof(struct ip);
	}

	/*
	 * Get IP and UDP header together in first mbuf.
	 */
	if (m->m_len < iphlen + sizeof(struct udphdr)) {
		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) {
			UDPSTAT_INC(udps_hdrops);
			return (IPPROTO_DONE);
		}
	}
	ip = mtod(m, struct ip *);
	uh = (struct udphdr *)((caddr_t)ip + iphlen);
	/* UDP-Lite starts out assuming partial coverage; refined below. */
	cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0;

	/*
	 * Destination port of 0 is illegal, based on RFC768.
	 */
	if (uh->uh_dport == 0)
		goto badunlocked;

	/*
	 * Construct sockaddr format source address.  Stuff source address
	 * and datagram in user buffer.
	 *
	 * udp_in[0] is the source, udp_in[1] the destination of the packet.
	 */
	bzero(&udp_in[0], sizeof(struct sockaddr_in) * 2);
	udp_in[0].sin_len = sizeof(struct sockaddr_in);
	udp_in[0].sin_family = AF_INET;
	udp_in[0].sin_port = uh->uh_sport;
	udp_in[0].sin_addr = ip->ip_src;
	udp_in[1].sin_len = sizeof(struct sockaddr_in);
	udp_in[1].sin_family = AF_INET;
	udp_in[1].sin_port = uh->uh_dport;
	udp_in[1].sin_addr = ip->ip_dst;

	/*
	 * Make mbuf data length reflect UDP length.  If not enough data to
	 * reflect UDP length, drop.
	 *
	 * For UDP-Lite the uh_ulen field holds the checksum coverage rather
	 * than the datagram length, so the chain is only trimmed for UDP.
	 */
	len = ntohs((u_short)uh->uh_ulen);
	ip_len = ntohs(ip->ip_len) - iphlen;
	if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) {
		/* Zero means checksum over the complete packet. */
		if (len == 0)
			len = ip_len;
		cscov_partial = 0;
	}
	if (ip_len != len) {
		if (len > ip_len || len < sizeof(struct udphdr)) {
			UDPSTAT_INC(udps_badlen);
			goto badunlocked;
		}
		if (proto == IPPROTO_UDP)
			m_adj(m, len - ip_len);
	}

	/*
	 * Checksum extended UDP header and data.
	 */
	if (uh->uh_sum) {
		u_short uh_sum;

		/*
		 * Use the checksum result carried in the mbuf packet header
		 * only when it is flagged valid and coverage is complete.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
		    !cscov_partial) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
				uh_sum = m->m_pkthdr.csum_data;
			else
				uh_sum = in_pseudo(ip->ip_src.s_addr,
				    ip->ip_dst.s_addr, htonl((u_short)len +
				    m->m_pkthdr.csum_data + proto));
			uh_sum ^= 0xffff;
		} else {
			/*
			 * Software checksum: temporarily overlay the start
			 * of the IP header with the pseudo-header fields,
			 * checksum, then restore the saved bytes.
			 */
			char b[offsetof(struct ipovly, ih_src)];
			struct ipovly *ipov = (struct ipovly *)ip;

			bcopy(ipov, b, sizeof(b));
			bzero(ipov, sizeof(ipov->ih_x1));
			ipov->ih_len = (proto == IPPROTO_UDP) ?
			    uh->uh_ulen : htons(ip_len);
			uh_sum = in_cksum(m, len + sizeof (struct ip));
			bcopy(b, ipov, sizeof(b));
		}
		if (uh_sum) {
			UDPSTAT_INC(udps_badsum);
			m_freem(m);
			return (IPPROTO_DONE);
		}
	} else {
		if (proto == IPPROTO_UDP) {
			UDPSTAT_INC(udps_nosum);
		} else {
			/* UDPLite requires a checksum */
			/* XXX: What is the right UDPLite MIB counter here? */
			m_freem(m);
			return (IPPROTO_DONE);
		}
	}

	/* Multicast and broadcast destinations may have many receivers. */
	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
	    in_broadcast(ip->ip_dst, ifp))
		return (udp_multi_input(m, proto, udp_in));

	pcbinfo = udp_get_inpcbinfo(proto);

	/*
	 * Locate pcb for datagram.
	 *
	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
	 */
	if ((m->m_flags & M_IP_NEXTHOP) &&
	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
		struct sockaddr_in *next_hop;

		next_hop = (struct sockaddr_in *)(fwd_tag + 1);

		/*
		 * Transparently forwarded. Pretend to be the destination.
		 * Already got one like this?
		 */
		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
		    ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
		if (!inp) {
			/*
			 * It's new.  Try to find the ambushing socket.
			 * Because we've rewritten the destination address,
			 * any hardware-generated hash is ignored.
			 */
			inp = in_pcblookup(pcbinfo, ip->ip_src,
			    uh->uh_sport, next_hop->sin_addr,
			    next_hop->sin_port ? htons(next_hop->sin_port) :
			    uh->uh_dport, INPLOOKUP_WILDCARD |
			    INPLOOKUP_RLOCKPCB, ifp);
		}
		/* Remove the tag from the packet. We don't need it anymore. */
		m_tag_delete(m, fwd_tag);
		m->m_flags &= ~M_IP_NEXTHOP;
	} else
		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
		    ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
		    INPLOOKUP_RLOCKPCB, ifp, m);
	if (inp == NULL) {
		/* Nobody is listening: log, count, and maybe send ICMP. */
		if (V_udp_log_in_vain) {
			char src[INET_ADDRSTRLEN];
			char dst[INET_ADDRSTRLEN];

			log(LOG_INFO,
			    "Connection attempt to UDP %s:%d from %s:%d\n",
			    inet_ntoa_r(ip->ip_dst, dst), ntohs(uh->uh_dport),
			    inet_ntoa_r(ip->ip_src, src), ntohs(uh->uh_sport));
		}
		if (proto == IPPROTO_UDPLITE)
			UDPLITE_PROBE(receive, NULL, NULL, ip, NULL, uh);
		else
			UDP_PROBE(receive, NULL, NULL, ip, NULL, uh);
		UDPSTAT_INC(udps_noport);
		if (m->m_flags & (M_BCAST | M_MCAST)) {
			UDPSTAT_INC(udps_noportbcast);
			goto badunlocked;
		}
		/*
		 * Blackhole mode drops silently; sources that pass
		 * in_localip() are exempt unless udp_blackhole_local is set.
		 */
		if (V_udp_blackhole && (V_udp_blackhole_local ||
		    !in_localip(ip->ip_src)))
			goto badunlocked;
		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
			goto badunlocked;
		/* m is handed to icmp_error() here; it is not freed below. */
		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
		return (IPPROTO_DONE);
	}

	/*
	 * Check the minimum TTL for socket.
	 */
	INP_RLOCK_ASSERT(inp);
	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
		if (proto == IPPROTO_UDPLITE)
			UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
		else
			UDP_PROBE(receive, NULL, inp, ip, inp, uh);
		INP_RUNLOCK(inp);
		m_freem(m);
		return (IPPROTO_DONE);
	}
	if (cscov_partial) {
		struct udpcb *up;

		/*
		 * Partially covered UDP-Lite datagram: drop it unless the
		 * socket has a non-zero receive coverage (u_rxcslen) that
		 * does not exceed the coverage of this packet.
		 */
		up = intoudpcb(inp);
		if (up->u_rxcslen == 0 || up->u_rxcslen > len) {
			INP_RUNLOCK(inp);
			m_freem(m);
			return (IPPROTO_DONE);
		}
	}

	if (proto == IPPROTO_UDPLITE)
		UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
	else
		UDP_PROBE(receive, NULL, inp, ip, inp, uh);
	/* A non-zero return means the inp is gone and must not be unlocked. */
	if (udp_append(inp, ip, m, iphlen, udp_in) == 0)
		INP_RUNLOCK(inp);
	return (IPPROTO_DONE);

badunlocked:
	/* Common drop path for packets rejected while no inp lock is held. */
	m_freem(m);
	return (IPPROTO_DONE);
}
716 #endif /* INET */
717 
718 /*
719  * Notify a udp user of an asynchronous error; just wake up so that they can
720  * collect error status.
721  */
722 struct inpcb *
723 udp_notify(struct inpcb *inp, int errno)
724 {
725 
726 	INP_WLOCK_ASSERT(inp);
727 	if ((errno == EHOSTUNREACH || errno == ENETUNREACH ||
728 	     errno == EHOSTDOWN) && inp->inp_route.ro_nh) {
729 		NH_FREE(inp->inp_route.ro_nh);
730 		inp->inp_route.ro_nh = (struct nhop_object *)NULL;
731 	}
732 
733 	inp->inp_socket->so_error = errno;
734 	sorwakeup(inp->inp_socket);
735 	sowwakeup(inp->inp_socket);
736 	return (inp);
737 }
738 
739 #ifdef INET
740 static void
741 udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip,
742     struct inpcbinfo *pcbinfo)
743 {
744 	struct ip *ip = vip;
745 	struct udphdr *uh;
746 	struct in_addr faddr;
747 	struct inpcb *inp;
748 
749 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
750 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
751 		return;
752 
753 	if (PRC_IS_REDIRECT(cmd)) {
754 		/* signal EHOSTDOWN, as it flushes the cached route */
755 		in_pcbnotifyall(pcbinfo, faddr, EHOSTDOWN, udp_notify);
756 		return;
757 	}
758 
759 	/*
760 	 * Hostdead is ugly because it goes linearly through all PCBs.
761 	 *
762 	 * XXX: We never get this from ICMP, otherwise it makes an excellent
763 	 * DoS attack on machines with many connections.
764 	 */
765 	if (cmd == PRC_HOSTDEAD)
766 		ip = NULL;
767 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
768 		return;
769 	if (ip != NULL) {
770 		uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
771 		inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
772 		    ip->ip_src, uh->uh_sport, INPLOOKUP_WLOCKPCB, NULL);
773 		if (inp != NULL) {
774 			INP_WLOCK_ASSERT(inp);
775 			if (inp->inp_socket != NULL) {
776 				udp_notify(inp, inetctlerrmap[cmd]);
777 			}
778 			INP_WUNLOCK(inp);
779 		} else {
780 			inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
781 					   ip->ip_src, uh->uh_sport,
782 					   INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
783 			if (inp != NULL) {
784 				struct udpcb *up;
785 				void *ctx;
786 				udp_tun_icmp_t func;
787 
788 				up = intoudpcb(inp);
789 				ctx = up->u_tun_ctx;
790 				func = up->u_icmp_func;
791 				INP_RUNLOCK(inp);
792 				if (func != NULL)
793 					(*func)(cmd, sa, vip, ctx);
794 			}
795 		}
796 	} else
797 		in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd],
798 		    udp_notify);
799 }
800 void
801 udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
802 {
803 
804 	return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo));
805 }
806 
807 void
808 udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip)
809 {
810 
811 	return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo));
812 }
813 #endif /* INET */
814 
815 static int
816 udp_pcblist(SYSCTL_HANDLER_ARGS)
817 {
818 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_udbinfo,
819 	    INPLOOKUP_RLOCKPCB);
820 	struct xinpgen xig;
821 	struct inpcb *inp;
822 	int error;
823 
824 	if (req->newptr != 0)
825 		return (EPERM);
826 
827 	if (req->oldptr == 0) {
828 		int n;
829 
830 		n = V_udbinfo.ipi_count;
831 		n += imax(n / 8, 10);
832 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
833 		return (0);
834 	}
835 
836 	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
837 		return (error);
838 
839 	bzero(&xig, sizeof(xig));
840 	xig.xig_len = sizeof xig;
841 	xig.xig_count = V_udbinfo.ipi_count;
842 	xig.xig_gen = V_udbinfo.ipi_gencnt;
843 	xig.xig_sogen = so_gencnt;
844 	error = SYSCTL_OUT(req, &xig, sizeof xig);
845 	if (error)
846 		return (error);
847 
848 	while ((inp = inp_next(&inpi)) != NULL) {
849 		if (inp->inp_gencnt <= xig.xig_gen &&
850 		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
851 			struct xinpcb xi;
852 
853 			in_pcbtoxinpcb(inp, &xi);
854 			error = SYSCTL_OUT(req, &xi, sizeof xi);
855 			if (error) {
856 				INP_RUNLOCK(inp);
857 				break;
858 			}
859 		}
860 	}
861 
862 	if (!error) {
863 		/*
864 		 * Give the user an updated idea of our state.  If the
865 		 * generation differs from what we told her before, she knows
866 		 * that something happened while we were processing this
867 		 * request, and it might be necessary to retry.
868 		 */
869 		xig.xig_gen = V_udbinfo.ipi_gencnt;
870 		xig.xig_sogen = so_gencnt;
871 		xig.xig_count = V_udbinfo.ipi_count;
872 		error = SYSCTL_OUT(req, &xig, sizeof xig);
873 	}
874 
875 	return (error);
876 }
877 
878 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
879     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
880     udp_pcblist, "S,xinpcb",
881     "List of active UDP sockets");
882 
883 #ifdef INET
884 static int
885 udp_getcred(SYSCTL_HANDLER_ARGS)
886 {
887 	struct xucred xuc;
888 	struct sockaddr_in addrs[2];
889 	struct epoch_tracker et;
890 	struct inpcb *inp;
891 	int error;
892 
893 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
894 	if (error)
895 		return (error);
896 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
897 	if (error)
898 		return (error);
899 	NET_EPOCH_ENTER(et);
900 	inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
901 	    addrs[0].sin_addr, addrs[0].sin_port,
902 	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
903 	NET_EPOCH_EXIT(et);
904 	if (inp != NULL) {
905 		INP_RLOCK_ASSERT(inp);
906 		if (inp->inp_socket == NULL)
907 			error = ENOENT;
908 		if (error == 0)
909 			error = cr_canseeinpcb(req->td->td_ucred, inp);
910 		if (error == 0)
911 			cru2x(inp->inp_cred, &xuc);
912 		INP_RUNLOCK(inp);
913 	} else
914 		error = ENOENT;
915 	if (error == 0)
916 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
917 	return (error);
918 }
919 
920 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
921     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
922     0, 0, udp_getcred, "S,xucred",
923     "Get the xucred of a UDP connection");
924 #endif /* INET */
925 
/*
 * Socket option handler for UDP and UDP-Lite sockets.
 *
 * Options whose level is not the socket's own protocol are forwarded to
 * ip6_ctloutput() or ip_ctloutput().  At protocol level this handles
 * UDP_ENCAP (when IPsec support is compiled in) and the UDP-Lite checksum
 * coverage options; anything else yields ENOPROTOOPT.
 *
 * The inp write lock is taken on entry and released on each of the paths
 * written out below before returning.  Returns 0 or an errno value.
 */
int
udp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	struct udpcb *up;
	int isudplite, error, optval;

	error = 0;
	isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0;
	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	INP_WLOCK(inp);
	if (sopt->sopt_level != so->so_proto->pr_protocol) {
		/* Not ours: hand off to the IP layer matching the socket. */
#ifdef INET6
		if (INP_CHECK_SOCKAF(so, AF_INET6)) {
			INP_WUNLOCK(inp);
			error = ip6_ctloutput(so, sopt);
		}
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
		{
			INP_WUNLOCK(inp);
			error = ip_ctloutput(so, sopt);
		}
#endif
		return (error);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#ifdef INET
		case UDP_ENCAP:
			if (!IPSEC_ENABLED(ipv4)) {
				INP_WUNLOCK(inp);
				return (ENOPROTOOPT);
			}
			/*
			 * NOTE(review): no unlock follows this call, so
			 * UDPENCAP_PCBCTL() presumably drops the inp lock
			 * itself -- confirm against its implementation.
			 */
			error = UDPENCAP_PCBCTL(inp, sopt);
			break;
#endif /* INET */
#endif /* IPSEC */
		case UDPLITE_SEND_CSCOV:
		case UDPLITE_RECV_CSCOV:
			if (!isudplite) {
				INP_WUNLOCK(inp);
				error = ENOPROTOOPT;
				break;
			}
			/*
			 * The lock is dropped around the copy-in and taken
			 * again afterwards, so the pcb is looked up anew.
			 */
			INP_WUNLOCK(inp);
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0)
				break;
			inp = sotoinpcb(so);
			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
			INP_WLOCK(inp);
			up = intoudpcb(inp);
			KASSERT(up != NULL, ("%s: up == NULL", __func__));
			/* Accept 0, or any value from 8 through 65535. */
			if ((optval != 0 && optval < 8) || (optval > 65535)) {
				INP_WUNLOCK(inp);
				error = EINVAL;
				break;
			}
			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
				up->u_txcslen = optval;
			else
				up->u_rxcslen = optval;
			INP_WUNLOCK(inp);
			break;
		default:
			INP_WUNLOCK(inp);
			error = ENOPROTOOPT;
			break;
		}
		break;
	case SOPT_GET:
		switch (sopt->sopt_name) {
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#ifdef INET
		case UDP_ENCAP:
			if (!IPSEC_ENABLED(ipv4)) {
				INP_WUNLOCK(inp);
				return (ENOPROTOOPT);
			}
			/*
			 * NOTE(review): as in the SOPT_SET case, the lock is
			 * presumably released by UDPENCAP_PCBCTL() -- confirm.
			 */
			error = UDPENCAP_PCBCTL(inp, sopt);
			break;
#endif /* INET */
#endif /* IPSEC */
		case UDPLITE_SEND_CSCOV:
		case UDPLITE_RECV_CSCOV:
			if (!isudplite) {
				INP_WUNLOCK(inp);
				error = ENOPROTOOPT;
				break;
			}
			up = intoudpcb(inp);
			KASSERT(up != NULL, ("%s: up == NULL", __func__));
			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
				optval = up->u_txcslen;
			else
				optval = up->u_rxcslen;
			/* Snapshot taken; unlock before copying out. */
			INP_WUNLOCK(inp);
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;
		default:
			INP_WUNLOCK(inp);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
1043 
1044 #ifdef INET
1045 #ifdef INET6
1046 /* The logic here is derived from ip6_setpktopt(). See comments there. */
1047 static int
1048 udp_v4mapped_pktinfo(struct cmsghdr *cm, struct sockaddr_in * src,
1049     struct inpcb *inp, int flags)
1050 {
1051 	struct ifnet *ifp;
1052 	struct in6_pktinfo *pktinfo;
1053 	struct in_addr ia;
1054 
1055 	if ((flags & PRUS_IPV6) == 0)
1056 		return (0);
1057 
1058 	if (cm->cmsg_level != IPPROTO_IPV6)
1059 		return (0);
1060 
1061 	if  (cm->cmsg_type != IPV6_2292PKTINFO &&
1062 	    cm->cmsg_type != IPV6_PKTINFO)
1063 		return (0);
1064 
1065 	if (cm->cmsg_len !=
1066 	    CMSG_LEN(sizeof(struct in6_pktinfo)))
1067 		return (EINVAL);
1068 
1069 	pktinfo = (struct in6_pktinfo *)CMSG_DATA(cm);
1070 	if (!IN6_IS_ADDR_V4MAPPED(&pktinfo->ipi6_addr) &&
1071 	    !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr))
1072 		return (EINVAL);
1073 
1074 	/* Validate the interface index if specified. */
1075 	if (pktinfo->ipi6_ifindex) {
1076 		struct epoch_tracker et;
1077 
1078 		NET_EPOCH_ENTER(et);
1079 		ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
1080 		NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
1081 		if (ifp == NULL)
1082 			return (ENXIO);
1083 	} else
1084 		ifp = NULL;
1085 	if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
1086 		ia.s_addr = pktinfo->ipi6_addr.s6_addr32[3];
1087 		if (in_ifhasaddr(ifp, ia) == 0)
1088 			return (EADDRNOTAVAIL);
1089 	}
1090 
1091 	bzero(src, sizeof(*src));
1092 	src->sin_family = AF_INET;
1093 	src->sin_len = sizeof(*src);
1094 	src->sin_port = inp->inp_lport;
1095 	src->sin_addr.s_addr = pktinfo->ipi6_addr.s6_addr32[3];
1096 
1097 	return (0);
1098 }
1099 #endif
1100 
/*
 * Build one IPv4 UDP or UDP-Lite datagram around the payload chain m and
 * hand it to ip_output() on behalf of inp.
 *
 * inp		the sending pcb; locked and unlocked inside this function.
 * m		payload; on error it is freed here, otherwise it is passed
 *		to ip_output().
 * addr		optional explicit destination (treated as a sockaddr_in);
 *		when NULL the pcb's foreign address/port is used.
 * control	optional single mbuf of control messages; always freed here.
 * td		calling thread; its credential is used for bind/connect
 *		setup and for the jail destination rewrite.
 * flags	passed through to udp_v4mapped_pktinfo().
 *
 * Returns 0 or an errno value; errors produced directly here are EMSGSIZE,
 * EINVAL, ENOPROTOOPT, EISCONN, ENOTCONN, EAGAIN and ENOBUFS, in addition
 * to whatever the called helpers and ip_output() return.
 */
static int
udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
    struct mbuf *control, struct thread *td, int flags)
{
	struct udpiphdr *ui;
	int len = m->m_pkthdr.len;
	struct in_addr faddr, laddr;
	struct cmsghdr *cm;
	struct inpcbinfo *pcbinfo;
	struct sockaddr_in *sin, src;
	struct epoch_tracker et;
	int cscov_partial = 0;
	int error = 0;
	int ipflags = 0;
	u_short fport, lport;
	u_char tos;
	uint8_t pr;
	uint16_t cscov = 0;
	uint32_t flowid = 0;
	uint8_t flowtype = M_HASHTYPE_NONE;

	/* Payload plus UDP/IP header must fit in one IP packet. */
	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
		if (control)
			m_freem(control);
		m_freem(m);
		return (EMSGSIZE);
	}

	/*
	 * A sin_family of 0 means "no source override requested"; it is
	 * changed to AF_INET below if a control message supplies one.
	 */
	src.sin_family = 0;
	sin = (struct sockaddr_in *)addr;

	/*
	 * udp_output() may need to temporarily bind or connect the current
	 * inpcb.  As such, we don't know up front whether we will need the
	 * pcbinfo lock or not.  Do any work to decide what is needed up
	 * front before acquiring any locks.
	 *
	 * We will need network epoch in either case, to safely lookup into
	 * pcb hash.
	 */
	/*
	 * The write lock is taken when there is no explicit destination or
	 * when the pcb has neither a local address nor a local port;
	 * otherwise the read lock is taken.
	 */
	if (sin == NULL ||
	    (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0))
		INP_WLOCK(inp);
	else
		INP_RLOCK(inp);
	NET_EPOCH_ENTER(et);
	/* Default TOS comes from the pcb; an IP_TOS cmsg may override it. */
	tos = inp->inp_ip_tos;
	if (control != NULL) {
		/*
		 * XXX: Currently, we assume all the optional information is
		 * stored in a single mbuf.
		 */
		if (control->m_next) {
			m_freem(control);
			error = EINVAL;
			goto release;
		}
		/*
		 * Walk the control messages by advancing the mbuf's data
		 * pointer and shrinking its length by each aligned cmsg.
		 */
		for (; control->m_len > 0;
		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
			cm = mtod(control, struct cmsghdr *);
			/* Reject truncated, empty or overlong messages. */
			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
			    || cm->cmsg_len > control->m_len) {
				error = EINVAL;
				break;
			}
#ifdef INET6
			error = udp_v4mapped_pktinfo(cm, &src, inp, flags);
			if (error != 0)
				break;
#endif
			/* Only IPPROTO_IP level messages are handled below. */
			if (cm->cmsg_level != IPPROTO_IP)
				continue;

			switch (cm->cmsg_type) {
			case IP_SENDSRCADDR:
				if (cm->cmsg_len !=
				    CMSG_LEN(sizeof(struct in_addr))) {
					error = EINVAL;
					break;
				}
				bzero(&src, sizeof(src));
				src.sin_family = AF_INET;
				src.sin_len = sizeof(src);
				src.sin_port = inp->inp_lport;
				src.sin_addr =
				    *(struct in_addr *)CMSG_DATA(cm);
				break;

			case IP_TOS:
				if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
					error = EINVAL;
					break;
				}
				tos = *(u_char *)CMSG_DATA(cm);
				break;

			case IP_FLOWID:
				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
					error = EINVAL;
					break;
				}
				flowid = *(uint32_t *) CMSG_DATA(cm);
				break;

			case IP_FLOWTYPE:
				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
					error = EINVAL;
					break;
				}
				/* Note: stored into a uint8_t local. */
				flowtype = *(uint32_t *) CMSG_DATA(cm);
				break;

#ifdef	RSS
			case IP_RSSBUCKETID:
				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
					error = EINVAL;
					break;
				}
				/* This is just a placeholder for now */
				break;
#endif	/* RSS */
			default:
				error = ENOPROTOOPT;
				break;
			}
			/* The switch's break only left the switch. */
			if (error)
				break;
		}
		m_freem(control);
		control = NULL;
	}
	if (error)
		goto release;

	pr = inp->inp_socket->so_proto->pr_protocol;
	pcbinfo = udp_get_inpcbinfo(pr);

	/*
	 * If the IP_SENDSRCADDR control message was specified, override the
	 * source address for this datagram.  Its use is invalidated if the
	 * address thus specified is incomplete or clobbers other inpcbs.
	 */
	laddr = inp->inp_laddr;
	lport = inp->inp_lport;
	if (src.sin_family == AF_INET) {
		if ((lport == 0) ||
		    (laddr.s_addr == INADDR_ANY &&
		     src.sin_addr.s_addr == INADDR_ANY)) {
			error = EINVAL;
			goto release;
		}
		/* Results land in the local laddr/lport copies only. */
		INP_HASH_WLOCK(pcbinfo);
		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
		    &laddr.s_addr, &lport, td->td_ucred);
		INP_HASH_WUNLOCK(pcbinfo);
		if (error)
			goto release;
	}

	/*
	 * If a UDP socket has been connected, then a local address/port will
	 * have been selected and bound.
	 *
	 * If a UDP socket has not been connected to, then an explicit
	 * destination address must be used, in which case a local
	 * address/port may not have been selected and bound.
	 */
	if (sin != NULL) {
		INP_LOCK_ASSERT(inp);
		/* An explicit destination on a connected pcb is an error. */
		if (inp->inp_faddr.s_addr != INADDR_ANY) {
			error = EISCONN;
			goto release;
		}

		/*
		 * Jail may rewrite the destination address, so let it do
		 * that before we use it.
		 */
		error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
		if (error)
			goto release;

		/*
		 * If a local address or port hasn't yet been selected, or if
		 * the destination address needs to be rewritten due to using
		 * a special INADDR_ constant, invoke in_pcbconnect_setup()
		 * to do the heavy lifting.  Once a port is selected, we
		 * commit the binding back to the socket; we also commit the
		 * binding of the address if in jail.
		 *
		 * If we already have a valid binding and we're not
		 * requesting a destination address rewrite, use a fast path.
		 */
		if (inp->inp_laddr.s_addr == INADDR_ANY ||
		    inp->inp_lport == 0 ||
		    sin->sin_addr.s_addr == INADDR_ANY ||
		    sin->sin_addr.s_addr == INADDR_BROADCAST) {
			INP_HASH_WLOCK(pcbinfo);
			error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
			    &lport, &faddr.s_addr, &fport, NULL,
			    td->td_ucred);
			if (error) {
				INP_HASH_WUNLOCK(pcbinfo);
				goto release;
			}

			/*
			 * XXXRW: Why not commit the port if the address is
			 * !INADDR_ANY?
			 */
			/* Commit the local port if newly assigned. */
			if (inp->inp_laddr.s_addr == INADDR_ANY &&
			    inp->inp_lport == 0) {
				/*
				 * This is the same condition under which the
				 * write lock was taken at function entry.
				 */
				INP_WLOCK_ASSERT(inp);
				/*
				 * Remember addr if jailed, to prevent
				 * rebinding.
				 */
				if (prison_flag(td->td_ucred, PR_IP4))
					inp->inp_laddr = laddr;
				inp->inp_lport = lport;
				error = in_pcbinshash(inp);
				INP_HASH_WUNLOCK(pcbinfo);
				if (error != 0) {
					/* Roll the port back; report EAGAIN. */
					inp->inp_lport = 0;
					error = EAGAIN;
					goto release;
				}
				inp->inp_flags |= INP_ANONPORT;
			} else
				INP_HASH_WUNLOCK(pcbinfo);
		} else {
			/* Fast path: use the caller's destination as is. */
			faddr = sin->sin_addr;
			fport = sin->sin_port;
		}
	} else {
		INP_LOCK_ASSERT(inp);
		/* No explicit destination: the pcb must be connected. */
		faddr = inp->inp_faddr;
		fport = inp->inp_fport;
		if (faddr.s_addr == INADDR_ANY) {
			error = ENOTCONN;
			goto release;
		}
	}

	/*
	 * Calculate data length and get a mbuf for UDP, IP, and possible
	 * link-layer headers.  Immediately slide the data pointer back
	 * forward since we won't use that space at this layer.
	 */
	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
	if (m == NULL) {
		/* m is NULL here, so the m_freem() at release is a no-op. */
		error = ENOBUFS;
		goto release;
	}
	m->m_data += max_linkhdr;
	m->m_len -= max_linkhdr;
	m->m_pkthdr.len -= max_linkhdr;

	/*
	 * Fill in mbuf with extended UDP header and addresses and length put
	 * into network format.
	 */
	ui = mtod(m, struct udpiphdr *);
	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
	ui->ui_v = IPVERSION << 4;
	ui->ui_pr = pr;
	ui->ui_src = laddr;
	ui->ui_dst = faddr;
	ui->ui_sport = lport;
	ui->ui_dport = fport;
	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
	if (pr == IPPROTO_UDPLITE) {
		struct udpcb *up;
		uint16_t plen;

		up = intoudpcb(inp);
		cscov = up->u_txcslen;
		plen = (u_short)len + sizeof(struct udphdr);
		/* Coverage at or beyond the packet length means full. */
		if (cscov >= plen)
			cscov = 0;
		/* UDP-Lite stores the coverage in the UDP length field. */
		ui->ui_len = htons(plen);
		ui->ui_ulen = htons(cscov);
		/*
		 * For UDP-Lite, checksum coverage length of zero means
		 * the entire UDPLite packet is covered by the checksum.
		 */
		cscov_partial = (cscov == 0) ? 0 : 1;
	}

	/*
	 * Set the Don't Fragment bit in the IP header.
	 */
	if (inp->inp_flags & INP_DONTFRAG) {
		struct ip *ip;

		ip = (struct ip *)&ui->ui_i;
		ip->ip_off |= htons(IP_DF);
	}

	/* Map socket and pcb options onto ip_output() flags. */
	if (inp->inp_socket->so_options & SO_DONTROUTE)
		ipflags |= IP_ROUTETOIF;
	if (inp->inp_socket->so_options & SO_BROADCAST)
		ipflags |= IP_ALLOWBROADCAST;
	if (inp->inp_flags & INP_ONESBCAST)
		ipflags |= IP_SENDONES;

#ifdef MAC
	mac_inpcb_create_mbuf(inp, m);
#endif

	/*
	 * Set up checksum and output datagram.
	 */
	ui->ui_sum = 0;
	if (pr == IPPROTO_UDPLITE) {
		if (inp->inp_flags & INP_ONESBCAST)
			faddr.s_addr = INADDR_BROADCAST;
		/* UDP-Lite: checksum computed here; 0 is sent as 0xffff. */
		if (cscov_partial) {
			if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0)
				ui->ui_sum = 0xffff;
		} else {
			if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0)
				ui->ui_sum = 0xffff;
		}
	} else if (V_udp_cksum) {
		if (inp->inp_flags & INP_ONESBCAST)
			faddr.s_addr = INADDR_BROADCAST;
		/*
		 * UDP: only the pseudo-header sum is stored here; the mbuf
		 * is flagged CSUM_UDP with the checksum field offset.
		 */
		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
		    htons((u_short)len + sizeof(struct udphdr) + pr));
		m->m_pkthdr.csum_flags = CSUM_UDP;
		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
	}
	((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
	((struct ip *)ui)->ip_tos = tos;		/* XXX */
	UDPSTAT_INC(udps_opackets);

	/*
	 * Setup flowid / RSS information for outbound socket.
	 *
	 * Once the UDP code decides to set a flowid some other way,
	 * this allows the flowid to be overridden by userland.
	 */
	if (flowtype != M_HASHTYPE_NONE) {
		m->m_pkthdr.flowid = flowid;
		M_HASHTYPE_SET(m, flowtype);
	}
#if defined(ROUTE_MPATH) || defined(RSS)
	else if (CALC_FLOWID_OUTBOUND_SENDTO) {
		uint32_t hash_val, hash_type;

		hash_val = fib4_calc_packet_hash(laddr, faddr,
		    lport, fport, pr, &hash_type);
		m->m_pkthdr.flowid = hash_val;
		M_HASHTYPE_SET(m, hash_type);
	}

	/*
	 * Don't override with the inp cached flowid value.
	 *
	 * Depending upon the kind of send being done, the inp
	 * flowid/flowtype values may actually not be appropriate
	 * for this particular socket send.
	 *
	 * We should either leave the flowid at zero (which is what is
	 * currently done) or set it to some software generated
	 * hash value based on the packet contents.
	 */
	ipflags |= IP_NODEFAULTFLOWID;
#endif	/* RSS */

	if (pr == IPPROTO_UDPLITE)
		UDPLITE_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
	else
		UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
	/* The pcb's cached route is only passed when write-locked. */
	error = ip_output(m, inp->inp_options,
	    INP_WLOCKED(inp) ? &inp->inp_route : NULL, ipflags,
	    inp->inp_moptions, inp);
	INP_UNLOCK(inp);
	NET_EPOCH_EXIT(et);
	return (error);

release:
	/* Common error exit: drop the lock and epoch, free the payload. */
	INP_UNLOCK(inp);
	NET_EPOCH_EXIT(et);
	m_freem(m);
	return (error);
}
1491 
1492 static void
1493 udp_abort(struct socket *so)
1494 {
1495 	struct inpcb *inp;
1496 	struct inpcbinfo *pcbinfo;
1497 
1498 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1499 	inp = sotoinpcb(so);
1500 	KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
1501 	INP_WLOCK(inp);
1502 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1503 		INP_HASH_WLOCK(pcbinfo);
1504 		in_pcbdisconnect(inp);
1505 		inp->inp_laddr.s_addr = INADDR_ANY;
1506 		INP_HASH_WUNLOCK(pcbinfo);
1507 		soisdisconnected(so);
1508 	}
1509 	INP_WUNLOCK(inp);
1510 }
1511 
1512 static int
1513 udp_attach(struct socket *so, int proto, struct thread *td)
1514 {
1515 	static uint32_t udp_flowid;
1516 	struct inpcb *inp;
1517 	struct inpcbinfo *pcbinfo;
1518 	int error;
1519 
1520 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1521 	inp = sotoinpcb(so);
1522 	KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
1523 	error = soreserve(so, udp_sendspace, udp_recvspace);
1524 	if (error)
1525 		return (error);
1526 	error = in_pcballoc(so, pcbinfo);
1527 	if (error)
1528 		return (error);
1529 
1530 	inp = sotoinpcb(so);
1531 	inp->inp_vflag |= INP_IPV4;
1532 	inp->inp_ip_ttl = V_ip_defttl;
1533 	inp->inp_flowid = atomic_fetchadd_int(&udp_flowid, 1);
1534 	inp->inp_flowtype = M_HASHTYPE_OPAQUE;
1535 
1536 	error = udp_newudpcb(inp);
1537 	if (error) {
1538 		in_pcbdetach(inp);
1539 		in_pcbfree(inp);
1540 		return (error);
1541 	}
1542 	INP_WUNLOCK(inp);
1543 
1544 	return (0);
1545 }
1546 #endif /* INET */
1547 
1548 int
1549 udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx)
1550 {
1551 	struct inpcb *inp;
1552 	struct udpcb *up;
1553 
1554 	KASSERT(so->so_type == SOCK_DGRAM,
1555 	    ("udp_set_kernel_tunneling: !dgram"));
1556 	inp = sotoinpcb(so);
1557 	KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
1558 	INP_WLOCK(inp);
1559 	up = intoudpcb(inp);
1560 	if ((f != NULL || i != NULL) && ((up->u_tun_func != NULL) ||
1561 	    (up->u_icmp_func != NULL))) {
1562 		INP_WUNLOCK(inp);
1563 		return (EBUSY);
1564 	}
1565 	up->u_tun_func = f;
1566 	up->u_icmp_func = i;
1567 	up->u_tun_ctx = ctx;
1568 	INP_WUNLOCK(inp);
1569 	return (0);
1570 }
1571 
1572 #ifdef INET
1573 static int
1574 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
1575 {
1576 	struct inpcb *inp;
1577 	struct inpcbinfo *pcbinfo;
1578 	struct sockaddr_in *sinp;
1579 	int error;
1580 
1581 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1582 	inp = sotoinpcb(so);
1583 	KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
1584 
1585 	sinp = (struct sockaddr_in *)nam;
1586 	if (nam->sa_family != AF_INET) {
1587 		/*
1588 		 * Preserve compatibility with old programs.
1589 		 */
1590 		if (nam->sa_family != AF_UNSPEC ||
1591 		    nam->sa_len < offsetof(struct sockaddr_in, sin_zero) ||
1592 		    sinp->sin_addr.s_addr != INADDR_ANY)
1593 			return (EAFNOSUPPORT);
1594 		nam->sa_family = AF_INET;
1595 	}
1596 	if (nam->sa_len != sizeof(struct sockaddr_in))
1597 		return (EINVAL);
1598 
1599 	INP_WLOCK(inp);
1600 	INP_HASH_WLOCK(pcbinfo);
1601 	error = in_pcbbind(inp, nam, td->td_ucred);
1602 	INP_HASH_WUNLOCK(pcbinfo);
1603 	INP_WUNLOCK(inp);
1604 	return (error);
1605 }
1606 
1607 static void
1608 udp_close(struct socket *so)
1609 {
1610 	struct inpcb *inp;
1611 	struct inpcbinfo *pcbinfo;
1612 
1613 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1614 	inp = sotoinpcb(so);
1615 	KASSERT(inp != NULL, ("udp_close: inp == NULL"));
1616 	INP_WLOCK(inp);
1617 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1618 		INP_HASH_WLOCK(pcbinfo);
1619 		in_pcbdisconnect(inp);
1620 		inp->inp_laddr.s_addr = INADDR_ANY;
1621 		INP_HASH_WUNLOCK(pcbinfo);
1622 		soisdisconnected(so);
1623 	}
1624 	INP_WUNLOCK(inp);
1625 }
1626 
1627 static int
1628 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1629 {
1630 	struct epoch_tracker et;
1631 	struct inpcb *inp;
1632 	struct inpcbinfo *pcbinfo;
1633 	struct sockaddr_in *sin;
1634 	int error;
1635 
1636 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1637 	inp = sotoinpcb(so);
1638 	KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
1639 
1640 	sin = (struct sockaddr_in *)nam;
1641 	if (sin->sin_family != AF_INET)
1642 		return (EAFNOSUPPORT);
1643 	if (sin->sin_len != sizeof(*sin))
1644 		return (EINVAL);
1645 
1646 	INP_WLOCK(inp);
1647 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1648 		INP_WUNLOCK(inp);
1649 		return (EISCONN);
1650 	}
1651 	error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1652 	if (error != 0) {
1653 		INP_WUNLOCK(inp);
1654 		return (error);
1655 	}
1656 	NET_EPOCH_ENTER(et);
1657 	INP_HASH_WLOCK(pcbinfo);
1658 	error = in_pcbconnect(inp, nam, td->td_ucred, true);
1659 	INP_HASH_WUNLOCK(pcbinfo);
1660 	NET_EPOCH_EXIT(et);
1661 	if (error == 0)
1662 		soisconnected(so);
1663 	INP_WUNLOCK(inp);
1664 	return (error);
1665 }
1666 
1667 static void
1668 udp_detach(struct socket *so)
1669 {
1670 	struct inpcb *inp;
1671 	struct udpcb *up;
1672 
1673 	inp = sotoinpcb(so);
1674 	KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
1675 	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
1676 	    ("udp_detach: not disconnected"));
1677 	INP_WLOCK(inp);
1678 	up = intoudpcb(inp);
1679 	KASSERT(up != NULL, ("%s: up == NULL", __func__));
1680 	inp->inp_ppcb = NULL;
1681 	in_pcbdetach(inp);
1682 	in_pcbfree(inp);
1683 	udp_discardcb(up);
1684 }
1685 
1686 static int
1687 udp_disconnect(struct socket *so)
1688 {
1689 	struct inpcb *inp;
1690 	struct inpcbinfo *pcbinfo;
1691 
1692 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1693 	inp = sotoinpcb(so);
1694 	KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
1695 	INP_WLOCK(inp);
1696 	if (inp->inp_faddr.s_addr == INADDR_ANY) {
1697 		INP_WUNLOCK(inp);
1698 		return (ENOTCONN);
1699 	}
1700 	INP_HASH_WLOCK(pcbinfo);
1701 	in_pcbdisconnect(inp);
1702 	inp->inp_laddr.s_addr = INADDR_ANY;
1703 	INP_HASH_WUNLOCK(pcbinfo);
1704 	SOCK_LOCK(so);
1705 	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
1706 	SOCK_UNLOCK(so);
1707 	INP_WUNLOCK(inp);
1708 	return (0);
1709 }
1710 
1711 static int
1712 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
1713     struct mbuf *control, struct thread *td)
1714 {
1715 	struct inpcb *inp;
1716 	int error;
1717 
1718 	inp = sotoinpcb(so);
1719 	KASSERT(inp != NULL, ("udp_send: inp == NULL"));
1720 
1721 	if (addr != NULL) {
1722 		error = 0;
1723 		if (addr->sa_family != AF_INET)
1724 			error = EAFNOSUPPORT;
1725 		else if (addr->sa_len != sizeof(struct sockaddr_in))
1726 			error = EINVAL;
1727 		if (__predict_false(error != 0)) {
1728 			m_freem(control);
1729 			m_freem(m);
1730 			return (error);
1731 		}
1732 	}
1733 	return (udp_output(inp, m, addr, control, td, flags));
1734 }
1735 #endif /* INET */
1736 
1737 int
1738 udp_shutdown(struct socket *so)
1739 {
1740 	struct inpcb *inp;
1741 
1742 	inp = sotoinpcb(so);
1743 	KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
1744 	INP_WLOCK(inp);
1745 	socantsendmore(so);
1746 	INP_WUNLOCK(inp);
1747 	return (0);
1748 }
1749 
1750 #ifdef INET
1751 struct pr_usrreqs udp_usrreqs = {
1752 	.pru_abort =		udp_abort,
1753 	.pru_attach =		udp_attach,
1754 	.pru_bind =		udp_bind,
1755 	.pru_connect =		udp_connect,
1756 	.pru_control =		in_control,
1757 	.pru_detach =		udp_detach,
1758 	.pru_disconnect =	udp_disconnect,
1759 	.pru_peeraddr =		in_getpeeraddr,
1760 	.pru_send =		udp_send,
1761 	.pru_soreceive =	soreceive_dgram,
1762 	.pru_sosend =		sosend_dgram,
1763 	.pru_shutdown =		udp_shutdown,
1764 	.pru_sockaddr =		in_getsockaddr,
1765 	.pru_sosetlabel =	in_pcbsosetlabel,
1766 	.pru_close =		udp_close,
1767 };
1768 #endif /* INET */
1769