xref: /freebsd/sys/netinet/udp_usrreq.c (revision b28624fde638caadd4a89f50c9b7e7da0f98c4d2)
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3  *	The Regents of the University of California.
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 4. Neither the name of the University nor the names of its contributors
15  *    may be used to endorse or promote products derived from this software
16  *    without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
31  * $FreeBSD$
32  */
33 
34 #include "opt_ipfw.h"
35 #include "opt_inet6.h"
36 #include "opt_ipsec.h"
37 #include "opt_mac.h"
38 
39 #include <sys/param.h>
40 #include <sys/domain.h>
41 #include <sys/eventhandler.h>
42 #include <sys/jail.h>
43 #include <sys/kernel.h>
44 #include <sys/lock.h>
45 #include <sys/malloc.h>
46 #include <sys/mbuf.h>
47 #include <sys/priv.h>
48 #include <sys/proc.h>
49 #include <sys/protosw.h>
50 #include <sys/signalvar.h>
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
53 #include <sys/sx.h>
54 #include <sys/sysctl.h>
55 #include <sys/syslog.h>
56 #include <sys/systm.h>
57 
58 #include <vm/uma.h>
59 
60 #include <net/if.h>
61 #include <net/route.h>
62 
63 #include <netinet/in.h>
64 #include <netinet/in_pcb.h>
65 #include <netinet/in_systm.h>
66 #include <netinet/in_var.h>
67 #include <netinet/ip.h>
68 #ifdef INET6
69 #include <netinet/ip6.h>
70 #endif
71 #include <netinet/ip_icmp.h>
72 #include <netinet/icmp_var.h>
73 #include <netinet/ip_var.h>
74 #include <netinet/ip_options.h>
75 #ifdef INET6
76 #include <netinet6/ip6_var.h>
77 #endif
78 #include <netinet/udp.h>
79 #include <netinet/udp_var.h>
80 
81 #ifdef IPSEC
82 #include <netipsec/ipsec.h>
83 #endif
84 
85 #include <machine/in_cksum.h>
86 
87 #include <security/mac/mac_framework.h>
88 
89 /*
90  * UDP protocol implementation.
91  * Per RFC 768, August, 1980.
92  */
93 
/*
 * BSD 4.2 defaulted the UDP checksum to be off.  Turning off UDP checksums
 * removes the only data integrity mechanism for UDP packets: malformed
 * packets that would otherwise be discarded due to bad checksums are
 * accepted instead, which may cause problems (especially for NFS data
 * blocks).
 */
/* Nonzero: udp_output() sets up a checksum on outgoing datagrams. */
static int	udp_cksum = 1;
SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, &udp_cksum,
    0, "");

/* Nonzero: udp_input() logs datagrams for which no matching pcb exists. */
int	udp_log_in_vain = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
    &udp_log_in_vain, 0, "Log all incoming UDP packets");

/*
 * Nonzero: udp_input() silently drops datagrams for which no matching pcb
 * exists instead of answering with an ICMP port unreachable.
 */
int	udp_blackhole = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, &udp_blackhole, 0,
    "Do not send port unreachables for refused connects");

/* Send space passed to soreserve() by udp_attach(). */
u_long	udp_sendspace = 9216;		/* really max datagram size */
SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
    &udp_sendspace, 0, "Maximum outgoing UDP datagram size");

/*
 * Receive space passed to soreserve() by udp_attach(): room for
 * 40 1K datagrams, each accompanied by a source address.
 */
u_long	udp_recvspace = 40 * (1024 +
#ifdef INET6
				      sizeof(struct sockaddr_in6)
#else
				      sizeof(struct sockaddr_in)
#endif
				      );

SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
    &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");

/* List of all UDP pcbs and the pcbinfo describing it; set up in udp_init(). */
struct inpcbhead	udb;		/* from udp_var.h */
struct inpcbinfo	udbinfo;

/* Size handed to hashinit() for both pcb hash tables; may be overridden. */
#ifndef UDBHASHSIZE
#define	UDBHASHSIZE	16
#endif

/* Protocol counters, updated throughout this file. */
struct udpstat	udpstat;	/* from udp_var.h */
SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, &udpstat,
    udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");

/* Forward declarations of file-local routines defined further down. */
static void	udp_detach(struct socket *so);
static int	udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
		    struct mbuf *, struct thread *);
142 
143 static void
144 udp_zone_change(void *tag)
145 {
146 
147 	uma_zone_set_max(udbinfo.ipi_zone, maxsockets);
148 }
149 
/*
 * Item-init callback handed to uma_zcreate() in udp_init(): initialize the
 * lock of the inpcb stored at 'mem'.  'size' and 'flags' are not used.
 * Always returns 0.
 */
static int
udp_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *pcb = mem;

	INP_LOCK_INIT(pcb, "inp", "udpinp");
	return (0);
}
159 
160 void
161 udp_init(void)
162 {
163 
164 	INP_INFO_LOCK_INIT(&udbinfo, "udp");
165 	LIST_INIT(&udb);
166 	udbinfo.ipi_listhead = &udb;
167 	udbinfo.ipi_hashbase = hashinit(UDBHASHSIZE, M_PCB,
168 	    &udbinfo.ipi_hashmask);
169 	udbinfo.ipi_porthashbase = hashinit(UDBHASHSIZE, M_PCB,
170 	    &udbinfo.ipi_porthashmask);
171 	udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL,
172 	    NULL, udp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
173 	uma_zone_set_max(udbinfo.ipi_zone, maxsockets);
174 	EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
175 	    EVENTHANDLER_PRI_ANY);
176 }
177 
178 /*
179  * Subroutine of udp_input(), which appends the provided mbuf chain to the
180  * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
181  * contains the source address.  If the socket ends up being an IPv6 socket,
182  * udp_append() will convert to a sockaddr_in6 before passing the address
183  * into the socket code.
184  */
185 static void
186 udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
187     struct sockaddr_in *udp_in)
188 {
189 	struct sockaddr *append_sa;
190 	struct socket *so;
191 	struct mbuf *opts = 0;
192 #ifdef INET6
193 	struct sockaddr_in6 udp_in6;
194 #endif
195 
196 	INP_LOCK_ASSERT(inp);
197 
198 #ifdef IPSEC
199 	/* Check AH/ESP integrity. */
200 	if (ipsec4_in_reject(n, inp)) {
201 		m_freem(n);
202 		ipsec4stat.in_polvio++;
203 		return;
204 	}
205 #endif /* IPSEC */
206 #ifdef MAC
207 	if (mac_check_inpcb_deliver(inp, n) != 0) {
208 		m_freem(n);
209 		return;
210 	}
211 #endif
212 	if (inp->inp_flags & INP_CONTROLOPTS ||
213 	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
214 #ifdef INET6
215 		if (inp->inp_vflag & INP_IPV6) {
216 			int savedflags;
217 
218 			savedflags = inp->inp_flags;
219 			inp->inp_flags &= ~INP_UNMAPPABLEOPTS;
220 			ip6_savecontrol(inp, n, &opts);
221 			inp->inp_flags = savedflags;
222 		} else
223 #endif
224 			ip_savecontrol(inp, &opts, ip, n);
225 	}
226 #ifdef INET6
227 	if (inp->inp_vflag & INP_IPV6) {
228 		bzero(&udp_in6, sizeof(udp_in6));
229 		udp_in6.sin6_len = sizeof(udp_in6);
230 		udp_in6.sin6_family = AF_INET6;
231 		in6_sin_2_v4mapsin6(udp_in, &udp_in6);
232 		append_sa = (struct sockaddr *)&udp_in6;
233 	} else
234 #endif
235 		append_sa = (struct sockaddr *)udp_in;
236 	m_adj(n, off);
237 
238 	so = inp->inp_socket;
239 	SOCKBUF_LOCK(&so->so_rcv);
240 	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
241 		SOCKBUF_UNLOCK(&so->so_rcv);
242 		m_freem(n);
243 		if (opts)
244 			m_freem(opts);
245 		udpstat.udps_fullsock++;
246 	} else
247 		sorwakeup_locked(so);
248 }
249 
250 void
251 udp_input(struct mbuf *m, int off)
252 {
253 	int iphlen = off;
254 	struct ip *ip;
255 	struct udphdr *uh;
256 	struct ifnet *ifp;
257 	struct inpcb *inp;
258 	int len;
259 	struct ip save_ip;
260 	struct sockaddr_in udp_in;
261 #ifdef IPFIREWALL_FORWARD
262 	struct m_tag *fwd_tag;
263 #endif
264 
265 	ifp = m->m_pkthdr.rcvif;
266 	udpstat.udps_ipackets++;
267 
268 	/*
269 	 * Strip IP options, if any; should skip this, make available to
270 	 * user, and use on returned packets, but we don't yet have a way to
271 	 * check the checksum with options still present.
272 	 */
273 	if (iphlen > sizeof (struct ip)) {
274 		ip_stripoptions(m, (struct mbuf *)0);
275 		iphlen = sizeof(struct ip);
276 	}
277 
278 	/*
279 	 * Get IP and UDP header together in first mbuf.
280 	 */
281 	ip = mtod(m, struct ip *);
282 	if (m->m_len < iphlen + sizeof(struct udphdr)) {
283 		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) {
284 			udpstat.udps_hdrops++;
285 			return;
286 		}
287 		ip = mtod(m, struct ip *);
288 	}
289 	uh = (struct udphdr *)((caddr_t)ip + iphlen);
290 
291 	/*
292 	 * Destination port of 0 is illegal, based on RFC768.
293 	 */
294 	if (uh->uh_dport == 0)
295 		goto badunlocked;
296 
297 	/*
298 	 * Construct sockaddr format source address.  Stuff source address
299 	 * and datagram in user buffer.
300 	 */
301 	bzero(&udp_in, sizeof(udp_in));
302 	udp_in.sin_len = sizeof(udp_in);
303 	udp_in.sin_family = AF_INET;
304 	udp_in.sin_port = uh->uh_sport;
305 	udp_in.sin_addr = ip->ip_src;
306 
307 	/*
308 	 * Make mbuf data length reflect UDP length.  If not enough data to
309 	 * reflect UDP length, drop.
310 	 */
311 	len = ntohs((u_short)uh->uh_ulen);
312 	if (ip->ip_len != len) {
313 		if (len > ip->ip_len || len < sizeof(struct udphdr)) {
314 			udpstat.udps_badlen++;
315 			goto badunlocked;
316 		}
317 		m_adj(m, len - ip->ip_len);
318 		/* ip->ip_len = len; */
319 	}
320 
321 	/*
322 	 * Save a copy of the IP header in case we want restore it for
323 	 * sending an ICMP error message in response.
324 	 */
325 	if (!udp_blackhole)
326 		save_ip = *ip;
327 	else
328 		memset(&save_ip, 0, sizeof(save_ip));
329 
330 	/*
331 	 * Checksum extended UDP header and data.
332 	 */
333 	if (uh->uh_sum) {
334 		u_short uh_sum;
335 
336 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
337 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
338 				uh_sum = m->m_pkthdr.csum_data;
339 			else
340 				uh_sum = in_pseudo(ip->ip_src.s_addr,
341 				    ip->ip_dst.s_addr, htonl((u_short)len +
342 				    m->m_pkthdr.csum_data + IPPROTO_UDP));
343 			uh_sum ^= 0xffff;
344 		} else {
345 			char b[9];
346 
347 			bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
348 			bzero(((struct ipovly *)ip)->ih_x1, 9);
349 			((struct ipovly *)ip)->ih_len = uh->uh_ulen;
350 			uh_sum = in_cksum(m, len + sizeof (struct ip));
351 			bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
352 		}
353 		if (uh_sum) {
354 			udpstat.udps_badsum++;
355 			m_freem(m);
356 			return;
357 		}
358 	} else
359 		udpstat.udps_nosum++;
360 
361 #ifdef IPFIREWALL_FORWARD
362 	/*
363 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
364 	 */
365 	fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
366 	if (fwd_tag != NULL) {
367 		struct sockaddr_in *next_hop;
368 
369 		/*
370 		 * Do the hack.
371 		 */
372 		next_hop = (struct sockaddr_in *)(fwd_tag + 1);
373 		ip->ip_dst = next_hop->sin_addr;
374 		uh->uh_dport = ntohs(next_hop->sin_port);
375 
376 		/*
377 		 * Remove the tag from the packet.  We don't need it anymore.
378 		 */
379 		m_tag_delete(m, fwd_tag);
380 	}
381 #endif
382 
383 	INP_INFO_RLOCK(&udbinfo);
384 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
385 	    in_broadcast(ip->ip_dst, ifp)) {
386 		struct inpcb *last;
387 		struct ip_moptions *imo;
388 
389 		last = NULL;
390 		LIST_FOREACH(inp, &udb, inp_list) {
391 			if (inp->inp_lport != uh->uh_dport)
392 				continue;
393 #ifdef INET6
394 			if ((inp->inp_vflag & INP_IPV4) == 0)
395 				continue;
396 #endif
397 			if (inp->inp_laddr.s_addr != INADDR_ANY &&
398 			    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
399 				continue;
400 			if (inp->inp_faddr.s_addr != INADDR_ANY &&
401 			    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
402 				continue;
403 			/*
404 			 * XXX: Do not check source port of incoming datagram
405 			 * unless inp_connect() has been called to bind the
406 			 * fport part of the 4-tuple; the source could be
407 			 * trying to talk to us with an ephemeral port.
408 			 */
409 			if (inp->inp_fport != 0 &&
410 			    inp->inp_fport != uh->uh_sport)
411 				continue;
412 
413 			INP_LOCK(inp);
414 
415 			/*
416 			 * Handle socket delivery policy for any-source
417 			 * and source-specific multicast. [RFC3678]
418 			 */
419 			imo = inp->inp_moptions;
420 			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
421 			    imo != NULL) {
422 				struct sockaddr_in	 sin;
423 				struct in_msource	*ims;
424 				int			 blocked, mode;
425 				size_t			 idx;
426 
427 				bzero(&sin, sizeof(struct sockaddr_in));
428 				sin.sin_len = sizeof(struct sockaddr_in);
429 				sin.sin_family = AF_INET;
430 				sin.sin_addr = ip->ip_dst;
431 
432 				blocked = 0;
433 				idx = imo_match_group(imo, ifp,
434 				    (struct sockaddr *)&sin);
435 				if (idx == -1) {
436 					/*
437 					 * No group membership for this socket.
438 					 * Do not bump udps_noportbcast, as
439 					 * this will happen further down.
440 					 */
441 					blocked++;
442 				} else {
443 					/*
444 					 * Check for a multicast source filter
445 					 * entry on this socket for this group.
446 					 * MCAST_EXCLUDE is the default
447 					 * behaviour.  It means default accept;
448 					 * entries, if present, denote sources
449 					 * to be excluded from delivery.
450 					 */
451 					ims = imo_match_source(imo, idx,
452 					    (struct sockaddr *)&udp_in);
453 					mode = imo->imo_mfilters[idx].imf_fmode;
454 					if ((ims != NULL &&
455 					     mode == MCAST_EXCLUDE) ||
456 					    (ims == NULL &&
457 					     mode == MCAST_INCLUDE)) {
458 #ifdef DIAGNOSTIC
459 						if (bootverbose) {
460 							printf("%s: blocked by"
461 							    " source filter\n",
462 							    __func__);
463 						}
464 #endif
465 						udpstat.udps_filtermcast++;
466 						blocked++;
467 					}
468 				}
469 				if (blocked != 0) {
470 					INP_UNLOCK(inp);
471 					continue;
472 				}
473 			}
474 			if (last != NULL) {
475 				struct mbuf *n;
476 
477 				n = m_copy(m, 0, M_COPYALL);
478 				if (n != NULL)
479 					udp_append(last, ip, n, iphlen +
480 					    sizeof(struct udphdr), &udp_in);
481 				INP_UNLOCK(last);
482 			}
483 			last = inp;
484 			/*
485 			 * Don't look for additional matches if this one does
486 			 * not have either the SO_REUSEPORT or SO_REUSEADDR
487 			 * socket options set.  This heuristic avoids
488 			 * searching through all pcbs in the common case of a
489 			 * non-shared port.  It assumes that an application
490 			 * will never clear these options after setting them.
491 			 */
492 			if ((last->inp_socket->so_options &
493 			    (SO_REUSEPORT|SO_REUSEADDR)) == 0)
494 				break;
495 		}
496 
497 		if (last == NULL) {
498 			/*
499 			 * No matching pcb found; discard datagram.  (No need
500 			 * to send an ICMP Port Unreachable for a broadcast
501 			 * or multicast datgram.)
502 			 */
503 			udpstat.udps_noportbcast++;
504 			goto badheadlocked;
505 		}
506 		udp_append(last, ip, m, iphlen + sizeof(struct udphdr),
507 		    &udp_in);
508 		INP_UNLOCK(last);
509 		INP_INFO_RUNLOCK(&udbinfo);
510 		return;
511 	}
512 
513 	/*
514 	 * Locate pcb for datagram.
515 	 */
516 	inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport,
517 	    ip->ip_dst, uh->uh_dport, 1, ifp);
518 	if (inp == NULL) {
519 		if (udp_log_in_vain) {
520 			char buf[4*sizeof "123"];
521 
522 			strcpy(buf, inet_ntoa(ip->ip_dst));
523 			log(LOG_INFO,
524 			    "Connection attempt to UDP %s:%d from %s:%d\n",
525 			    buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
526 			    ntohs(uh->uh_sport));
527 		}
528 		udpstat.udps_noport++;
529 		if (m->m_flags & (M_BCAST | M_MCAST)) {
530 			udpstat.udps_noportbcast++;
531 			goto badheadlocked;
532 		}
533 		if (udp_blackhole)
534 			goto badheadlocked;
535 		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
536 			goto badheadlocked;
537 		*ip = save_ip;
538 		ip->ip_len += iphlen;
539 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
540 		INP_INFO_RUNLOCK(&udbinfo);
541 		return;
542 	}
543 
544 	/*
545 	 * Check the minimum TTL for socket.
546 	 */
547 	INP_LOCK(inp);
548 	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
549 		goto badheadlocked;
550 	udp_append(inp, ip, m, iphlen + sizeof(struct udphdr), &udp_in);
551 	INP_UNLOCK(inp);
552 	INP_INFO_RUNLOCK(&udbinfo);
553 	return;
554 
555 badheadlocked:
556 	if (inp)
557 		INP_UNLOCK(inp);
558 	INP_INFO_RUNLOCK(&udbinfo);
559 badunlocked:
560 	m_freem(m);
561 }
562 
563 /*
564  * Notify a udp user of an asynchronous error; just wake up so that they can
565  * collect error status.
566  */
567 struct inpcb *
568 udp_notify(struct inpcb *inp, int errno)
569 {
570 
571 	inp->inp_socket->so_error = errno;
572 	sorwakeup(inp->inp_socket);
573 	sowwakeup(inp->inp_socket);
574 	return (inp);
575 }
576 
577 void
578 udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
579 {
580 	struct ip *ip = vip;
581 	struct udphdr *uh;
582 	struct in_addr faddr;
583 	struct inpcb *inp;
584 
585 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
586 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
587 		return;
588 
589 	/*
590 	 * Redirects don't need to be handled up here.
591 	 */
592 	if (PRC_IS_REDIRECT(cmd))
593 		return;
594 
595 	/*
596 	 * Hostdead is ugly because it goes linearly through all PCBs.
597 	 *
598 	 * XXX: We never get this from ICMP, otherwise it makes an excellent
599 	 * DoS attack on machines with many connections.
600 	 */
601 	if (cmd == PRC_HOSTDEAD)
602 		ip = NULL;
603 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
604 		return;
605 	if (ip != NULL) {
606 		uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
607 		INP_INFO_RLOCK(&udbinfo);
608 		inp = in_pcblookup_hash(&udbinfo, faddr, uh->uh_dport,
609 		    ip->ip_src, uh->uh_sport, 0, NULL);
610 		if (inp != NULL) {
611 			INP_LOCK(inp);
612 			if (inp->inp_socket != NULL) {
613 				udp_notify(inp, inetctlerrmap[cmd]);
614 			}
615 			INP_UNLOCK(inp);
616 		}
617 		INP_INFO_RUNLOCK(&udbinfo);
618 	} else
619 		in_pcbnotifyall(&udbinfo, faddr, inetctlerrmap[cmd],
620 		    udp_notify);
621 }
622 
623 static int
624 udp_pcblist(SYSCTL_HANDLER_ARGS)
625 {
626 	int error, i, n;
627 	struct inpcb *inp, **inp_list;
628 	inp_gen_t gencnt;
629 	struct xinpgen xig;
630 
631 	/*
632 	 * The process of preparing the PCB list is too time-consuming and
633 	 * resource-intensive to repeat twice on every request.
634 	 */
635 	if (req->oldptr == 0) {
636 		n = udbinfo.ipi_count;
637 		req->oldidx = 2 * (sizeof xig)
638 			+ (n + n/8) * sizeof(struct xinpcb);
639 		return (0);
640 	}
641 
642 	if (req->newptr != 0)
643 		return (EPERM);
644 
645 	/*
646 	 * OK, now we're committed to doing something.
647 	 */
648 	INP_INFO_RLOCK(&udbinfo);
649 	gencnt = udbinfo.ipi_gencnt;
650 	n = udbinfo.ipi_count;
651 	INP_INFO_RUNLOCK(&udbinfo);
652 
653 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
654 		+ n * sizeof(struct xinpcb));
655 	if (error != 0)
656 		return (error);
657 
658 	xig.xig_len = sizeof xig;
659 	xig.xig_count = n;
660 	xig.xig_gen = gencnt;
661 	xig.xig_sogen = so_gencnt;
662 	error = SYSCTL_OUT(req, &xig, sizeof xig);
663 	if (error)
664 		return (error);
665 
666 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
667 	if (inp_list == 0)
668 		return (ENOMEM);
669 
670 	INP_INFO_RLOCK(&udbinfo);
671 	for (inp = LIST_FIRST(udbinfo.ipi_listhead), i = 0; inp && i < n;
672 	     inp = LIST_NEXT(inp, inp_list)) {
673 		INP_LOCK(inp);
674 		if (inp->inp_gencnt <= gencnt &&
675 		    cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0)
676 			inp_list[i++] = inp;
677 		INP_UNLOCK(inp);
678 	}
679 	INP_INFO_RUNLOCK(&udbinfo);
680 	n = i;
681 
682 	error = 0;
683 	for (i = 0; i < n; i++) {
684 		inp = inp_list[i];
685 		INP_LOCK(inp);
686 		if (inp->inp_gencnt <= gencnt) {
687 			struct xinpcb xi;
688 			bzero(&xi, sizeof(xi));
689 			xi.xi_len = sizeof xi;
690 			/* XXX should avoid extra copy */
691 			bcopy(inp, &xi.xi_inp, sizeof *inp);
692 			if (inp->inp_socket)
693 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
694 			xi.xi_inp.inp_gencnt = inp->inp_gencnt;
695 			INP_UNLOCK(inp);
696 			error = SYSCTL_OUT(req, &xi, sizeof xi);
697 		} else
698 			INP_UNLOCK(inp);
699 	}
700 	if (!error) {
701 		/*
702 		 * Give the user an updated idea of our state.  If the
703 		 * generation differs from what we told her before, she knows
704 		 * that something happened while we were processing this
705 		 * request, and it might be necessary to retry.
706 		 */
707 		INP_INFO_RLOCK(&udbinfo);
708 		xig.xig_gen = udbinfo.ipi_gencnt;
709 		xig.xig_sogen = so_gencnt;
710 		xig.xig_count = udbinfo.ipi_count;
711 		INP_INFO_RUNLOCK(&udbinfo);
712 		error = SYSCTL_OUT(req, &xig, sizeof xig);
713 	}
714 	free(inp_list, M_TEMP);
715 	return (error);
716 }
717 
718 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
719     udp_pcblist, "S,xinpcb", "List of active UDP sockets");
720 
721 static int
722 udp_getcred(SYSCTL_HANDLER_ARGS)
723 {
724 	struct xucred xuc;
725 	struct sockaddr_in addrs[2];
726 	struct inpcb *inp;
727 	int error;
728 
729 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
730 	if (error)
731 		return (error);
732 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
733 	if (error)
734 		return (error);
735 	INP_INFO_RLOCK(&udbinfo);
736 	inp = in_pcblookup_hash(&udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
737 				addrs[0].sin_addr, addrs[0].sin_port, 1, NULL);
738 	if (inp == NULL || inp->inp_socket == NULL) {
739 		error = ENOENT;
740 		goto out;
741 	}
742 	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
743 	if (error)
744 		goto out;
745 	cru2x(inp->inp_socket->so_cred, &xuc);
746 out:
747 	INP_INFO_RUNLOCK(&udbinfo);
748 	if (error == 0)
749 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
750 	return (error);
751 }
752 
753 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
754     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
755     udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
756 
/*
 * Send one UDP datagram on behalf of 'inp'.
 *
 * 'm' is the payload, 'addr' an optional destination (only permitted while
 * the pcb has no foreign address), 'control' optional control data of which
 * only IPPROTO_IP/IP_SENDSRCADDR is understood, and 'td' the sending thread
 * whose credentials are used for the bind/connect checks.
 *
 * Both 'm' and 'control' are consumed on every path visible here: 'control'
 * is freed after parsing, 'm' is either handed to ip_output() or freed.
 * Returns 0 or an errno value (EMSGSIZE, EINVAL, ENOPROTOOPT, EISCONN,
 * ENOTCONN, EAGAIN, ENOBUFS, or whatever the pcb setup routines or
 * ip_output() return).
 */
static int
udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
    struct mbuf *control, struct thread *td)
{
	struct udpiphdr *ui;
	int len = m->m_pkthdr.len;
	struct in_addr faddr, laddr;
	struct cmsghdr *cm;
	struct sockaddr_in *sin, src;
	int error = 0;
	int ipflags;
	u_short fport, lport;
	int unlock_udbinfo;

	/*
	 * udp_output() may need to temporarily bind or connect the current
	 * inpcb.  As such, we don't know up front whether we will need the
	 * pcbinfo lock or not.  Do any work to decide what is needed up
	 * front before acquiring any locks.
	 */
	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
		if (control)
			m_freem(control);
		m_freem(m);
		return (EMSGSIZE);
	}

	/* sin_family stays 0 unless an IP_SENDSRCADDR message is found. */
	src.sin_family = 0;
	if (control != NULL) {
		/*
		 * XXX: Currently, we assume all the optional information is
		 * stored in a single mbuf.
		 */
		if (control->m_next) {
			m_freem(control);
			m_freem(m);
			return (EINVAL);
		}
		/*
		 * Walk the control messages in place, advancing the mbuf's
		 * data pointer and length past each message.
		 */
		for (; control->m_len > 0;
		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
			cm = mtod(control, struct cmsghdr *);
			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
			    || cm->cmsg_len > control->m_len) {
				error = EINVAL;
				break;
			}
			/* Messages for other levels are skipped. */
			if (cm->cmsg_level != IPPROTO_IP)
				continue;

			switch (cm->cmsg_type) {
			case IP_SENDSRCADDR:
				if (cm->cmsg_len !=
				    CMSG_LEN(sizeof(struct in_addr))) {
					error = EINVAL;
					break;
				}
				bzero(&src, sizeof(src));
				src.sin_family = AF_INET;
				src.sin_len = sizeof(src);
				src.sin_port = inp->inp_lport;
				src.sin_addr =
				    *(struct in_addr *)CMSG_DATA(cm);
				break;

			default:
				error = ENOPROTOOPT;
				break;
			}
			/* The breaks above only leave the switch. */
			if (error)
				break;
		}
		m_freem(control);
	}
	if (error) {
		m_freem(m);
		return (error);
	}

	/*
	 * The pcbinfo write lock is taken only when a source override or an
	 * explicit destination is in play; remember whether it was so that
	 * it can be released later.
	 */
	if (src.sin_family == AF_INET || addr != NULL) {
		INP_INFO_WLOCK(&udbinfo);
		unlock_udbinfo = 1;
	} else
		unlock_udbinfo = 0;
	INP_LOCK(inp);

#ifdef MAC
	mac_create_mbuf_from_inpcb(inp, m);
#endif

	/*
	 * If the IP_SENDSRCADDR control message was specified, override the
	 * source address for this datagram.  Its use is invalidated if the
	 * address thus specified is incomplete or clobbers other inpcbs.
	 */
	laddr = inp->inp_laddr;
	lport = inp->inp_lport;
	if (src.sin_family == AF_INET) {
		if ((lport == 0) ||
		    (laddr.s_addr == INADDR_ANY &&
		     src.sin_addr.s_addr == INADDR_ANY)) {
			error = EINVAL;
			goto release;
		}
		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
		    &laddr.s_addr, &lport, td->td_ucred);
		if (error)
			goto release;
	}

	if (addr) {
		sin = (struct sockaddr_in *)addr;
		if (jailed(td->td_ucred))
			prison_remote_ip(td->td_ucred, 0,
			    &sin->sin_addr.s_addr);
		/* An explicit destination is refused on a connected pcb. */
		if (inp->inp_faddr.s_addr != INADDR_ANY) {
			error = EISCONN;
			goto release;
		}
		error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport,
		    &faddr.s_addr, &fport, NULL, td->td_ucred);
		if (error)
			goto release;

		/* Commit the local port if newly assigned. */
		if (inp->inp_laddr.s_addr == INADDR_ANY &&
		    inp->inp_lport == 0) {
			/*
			 * Remember addr if jailed, to prevent rebinding.
			 */
			if (jailed(td->td_ucred))
				inp->inp_laddr = laddr;
			inp->inp_lport = lport;
			if (in_pcbinshash(inp) != 0) {
				inp->inp_lport = 0;
				error = EAGAIN;
				goto release;
			}
			inp->inp_flags |= INP_ANONPORT;
		}
	} else {
		/* No destination given: the pcb must already have one. */
		faddr = inp->inp_faddr;
		fport = inp->inp_fport;
		if (faddr.s_addr == INADDR_ANY) {
			error = ENOTCONN;
			goto release;
		}
	}

	/*
	 * Calculate data length and get a mbuf for UDP, IP, and possible
	 * link-layer headers.  Immediately slide the data pointer back
	 * forward since we won't use that space at this layer.
	 */
	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_DONTWAIT);
	if (m == NULL) {
		/* m is NULL here, so the m_freem() at release gets NULL. */
		error = ENOBUFS;
		goto release;
	}
	m->m_data += max_linkhdr;
	m->m_len -= max_linkhdr;
	m->m_pkthdr.len -= max_linkhdr;

	/*
	 * Fill in mbuf with extended UDP header and addresses and length put
	 * into network format.
	 */
	ui = mtod(m, struct udpiphdr *);
	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
	ui->ui_pr = IPPROTO_UDP;
	ui->ui_src = laddr;
	ui->ui_dst = faddr;
	ui->ui_sport = lport;
	ui->ui_dport = fport;
	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));

	/*
	 * Set the Don't Fragment bit in the IP header.
	 */
	if (inp->inp_flags & INP_DONTFRAG) {
		struct ip *ip;

		ip = (struct ip *)&ui->ui_i;
		ip->ip_off |= IP_DF;
	}

	/* Translate socket and pcb options into ip_output() flags. */
	ipflags = 0;
	if (inp->inp_socket->so_options & SO_DONTROUTE)
		ipflags |= IP_ROUTETOIF;
	if (inp->inp_socket->so_options & SO_BROADCAST)
		ipflags |= IP_ALLOWBROADCAST;
	if (inp->inp_flags & INP_ONESBCAST)
		ipflags |= IP_SENDONES;

	/*
	 * Set up checksum and output datagram.
	 */
	if (udp_cksum) {
		/*
		 * Only the pseudo-header sum is stored here; csum_flags and
		 * csum_data describe what remains to be done.
		 */
		if (inp->inp_flags & INP_ONESBCAST)
			faddr.s_addr = INADDR_BROADCAST;
		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
		    htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP));
		m->m_pkthdr.csum_flags = CSUM_UDP;
		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
	} else
		ui->ui_sum = 0;
	((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len;
	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
	((struct ip *)ui)->ip_tos = inp->inp_ip_tos;	/* XXX */
	udpstat.udps_opackets++;

	/*
	 * The pcbinfo lock is dropped before ip_output() is called; the
	 * inpcb lock is held across the call.
	 */
	if (unlock_udbinfo)
		INP_INFO_WUNLOCK(&udbinfo);
	error = ip_output(m, inp->inp_options, NULL, ipflags,
	    inp->inp_moptions, inp);
	INP_UNLOCK(inp);
	return (error);

release:
	/* Error exit: drop whichever locks are held and free the datagram. */
	INP_UNLOCK(inp);
	if (unlock_udbinfo)
		INP_INFO_WUNLOCK(&udbinfo);
	m_freem(m);
	return (error);
}
982 
983 static void
984 udp_abort(struct socket *so)
985 {
986 	struct inpcb *inp;
987 
988 	inp = sotoinpcb(so);
989 	KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
990 	INP_INFO_WLOCK(&udbinfo);
991 	INP_LOCK(inp);
992 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
993 		in_pcbdisconnect(inp);
994 		inp->inp_laddr.s_addr = INADDR_ANY;
995 		soisdisconnected(so);
996 	}
997 	INP_UNLOCK(inp);
998 	INP_INFO_WUNLOCK(&udbinfo);
999 }
1000 
1001 static int
1002 udp_attach(struct socket *so, int proto, struct thread *td)
1003 {
1004 	struct inpcb *inp;
1005 	int error;
1006 
1007 	inp = sotoinpcb(so);
1008 	KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
1009 	error = soreserve(so, udp_sendspace, udp_recvspace);
1010 	if (error)
1011 		return (error);
1012 	INP_INFO_WLOCK(&udbinfo);
1013 	error = in_pcballoc(so, &udbinfo);
1014 	if (error) {
1015 		INP_INFO_WUNLOCK(&udbinfo);
1016 		return (error);
1017 	}
1018 
1019 	inp = (struct inpcb *)so->so_pcb;
1020 	INP_INFO_WUNLOCK(&udbinfo);
1021 	inp->inp_vflag |= INP_IPV4;
1022 	inp->inp_ip_ttl = ip_defttl;
1023 	INP_UNLOCK(inp);
1024 	return (0);
1025 }
1026 
1027 static int
1028 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
1029 {
1030 	struct inpcb *inp;
1031 	int error;
1032 
1033 	inp = sotoinpcb(so);
1034 	KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
1035 	INP_INFO_WLOCK(&udbinfo);
1036 	INP_LOCK(inp);
1037 	error = in_pcbbind(inp, nam, td->td_ucred);
1038 	INP_UNLOCK(inp);
1039 	INP_INFO_WUNLOCK(&udbinfo);
1040 	return (error);
1041 }
1042 
1043 static void
1044 udp_close(struct socket *so)
1045 {
1046 	struct inpcb *inp;
1047 
1048 	inp = sotoinpcb(so);
1049 	KASSERT(inp != NULL, ("udp_close: inp == NULL"));
1050 	INP_INFO_WLOCK(&udbinfo);
1051 	INP_LOCK(inp);
1052 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1053 		in_pcbdisconnect(inp);
1054 		inp->inp_laddr.s_addr = INADDR_ANY;
1055 		soisdisconnected(so);
1056 	}
1057 	INP_UNLOCK(inp);
1058 	INP_INFO_WUNLOCK(&udbinfo);
1059 }
1060 
1061 static int
1062 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1063 {
1064 	struct inpcb *inp;
1065 	int error;
1066 	struct sockaddr_in *sin;
1067 
1068 	inp = sotoinpcb(so);
1069 	KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
1070 	INP_INFO_WLOCK(&udbinfo);
1071 	INP_LOCK(inp);
1072 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1073 		INP_UNLOCK(inp);
1074 		INP_INFO_WUNLOCK(&udbinfo);
1075 		return (EISCONN);
1076 	}
1077 	sin = (struct sockaddr_in *)nam;
1078 	if (jailed(td->td_ucred))
1079 		prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr);
1080 	error = in_pcbconnect(inp, nam, td->td_ucred);
1081 	if (error == 0)
1082 		soisconnected(so);
1083 	INP_UNLOCK(inp);
1084 	INP_INFO_WUNLOCK(&udbinfo);
1085 	return (error);
1086 }
1087 
1088 static void
1089 udp_detach(struct socket *so)
1090 {
1091 	struct inpcb *inp;
1092 
1093 	inp = sotoinpcb(so);
1094 	KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
1095 	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
1096 	    ("udp_detach: not disconnected"));
1097 	INP_INFO_WLOCK(&udbinfo);
1098 	INP_LOCK(inp);
1099 	in_pcbdetach(inp);
1100 	in_pcbfree(inp);
1101 	INP_INFO_WUNLOCK(&udbinfo);
1102 }
1103 
1104 static int
1105 udp_disconnect(struct socket *so)
1106 {
1107 	struct inpcb *inp;
1108 
1109 	inp = sotoinpcb(so);
1110 	KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
1111 	INP_INFO_WLOCK(&udbinfo);
1112 	INP_LOCK(inp);
1113 	if (inp->inp_faddr.s_addr == INADDR_ANY) {
1114 		INP_INFO_WUNLOCK(&udbinfo);
1115 		INP_UNLOCK(inp);
1116 		return (ENOTCONN);
1117 	}
1118 
1119 	in_pcbdisconnect(inp);
1120 	inp->inp_laddr.s_addr = INADDR_ANY;
1121 	SOCK_LOCK(so);
1122 	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
1123 	SOCK_UNLOCK(so);
1124 	INP_UNLOCK(inp);
1125 	INP_INFO_WUNLOCK(&udbinfo);
1126 	return (0);
1127 }
1128 
1129 static int
1130 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
1131     struct mbuf *control, struct thread *td)
1132 {
1133 	struct inpcb *inp;
1134 
1135 	inp = sotoinpcb(so);
1136 	KASSERT(inp != NULL, ("udp_send: inp == NULL"));
1137 	return (udp_output(inp, m, addr, control, td));
1138 }
1139 
1140 int
1141 udp_shutdown(struct socket *so)
1142 {
1143 	struct inpcb *inp;
1144 
1145 	inp = sotoinpcb(so);
1146 	KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
1147 	INP_LOCK(inp);
1148 	socantsendmore(so);
1149 	INP_UNLOCK(inp);
1150 	return (0);
1151 }
1152 
1153 struct pr_usrreqs udp_usrreqs = {
1154 	.pru_abort =		udp_abort,
1155 	.pru_attach =		udp_attach,
1156 	.pru_bind =		udp_bind,
1157 	.pru_connect =		udp_connect,
1158 	.pru_control =		in_control,
1159 	.pru_detach =		udp_detach,
1160 	.pru_disconnect =	udp_disconnect,
1161 	.pru_peeraddr =		in_getpeeraddr,
1162 	.pru_send =		udp_send,
1163 	.pru_sosend =		sosend_dgram,
1164 	.pru_shutdown =		udp_shutdown,
1165 	.pru_sockaddr =		in_getsockaddr,
1166 	.pru_sosetlabel =	in_pcbsosetlabel,
1167 	.pru_close =		udp_close,
1168 };
1169