xref: /freebsd/sys/kern/uipc_socket.c (revision da759cfa320d5076b075d15ff3f00ab3ba5634fd)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993
5  *	The Regents of the University of California.
6  * Copyright (c) 2004 The FreeBSD Foundation
7  * Copyright (c) 2004-2008 Robert N. M. Watson
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
35  */
36 
37 /*
38  * Comments on the socket life cycle:
39  *
 * soalloc() sets up socket layer state for a socket, called only by
41  * socreate() and sonewconn().  Socket layer private.
42  *
43  * sodealloc() tears down socket layer state for a socket, called only by
44  * sofree() and sonewconn().  Socket layer private.
45  *
46  * pru_attach() associates protocol layer state with an allocated socket;
47  * called only once, may fail, aborting socket allocation.  This is called
48  * from socreate() and sonewconn().  Socket layer private.
49  *
50  * pru_detach() disassociates protocol layer state from an attached socket,
51  * and will be called exactly once for sockets in which pru_attach() has
52  * been successfully called.  If pru_attach() returned an error,
53  * pru_detach() will not be called.  Socket layer private.
54  *
55  * pru_abort() and pru_close() notify the protocol layer that the last
56  * consumer of a socket is starting to tear down the socket, and that the
57  * protocol should terminate the connection.  Historically, pru_abort() also
58  * detached protocol state from the socket state, but this is no longer the
59  * case.
60  *
61  * socreate() creates a socket and attaches protocol state.  This is a public
62  * interface that may be used by socket layer consumers to create new
63  * sockets.
64  *
65  * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create new sockets when
67  * a new connection is received and will be available for accept() on a
68  * listen socket.
69  *
70  * soclose() destroys a socket after possibly waiting for it to disconnect.
71  * This is a public interface that socket consumers should use to close and
72  * release a socket when done with it.
73  *
74  * soabort() destroys a socket without waiting for it to disconnect (used
75  * only for incoming connections that are already partially or fully
76  * connected).  This is used internally by the socket layer when clearing
77  * listen socket queues (due to overflow or close on the listen socket), but
78  * is also a public interface protocols may use to abort connections in
79  * their incomplete listen queues should they no longer be required.  Sockets
80  * placed in completed connection listen queues should not be aborted for
81  * reasons described in the comment above the soclose() implementation.  This
82  * is not a general purpose close routine, and except in the specific
83  * circumstances described here, should not be used.
84  *
85  * sofree() will free a socket and its protocol state if all references on
86  * the socket have been released, and is the public interface to attempt to
87  * free a socket when a reference is removed.  This is a socket layer private
88  * interface.
89  *
90  * NOTE: In addition to socreate() and soclose(), which provide a single
91  * socket reference to the consumer to be managed as required, there are two
92  * calls to explicitly manage socket references, soref(), and sorele().
93  * Currently, these are generally required only when transitioning a socket
94  * from a listen queue to a file descriptor, in order to prevent garbage
95  * collection of the socket at an untimely moment.  For a number of reasons,
96  * these interfaces are not preferred, and should be avoided.
97  *
98  * NOTE: With regard to VNETs the general rule is that callers do not set
99  * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
100  * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
101  * and sorflush(), which are usually called from a pre-set VNET context.
102  * sopoll() currently does not need a VNET context to be set.
103  */
104 
105 #include <sys/cdefs.h>
106 __FBSDID("$FreeBSD$");
107 
108 #include "opt_inet.h"
109 #include "opt_inet6.h"
110 #include "opt_kern_tls.h"
111 #include "opt_sctp.h"
112 
113 #include <sys/param.h>
114 #include <sys/systm.h>
115 #include <sys/fcntl.h>
116 #include <sys/limits.h>
117 #include <sys/lock.h>
118 #include <sys/mac.h>
119 #include <sys/malloc.h>
120 #include <sys/mbuf.h>
121 #include <sys/mutex.h>
122 #include <sys/domain.h>
123 #include <sys/file.h>			/* for struct knote */
124 #include <sys/hhook.h>
125 #include <sys/kernel.h>
126 #include <sys/khelp.h>
127 #include <sys/ktls.h>
128 #include <sys/event.h>
129 #include <sys/eventhandler.h>
130 #include <sys/poll.h>
131 #include <sys/proc.h>
132 #include <sys/protosw.h>
133 #include <sys/socket.h>
134 #include <sys/socketvar.h>
135 #include <sys/resourcevar.h>
136 #include <net/route.h>
137 #include <sys/signalvar.h>
138 #include <sys/stat.h>
139 #include <sys/sx.h>
140 #include <sys/sysctl.h>
141 #include <sys/taskqueue.h>
142 #include <sys/uio.h>
143 #include <sys/jail.h>
144 #include <sys/syslog.h>
145 #include <netinet/in.h>
146 #include <netinet/tcp.h>
147 
148 #include <net/vnet.h>
149 
150 #include <security/mac/mac_framework.h>
151 
152 #include <vm/uma.h>
153 
154 #ifdef COMPAT_FREEBSD32
155 #include <sys/mount.h>
156 #include <sys/sysent.h>
157 #include <compat/freebsd32/freebsd32.h>
158 #endif
159 
/* Forward declarations of file-local helpers used before their definitions. */
static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);
/*
 * Lock callbacks passed to knlist_init() for the so_rdsel knote list.
 */
static void	so_rdknl_lock(void *);
static void	so_rdknl_unlock(void *);
static void	so_rdknl_assert_locked(void *);
static void	so_rdknl_assert_unlocked(void *);
/*
 * Lock callbacks passed to knlist_init() for the so_wrsel knote list.
 */
static void	so_wrknl_lock(void *);
static void	so_wrknl_unlock(void *);
static void	so_wrknl_assert_locked(void *);
static void	so_wrknl_assert_unlocked(void *);

/* Detach and event routines referenced by the filterops tables below. */
static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_soempty(struct knote *kn, long hint);
/* Runs the socket hhooks identified by h_id; used by soalloc()/sodealloc(). */
static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
fo_kqfilter_t	soo_kqfilter;
178 
/*
 * kqueue filter operation tables for sockets.  All three are marked as
 * file-descriptor based (f_isfd = 1).
 */

/* Read-side filter: filt_soread() evaluates, filt_sordetach() detaches. */
static struct filterops soread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
};
/* Write-side filter: filt_sowrite() evaluates, filt_sowdetach() detaches. */
static struct filterops sowrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
};
/*
 * "Empty" filter: evaluated by filt_soempty(); it shares the write-side
 * detach routine with sowrite_filtops.
 */
static struct filterops soempty_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_soempty,
};
194 
so_gen_t	so_gencnt;	/* generation count for sockets */

/* Malloc types for socket names and protocol control blocks. */
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

/*
 * Assert that a vnet context is currently set (curvnet != NULL); the
 * socket pointer is only used in the assertion message.
 */
#define	VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));

/*
 * Per-vnet table of socket hhook heads, one slot per hook subtype in the
 * range 0..HHOOK_SOCKET_LAST.
 */
VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
#define	V_socket_hhh		VNET(socket_hhh)
206 
207 /*
208  * Limit on the number of connections in the listen queue waiting
209  * for accept(2).
210  * NB: The original sysctl somaxconn is still available but hidden
211  * to prevent confusion about the actual purpose of this number.
212  */
213 static u_int somaxconn = SOMAXCONN;
214 
215 static int
216 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
217 {
218 	int error;
219 	int val;
220 
221 	val = somaxconn;
222 	error = sysctl_handle_int(oidp, &val, 0, req);
223 	if (error || !req->newptr )
224 		return (error);
225 
226 	/*
227 	 * The purpose of the UINT_MAX / 3 limit, is so that the formula
228 	 *   3 * so_qlimit / 2
229 	 * below, will not overflow.
230          */
231 
232 	if (val < 1 || val > UINT_MAX / 3)
233 		return (EINVAL);
234 
235 	somaxconn = val;
236 	return (0);
237 }
238 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue,
239     CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, sizeof(int),
240     sysctl_somaxconn, "I",
241     "Maximum listen socket pending connection accept queue size");
242 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
243     CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, 0,
244     sizeof(int), sysctl_somaxconn, "I",
245     "Maximum listen socket pending connection accept queue size (compat)");
246 
/*
 * Number of sockets currently allocated; bumped in soalloc() and dropped
 * in sodealloc() under so_global_mtx.  Exported read-only via sysctl.
 */
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 *
 * NOTE(review): the lock name string "so_glabel" looks like a typo for
 * "so_global"; confirm nothing keys off the name before changing it.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPC");
271 
272 /*
273  * Initialize the socket subsystem and set up the socket
274  * memory allocator.
275  */
276 static uma_zone_t socket_zone;
277 int	maxsockets;
278 
279 static void
280 socket_zone_change(void *tag)
281 {
282 
283 	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
284 }
285 
286 static void
287 socket_hhook_register(int subtype)
288 {
289 
290 	if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
291 	    &V_socket_hhh[subtype],
292 	    HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
293 		printf("%s: WARNING: unable to register hook\n", __func__);
294 }
295 
296 static void
297 socket_hhook_deregister(int subtype)
298 {
299 
300 	if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
301 		printf("%s: WARNING: unable to deregister hook\n", __func__);
302 }
303 
304 static void
305 socket_init(void *tag)
306 {
307 
308 	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
309 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
310 	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
311 	uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
312 	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
313 	    EVENTHANDLER_PRI_FIRST);
314 }
315 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
316 
317 static void
318 socket_vnet_init(const void *unused __unused)
319 {
320 	int i;
321 
322 	/* We expect a contiguous range */
323 	for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
324 		socket_hhook_register(i);
325 }
326 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
327     socket_vnet_init, NULL);
328 
329 static void
330 socket_vnet_uninit(const void *unused __unused)
331 {
332 	int i;
333 
334 	for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
335 		socket_hhook_deregister(i);
336 }
337 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
338     socket_vnet_uninit, NULL);
339 
340 /*
341  * Initialise maxsockets.  This SYSINIT must be run after
342  * tunable_mbinit().
343  */
344 static void
345 init_maxsockets(void *ignored)
346 {
347 
348 	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
349 	maxsockets = imax(maxsockets, maxfiles);
350 }
351 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
352 
353 /*
354  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
355  * of the change so that they can update their dependent limits as required.
356  */
357 static int
358 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
359 {
360 	int error, newmaxsockets;
361 
362 	newmaxsockets = maxsockets;
363 	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
364 	if (error == 0 && req->newptr) {
365 		if (newmaxsockets > maxsockets &&
366 		    newmaxsockets <= maxfiles) {
367 			maxsockets = newmaxsockets;
368 			EVENTHANDLER_INVOKE(maxsockets_change);
369 		} else
370 			error = EINVAL;
371 	}
372 	return (error);
373 }
374 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets,
375     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &maxsockets, 0,
376     sysctl_maxsockets, "IU",
377     "Maximum number of sockets available");
378 
379 /*
380  * Socket operation routines.  These routines are called by the routines in
381  * sys_socket.c or from a system process, and implement the semantics of
382  * socket operations by switching out to the protocol specific routines.
383  */
384 
385 /*
386  * Get a socket structure from our zone, and initialize it.  Note that it
387  * would probably be better to allocate socket and PCB at the same time, but
388  * I'm not convinced that all the protocols can be easily modified to do
389  * this.
390  *
391  * soalloc() returns a socket with a ref count of 0.
392  */
393 static struct socket *
394 soalloc(struct vnet *vnet)
395 {
396 	struct socket *so;
397 
398 	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
399 	if (so == NULL)
400 		return (NULL);
401 #ifdef MAC
402 	if (mac_socket_init(so, M_NOWAIT) != 0) {
403 		uma_zfree(socket_zone, so);
404 		return (NULL);
405 	}
406 #endif
407 	if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
408 		uma_zfree(socket_zone, so);
409 		return (NULL);
410 	}
411 
412 	/*
413 	 * The socket locking protocol allows to lock 2 sockets at a time,
414 	 * however, the first one must be a listening socket.  WITNESS lacks
415 	 * a feature to change class of an existing lock, so we use DUPOK.
416 	 */
417 	mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
418 	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
419 	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
420 	so->so_rcv.sb_sel = &so->so_rdsel;
421 	so->so_snd.sb_sel = &so->so_wrsel;
422 	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
423 	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
424 	TAILQ_INIT(&so->so_snd.sb_aiojobq);
425 	TAILQ_INIT(&so->so_rcv.sb_aiojobq);
426 	TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so);
427 	TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so);
428 #ifdef VIMAGE
429 	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
430 	    __func__, __LINE__, so));
431 	so->so_vnet = vnet;
432 #endif
433 	/* We shouldn't need the so_global_mtx */
434 	if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
435 		/* Do we need more comprehensive error returns? */
436 		uma_zfree(socket_zone, so);
437 		return (NULL);
438 	}
439 	mtx_lock(&so_global_mtx);
440 	so->so_gencnt = ++so_gencnt;
441 	++numopensockets;
442 #ifdef VIMAGE
443 	vnet->vnet_sockcnt++;
444 #endif
445 	mtx_unlock(&so_global_mtx);
446 
447 	return (so);
448 }
449 
450 /*
451  * Free the storage associated with a socket at the socket layer, tear down
452  * locks, labels, etc.  All protocol state is assumed already to have been
453  * torn down (and possibly never set up) by the caller.
454  */
455 static void
456 sodealloc(struct socket *so)
457 {
458 
459 	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
460 	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
461 
462 	mtx_lock(&so_global_mtx);
463 	so->so_gencnt = ++so_gencnt;
464 	--numopensockets;	/* Could be below, but faster here. */
465 #ifdef VIMAGE
466 	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
467 	    __func__, __LINE__, so));
468 	so->so_vnet->vnet_sockcnt--;
469 #endif
470 	mtx_unlock(&so_global_mtx);
471 #ifdef MAC
472 	mac_socket_destroy(so);
473 #endif
474 	hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);
475 
476 	crfree(so->so_cred);
477 	khelp_destroy_osd(&so->osd);
478 	if (SOLISTENING(so)) {
479 		if (so->sol_accept_filter != NULL)
480 			accept_filt_setopt(so, NULL);
481 	} else {
482 		if (so->so_rcv.sb_hiwat)
483 			(void)chgsbsize(so->so_cred->cr_uidinfo,
484 			    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
485 		if (so->so_snd.sb_hiwat)
486 			(void)chgsbsize(so->so_cred->cr_uidinfo,
487 			    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
488 		sx_destroy(&so->so_snd.sb_sx);
489 		sx_destroy(&so->so_rcv.sb_sx);
490 		SOCKBUF_LOCK_DESTROY(&so->so_snd);
491 		SOCKBUF_LOCK_DESTROY(&so->so_rcv);
492 	}
493 	mtx_destroy(&so->so_lock);
494 	uma_zfree(socket_zone, so);
495 }
496 
497 /*
498  * socreate returns a socket with a ref count of 1.  The socket should be
499  * closed with soclose().
500  */
501 int
502 socreate(int dom, struct socket **aso, int type, int proto,
503     struct ucred *cred, struct thread *td)
504 {
505 	struct protosw *prp;
506 	struct socket *so;
507 	int error;
508 
509 	if (proto)
510 		prp = pffindproto(dom, proto, type);
511 	else
512 		prp = pffindtype(dom, type);
513 
514 	if (prp == NULL) {
515 		/* No support for domain. */
516 		if (pffinddomain(dom) == NULL)
517 			return (EAFNOSUPPORT);
518 		/* No support for socket type. */
519 		if (proto == 0 && type != 0)
520 			return (EPROTOTYPE);
521 		return (EPROTONOSUPPORT);
522 	}
523 	if (prp->pr_usrreqs->pru_attach == NULL ||
524 	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
525 		return (EPROTONOSUPPORT);
526 
527 	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
528 		return (EPROTONOSUPPORT);
529 
530 	if (prp->pr_type != type)
531 		return (EPROTOTYPE);
532 	so = soalloc(CRED_TO_VNET(cred));
533 	if (so == NULL)
534 		return (ENOBUFS);
535 
536 	so->so_type = type;
537 	so->so_cred = crhold(cred);
538 	if ((prp->pr_domain->dom_family == PF_INET) ||
539 	    (prp->pr_domain->dom_family == PF_INET6) ||
540 	    (prp->pr_domain->dom_family == PF_ROUTE))
541 		so->so_fibnum = td->td_proc->p_fibnum;
542 	else
543 		so->so_fibnum = 0;
544 	so->so_proto = prp;
545 #ifdef MAC
546 	mac_socket_create(cred, so);
547 #endif
548 	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
549 	    so_rdknl_assert_locked, so_rdknl_assert_unlocked);
550 	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
551 	    so_wrknl_assert_locked, so_wrknl_assert_unlocked);
552 	/*
553 	 * Auto-sizing of socket buffers is managed by the protocols and
554 	 * the appropriate flags must be set in the pru_attach function.
555 	 */
556 	CURVNET_SET(so->so_vnet);
557 	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
558 	CURVNET_RESTORE();
559 	if (error) {
560 		sodealloc(so);
561 		return (error);
562 	}
563 	soref(so);
564 	*aso = so;
565 	return (0);
566 }
567 
#ifdef REGRESSION
/*
 * Regression knob consulted by sonewconn(): while non-zero (the default)
 * the listen queue overflow test at the top of sonewconn() is honoured;
 * clearing it lets that early test be bypassed.
 */
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif
573 
574 /*
575  * When an attempt at a new connection is noted on a socket which accepts
576  * connections, sonewconn is called.  If the connection is possible (subject
577  * to space constraints, etc.) then we allocate a new structure, properly
578  * linked into the data structure of the original socket, and return this.
579  * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
580  *
581  * Note: the ref count on the socket is 0 on return.
582  */
583 struct socket *
584 sonewconn(struct socket *head, int connstatus)
585 {
586 	static struct timeval lastover;
587 	static struct timeval overinterval = { 60, 0 };
588 	static int overcount;
589 
590 	struct socket *so;
591 	u_int over;
592 
593 	SOLISTEN_LOCK(head);
594 	over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
595 	SOLISTEN_UNLOCK(head);
596 #ifdef REGRESSION
597 	if (regression_sonewconn_earlytest && over) {
598 #else
599 	if (over) {
600 #endif
601 		overcount++;
602 
603 		if (ratecheck(&lastover, &overinterval)) {
604 			log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
605 			    "%i already in queue awaiting acceptance "
606 			    "(%d occurrences)\n",
607 			    __func__, head->so_pcb, head->sol_qlen, overcount);
608 
609 			overcount = 0;
610 		}
611 
612 		return (NULL);
613 	}
614 	VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
615 	    __func__, head));
616 	so = soalloc(head->so_vnet);
617 	if (so == NULL) {
618 		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
619 		    "limit reached or out of memory\n",
620 		    __func__, head->so_pcb);
621 		return (NULL);
622 	}
623 	so->so_listen = head;
624 	so->so_type = head->so_type;
625 	so->so_linger = head->so_linger;
626 	so->so_state = head->so_state | SS_NOFDREF;
627 	so->so_fibnum = head->so_fibnum;
628 	so->so_proto = head->so_proto;
629 	so->so_cred = crhold(head->so_cred);
630 #ifdef MAC
631 	mac_socket_newconn(head, so);
632 #endif
633 	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
634 	    so_rdknl_assert_locked, so_rdknl_assert_unlocked);
635 	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
636 	    so_wrknl_assert_locked, so_wrknl_assert_unlocked);
637 	VNET_SO_ASSERT(head);
638 	if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
639 		sodealloc(so);
640 		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
641 		    __func__, head->so_pcb);
642 		return (NULL);
643 	}
644 	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
645 		sodealloc(so);
646 		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
647 		    __func__, head->so_pcb);
648 		return (NULL);
649 	}
650 	so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
651 	so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
652 	so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
653 	so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
654 	so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE;
655 	so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE;
656 
657 	SOLISTEN_LOCK(head);
658 	if (head->sol_accept_filter != NULL)
659 		connstatus = 0;
660 	so->so_state |= connstatus;
661 	so->so_options = head->so_options & ~SO_ACCEPTCONN;
662 	soref(head); /* A socket on (in)complete queue refs head. */
663 	if (connstatus) {
664 		TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
665 		so->so_qstate = SQ_COMP;
666 		head->sol_qlen++;
667 		solisten_wakeup(head);	/* unlocks */
668 	} else {
669 		/*
670 		 * Keep removing sockets from the head until there's room for
671 		 * us to insert on the tail.  In pre-locking revisions, this
672 		 * was a simple if(), but as we could be racing with other
673 		 * threads and soabort() requires dropping locks, we must
674 		 * loop waiting for the condition to be true.
675 		 */
676 		while (head->sol_incqlen > head->sol_qlimit) {
677 			struct socket *sp;
678 
679 			sp = TAILQ_FIRST(&head->sol_incomp);
680 			TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
681 			head->sol_incqlen--;
682 			SOCK_LOCK(sp);
683 			sp->so_qstate = SQ_NONE;
684 			sp->so_listen = NULL;
685 			SOCK_UNLOCK(sp);
686 			sorele(head);	/* does SOLISTEN_UNLOCK, head stays */
687 			soabort(sp);
688 			SOLISTEN_LOCK(head);
689 		}
690 		TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
691 		so->so_qstate = SQ_INCOMP;
692 		head->sol_incqlen++;
693 		SOLISTEN_UNLOCK(head);
694 	}
695 	return (so);
696 }
697 
#ifdef SCTP
/*
 * Socket part of sctp_peeloff().  Allocate a fresh socket that takes its
 * type, options, linger, non-blocking state, fib, protocol, credentials
 * and buffer settings from head, and attach protocol state to it.
 *
 * The new socket is returned marked SS_ISCONNECTED and holding one
 * reference; NULL is returned if allocation, buffer reservation or the
 * protocol attach fails.
 */
struct socket *
sopeeloff(struct socket *head)
{
	struct socket *nso;

	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
	    __func__, __LINE__, head));
	nso = soalloc(head->so_vnet);
	if (nso == NULL) {
		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
		    "limit reached or out of memory\n",
		    __func__, head->so_pcb);
		return (NULL);
	}

	/* Inherit the basic identity of the original socket. */
	nso->so_type = head->so_type;
	nso->so_options = head->so_options;
	nso->so_linger = head->so_linger;
	nso->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
	nso->so_fibnum = head->so_fibnum;
	nso->so_proto = head->so_proto;
	nso->so_cred = crhold(head->so_cred);
#ifdef MAC
	mac_socket_newconn(head, nso);
#endif
	knlist_init(&nso->so_rdsel.si_note, nso, so_rdknl_lock,
	    so_rdknl_unlock, so_rdknl_assert_locked,
	    so_rdknl_assert_unlocked);
	knlist_init(&nso->so_wrsel.si_note, nso, so_wrknl_lock,
	    so_wrknl_unlock, so_wrknl_assert_locked,
	    so_wrknl_assert_unlocked);
	VNET_SO_ASSERT(head);
	if (soreserve(nso, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sodealloc(nso);
		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	if (nso->so_proto->pr_usrreqs->pru_attach(nso, 0, NULL)) {
		sodealloc(nso);
		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}

	/* Copy the remaining socket buffer parameters. */
	nso->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	nso->so_snd.sb_lowat = head->so_snd.sb_lowat;
	nso->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	nso->so_snd.sb_timeo = head->so_snd.sb_timeo;
	nso->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
	nso->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;

	soref(nso);

	return (nso);
}
#endif	/* SCTP */
756 
757 int
758 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
759 {
760 	int error;
761 
762 	CURVNET_SET(so->so_vnet);
763 	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
764 	CURVNET_RESTORE();
765 	return (error);
766 }
767 
768 int
769 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
770 {
771 	int error;
772 
773 	CURVNET_SET(so->so_vnet);
774 	error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
775 	CURVNET_RESTORE();
776 	return (error);
777 }
778 
779 /*
780  * solisten() transitions a socket from a non-listening state to a listening
781  * state, but can also be used to update the listen queue depth on an
782  * existing listen socket.  The protocol will call back into the sockets
783  * layer using solisten_proto_check() and solisten_proto() to check and set
784  * socket-layer listen state.  Call backs are used so that the protocol can
785  * acquire both protocol and socket layer locks in whatever order is required
786  * by the protocol.
787  *
788  * Protocol implementors are advised to hold the socket lock across the
789  * socket-layer test and set to avoid races at the socket layer.
790  */
791 int
792 solisten(struct socket *so, int backlog, struct thread *td)
793 {
794 	int error;
795 
796 	CURVNET_SET(so->so_vnet);
797 	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
798 	CURVNET_RESTORE();
799 	return (error);
800 }
801 
802 int
803 solisten_proto_check(struct socket *so)
804 {
805 
806 	SOCK_LOCK_ASSERT(so);
807 
808 	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
809 	    SS_ISDISCONNECTING))
810 		return (EINVAL);
811 	return (0);
812 }
813 
814 void
815 solisten_proto(struct socket *so, int backlog)
816 {
817 	int sbrcv_lowat, sbsnd_lowat;
818 	u_int sbrcv_hiwat, sbsnd_hiwat;
819 	short sbrcv_flags, sbsnd_flags;
820 	sbintime_t sbrcv_timeo, sbsnd_timeo;
821 
822 	SOCK_LOCK_ASSERT(so);
823 
824 	if (SOLISTENING(so))
825 		goto listening;
826 
827 	/*
828 	 * Change this socket to listening state.
829 	 */
830 	sbrcv_lowat = so->so_rcv.sb_lowat;
831 	sbsnd_lowat = so->so_snd.sb_lowat;
832 	sbrcv_hiwat = so->so_rcv.sb_hiwat;
833 	sbsnd_hiwat = so->so_snd.sb_hiwat;
834 	sbrcv_flags = so->so_rcv.sb_flags;
835 	sbsnd_flags = so->so_snd.sb_flags;
836 	sbrcv_timeo = so->so_rcv.sb_timeo;
837 	sbsnd_timeo = so->so_snd.sb_timeo;
838 
839 	sbdestroy(&so->so_snd, so);
840 	sbdestroy(&so->so_rcv, so);
841 	sx_destroy(&so->so_snd.sb_sx);
842 	sx_destroy(&so->so_rcv.sb_sx);
843 	SOCKBUF_LOCK_DESTROY(&so->so_snd);
844 	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
845 
846 #ifdef INVARIANTS
847 	bzero(&so->so_rcv,
848 	    sizeof(struct socket) - offsetof(struct socket, so_rcv));
849 #endif
850 
851 	so->sol_sbrcv_lowat = sbrcv_lowat;
852 	so->sol_sbsnd_lowat = sbsnd_lowat;
853 	so->sol_sbrcv_hiwat = sbrcv_hiwat;
854 	so->sol_sbsnd_hiwat = sbsnd_hiwat;
855 	so->sol_sbrcv_flags = sbrcv_flags;
856 	so->sol_sbsnd_flags = sbsnd_flags;
857 	so->sol_sbrcv_timeo = sbrcv_timeo;
858 	so->sol_sbsnd_timeo = sbsnd_timeo;
859 
860 	so->sol_qlen = so->sol_incqlen = 0;
861 	TAILQ_INIT(&so->sol_incomp);
862 	TAILQ_INIT(&so->sol_comp);
863 
864 	so->sol_accept_filter = NULL;
865 	so->sol_accept_filter_arg = NULL;
866 	so->sol_accept_filter_str = NULL;
867 
868 	so->sol_upcall = NULL;
869 	so->sol_upcallarg = NULL;
870 
871 	so->so_options |= SO_ACCEPTCONN;
872 
873 listening:
874 	if (backlog < 0 || backlog > somaxconn)
875 		backlog = somaxconn;
876 	so->sol_qlimit = backlog;
877 }
878 
879 /*
880  * Wakeup listeners/subsystems once we have a complete connection.
881  * Enters with lock, returns unlocked.
882  */
883 void
884 solisten_wakeup(struct socket *sol)
885 {
886 
887 	if (sol->sol_upcall != NULL)
888 		(void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
889 	else {
890 		selwakeuppri(&sol->so_rdsel, PSOCK);
891 		KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
892 	}
893 	SOLISTEN_UNLOCK(sol);
894 	wakeup_one(&sol->sol_comp);
895 	if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL)
896 		pgsigio(&sol->so_sigio, SIGIO, 0);
897 }
898 
899 /*
900  * Return single connection off a listening socket queue.  Main consumer of
901  * the function is kern_accept4().  Some modules, that do their own accept
902  * management also use the function.
903  *
904  * Listening socket must be locked on entry and is returned unlocked on
905  * return.
906  * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT.
907  */
908 int
909 solisten_dequeue(struct socket *head, struct socket **ret, int flags)
910 {
911 	struct socket *so;
912 	int error;
913 
914 	SOLISTEN_LOCK_ASSERT(head);
915 
916 	while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
917 	    head->so_error == 0) {
918 		error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH,
919 		    "accept", 0);
920 		if (error != 0) {
921 			SOLISTEN_UNLOCK(head);
922 			return (error);
923 		}
924 	}
925 	if (head->so_error) {
926 		error = head->so_error;
927 		head->so_error = 0;
928 	} else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp))
929 		error = EWOULDBLOCK;
930 	else
931 		error = 0;
932 	if (error) {
933 		SOLISTEN_UNLOCK(head);
934 		return (error);
935 	}
936 	so = TAILQ_FIRST(&head->sol_comp);
937 	SOCK_LOCK(so);
938 	KASSERT(so->so_qstate == SQ_COMP,
939 	    ("%s: so %p not SQ_COMP", __func__, so));
940 	soref(so);
941 	head->sol_qlen--;
942 	so->so_qstate = SQ_NONE;
943 	so->so_listen = NULL;
944 	TAILQ_REMOVE(&head->sol_comp, so, so_list);
945 	if (flags & ACCEPT4_INHERIT)
946 		so->so_state |= (head->so_state & SS_NBIO);
947 	else
948 		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
949 	SOCK_UNLOCK(so);
950 	sorele(head);
951 
952 	*ret = so;
953 	return (0);
954 }
955 
956 /*
957  * Evaluate the reference count and named references on a socket; if no
958  * references remain, free it.  This should be called whenever a reference is
959  * released, such as in sorele(), but also when named reference flags are
960  * cleared in socket or protocol code.
961  *
962  * sofree() will free the socket if:
963  *
964  * - There are no outstanding file descriptor references or related consumers
965  *   (so_count == 0).
966  *
967  * - The socket has been closed by user space, if ever open (SS_NOFDREF).
968  *
969  * - The protocol does not have an outstanding strong reference on the socket
970  *   (SS_PROTOREF).
971  *
972  * - The socket is not in a completed connection queue, so a process has been
973  *   notified that it is present.  If it is removed, the user process may
974  *   block in accept() despite select() saying the socket was ready.
975  */
976 void
977 sofree(struct socket *so)
978 {
979 	struct protosw *pr = so->so_proto;
980 
981 	SOCK_LOCK_ASSERT(so);
982 
983 	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
984 	    (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) {
985 		SOCK_UNLOCK(so);
986 		return;
987 	}
988 
989 	if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
990 		struct socket *sol;
991 
992 		sol = so->so_listen;
993 		KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));
994 
995 		/*
996 		 * To solve race between close of a listening socket and
997 		 * a socket on its incomplete queue, we need to lock both.
998 		 * The order is first listening socket, then regular.
999 		 * Since we don't have SS_NOFDREF neither SS_PROTOREF, this
1000 		 * function and the listening socket are the only pointers
1001 		 * to so.  To preserve so and sol, we reference both and then
1002 		 * relock.
1003 		 * After relock the socket may not move to so_comp since it
1004 		 * doesn't have PCB already, but it may be removed from
1005 		 * so_incomp. If that happens, we share responsiblity on
1006 		 * freeing the socket, but soclose() has already removed
1007 		 * it from queue.
1008 		 */
1009 		soref(sol);
1010 		soref(so);
1011 		SOCK_UNLOCK(so);
1012 		SOLISTEN_LOCK(sol);
1013 		SOCK_LOCK(so);
1014 		if (so->so_qstate == SQ_INCOMP) {
1015 			KASSERT(so->so_listen == sol,
1016 			    ("%s: so %p migrated out of sol %p",
1017 			    __func__, so, sol));
1018 			TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
1019 			sol->sol_incqlen--;
1020 			/* This is guarenteed not to be the last. */
1021 			refcount_release(&sol->so_count);
1022 			so->so_qstate = SQ_NONE;
1023 			so->so_listen = NULL;
1024 		} else
1025 			KASSERT(so->so_listen == NULL,
1026 			    ("%s: so %p not on (in)comp with so_listen",
1027 			    __func__, so));
1028 		sorele(sol);
1029 		KASSERT(so->so_count == 1,
1030 		    ("%s: so %p count %u", __func__, so, so->so_count));
1031 		so->so_count = 0;
1032 	}
1033 	if (SOLISTENING(so))
1034 		so->so_error = ECONNABORTED;
1035 	SOCK_UNLOCK(so);
1036 
1037 	if (so->so_dtor != NULL)
1038 		so->so_dtor(so);
1039 
1040 	VNET_SO_ASSERT(so);
1041 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1042 		(*pr->pr_domain->dom_dispose)(so);
1043 	if (pr->pr_usrreqs->pru_detach != NULL)
1044 		(*pr->pr_usrreqs->pru_detach)(so);
1045 
1046 	/*
1047 	 * From this point on, we assume that no other references to this
1048 	 * socket exist anywhere else in the stack.  Therefore, no locks need
1049 	 * to be acquired or held.
1050 	 *
1051 	 * We used to do a lot of socket buffer and socket locking here, as
1052 	 * well as invoke sorflush() and perform wakeups.  The direct call to
1053 	 * dom_dispose() and sbdestroy() are an inlining of what was
1054 	 * necessary from sorflush().
1055 	 *
1056 	 * Notice that the socket buffer and kqueue state are torn down
1057 	 * before calling pru_detach.  This means that protocols shold not
1058 	 * assume they can perform socket wakeups, etc, in their detach code.
1059 	 */
1060 	if (!SOLISTENING(so)) {
1061 		sbdestroy(&so->so_snd, so);
1062 		sbdestroy(&so->so_rcv, so);
1063 	}
1064 	seldrain(&so->so_rdsel);
1065 	seldrain(&so->so_wrsel);
1066 	knlist_destroy(&so->so_rdsel.si_note);
1067 	knlist_destroy(&so->so_wrsel.si_note);
1068 	sodealloc(so);
1069 }
1070 
1071 /*
1072  * Close a socket on last file table reference removal.  Initiate disconnect
1073  * if connected.  Free socket when disconnect complete.
1074  *
1075  * This function will sorele() the socket.  Note that soclose() may be called
1076  * prior to the ref count reaching zero.  The actual socket structure will
1077  * not be freed until the ref count reaches zero.
1078  */
1079 int
1080 soclose(struct socket *so)
1081 {
1082 	struct accept_queue lqueue;
1083 	bool listening;
1084 	int error = 0;
1085 
1086 	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
1087 
1088 	CURVNET_SET(so->so_vnet);
1089 	funsetown(&so->so_sigio);
1090 	if (so->so_state & SS_ISCONNECTED) {
1091 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1092 			error = sodisconnect(so);
1093 			if (error) {
1094 				if (error == ENOTCONN)
1095 					error = 0;
1096 				goto drop;
1097 			}
1098 		}
1099 		if (so->so_options & SO_LINGER) {
1100 			if ((so->so_state & SS_ISDISCONNECTING) &&
1101 			    (so->so_state & SS_NBIO))
1102 				goto drop;
1103 			while (so->so_state & SS_ISCONNECTED) {
1104 				error = tsleep(&so->so_timeo,
1105 				    PSOCK | PCATCH, "soclos",
1106 				    so->so_linger * hz);
1107 				if (error)
1108 					break;
1109 			}
1110 		}
1111 	}
1112 
1113 drop:
1114 	if (so->so_proto->pr_usrreqs->pru_close != NULL)
1115 		(*so->so_proto->pr_usrreqs->pru_close)(so);
1116 
1117 	SOCK_LOCK(so);
1118 	if ((listening = (so->so_options & SO_ACCEPTCONN))) {
1119 		struct socket *sp;
1120 
1121 		TAILQ_INIT(&lqueue);
1122 		TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
1123 		TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);
1124 
1125 		so->sol_qlen = so->sol_incqlen = 0;
1126 
1127 		TAILQ_FOREACH(sp, &lqueue, so_list) {
1128 			SOCK_LOCK(sp);
1129 			sp->so_qstate = SQ_NONE;
1130 			sp->so_listen = NULL;
1131 			SOCK_UNLOCK(sp);
1132 			/* Guaranteed not to be the last. */
1133 			refcount_release(&so->so_count);
1134 		}
1135 	}
1136 	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
1137 	so->so_state |= SS_NOFDREF;
1138 	sorele(so);
1139 	if (listening) {
1140 		struct socket *sp, *tsp;
1141 
1142 		TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) {
1143 			SOCK_LOCK(sp);
1144 			if (sp->so_count == 0) {
1145 				SOCK_UNLOCK(sp);
1146 				soabort(sp);
1147 			} else
1148 				/* sp is now in sofree() */
1149 				SOCK_UNLOCK(sp);
1150 		}
1151 	}
1152 	CURVNET_RESTORE();
1153 	return (error);
1154 }
1155 
1156 /*
1157  * soabort() is used to abruptly tear down a connection, such as when a
1158  * resource limit is reached (listen queue depth exceeded), or if a listen
1159  * socket is closed while there are sockets waiting to be accepted.
1160  *
1161  * This interface is tricky, because it is called on an unreferenced socket,
1162  * and must be called only by a thread that has actually removed the socket
1163  * from the listen queue it was on, or races with other threads are risked.
1164  *
1165  * This interface will call into the protocol code, so must not be called
1166  * with any socket locks held.  Protocols do call it while holding their own
1167  * recursible protocol mutexes, but this is something that should be subject
1168  * to review in the future.
1169  */
1170 void
1171 soabort(struct socket *so)
1172 {
1173 
1174 	/*
1175 	 * In as much as is possible, assert that no references to this
1176 	 * socket are held.  This is not quite the same as asserting that the
1177 	 * current thread is responsible for arranging for no references, but
1178 	 * is as close as we can get for now.
1179 	 */
1180 	KASSERT(so->so_count == 0, ("soabort: so_count"));
1181 	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
1182 	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
1183 	VNET_SO_ASSERT(so);
1184 
1185 	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
1186 		(*so->so_proto->pr_usrreqs->pru_abort)(so);
1187 	SOCK_LOCK(so);
1188 	sofree(so);
1189 }
1190 
1191 int
1192 soaccept(struct socket *so, struct sockaddr **nam)
1193 {
1194 	int error;
1195 
1196 	SOCK_LOCK(so);
1197 	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
1198 	so->so_state &= ~SS_NOFDREF;
1199 	SOCK_UNLOCK(so);
1200 
1201 	CURVNET_SET(so->so_vnet);
1202 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1203 	CURVNET_RESTORE();
1204 	return (error);
1205 }
1206 
1207 int
1208 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
1209 {
1210 
1211 	return (soconnectat(AT_FDCWD, so, nam, td));
1212 }
1213 
1214 int
1215 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
1216 {
1217 	int error;
1218 
1219 	if (so->so_options & SO_ACCEPTCONN)
1220 		return (EOPNOTSUPP);
1221 
1222 	CURVNET_SET(so->so_vnet);
1223 	/*
1224 	 * If protocol is connection-based, can only connect once.
1225 	 * Otherwise, if connected, try to disconnect first.  This allows
1226 	 * user to disconnect by connecting to, e.g., a null address.
1227 	 */
1228 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1229 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1230 	    (error = sodisconnect(so)))) {
1231 		error = EISCONN;
1232 	} else {
1233 		/*
1234 		 * Prevent accumulated error from previous connection from
1235 		 * biting us.
1236 		 */
1237 		so->so_error = 0;
1238 		if (fd == AT_FDCWD) {
1239 			error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
1240 			    nam, td);
1241 		} else {
1242 			error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
1243 			    so, nam, td);
1244 		}
1245 	}
1246 	CURVNET_RESTORE();
1247 
1248 	return (error);
1249 }
1250 
1251 int
1252 soconnect2(struct socket *so1, struct socket *so2)
1253 {
1254 	int error;
1255 
1256 	CURVNET_SET(so1->so_vnet);
1257 	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1258 	CURVNET_RESTORE();
1259 	return (error);
1260 }
1261 
1262 int
1263 sodisconnect(struct socket *so)
1264 {
1265 	int error;
1266 
1267 	if ((so->so_state & SS_ISCONNECTED) == 0)
1268 		return (ENOTCONN);
1269 	if (so->so_state & SS_ISDISCONNECTING)
1270 		return (EALREADY);
1271 	VNET_SO_ASSERT(so);
1272 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1273 	return (error);
1274 }
1275 
/*
 * Translate send/receive MSG_* flags into the wait argument passed to
 * sblock(): 0 when MSG_DONTWAIT is set, SBL_WAIT otherwise.
 */
#define	SBLOCKWAIT(f)	((((f) & MSG_DONTWAIT) != 0) ? 0 : SBL_WAIT)
1277 
1278 int
1279 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
1280     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1281 {
1282 	long space;
1283 	ssize_t resid;
1284 	int clen = 0, error, dontroute;
1285 
1286 	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
1287 	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
1288 	    ("sosend_dgram: !PR_ATOMIC"));
1289 
1290 	if (uio != NULL)
1291 		resid = uio->uio_resid;
1292 	else
1293 		resid = top->m_pkthdr.len;
1294 	/*
1295 	 * In theory resid should be unsigned.  However, space must be
1296 	 * signed, as it might be less than 0 if we over-committed, and we
1297 	 * must use a signed comparison of space and resid.  On the other
1298 	 * hand, a negative resid causes us to loop sending 0-length
1299 	 * segments to the protocol.
1300 	 */
1301 	if (resid < 0) {
1302 		error = EINVAL;
1303 		goto out;
1304 	}
1305 
1306 	dontroute =
1307 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
1308 	if (td != NULL)
1309 		td->td_ru.ru_msgsnd++;
1310 	if (control != NULL)
1311 		clen = control->m_len;
1312 
1313 	SOCKBUF_LOCK(&so->so_snd);
1314 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1315 		SOCKBUF_UNLOCK(&so->so_snd);
1316 		error = EPIPE;
1317 		goto out;
1318 	}
1319 	if (so->so_error) {
1320 		error = so->so_error;
1321 		so->so_error = 0;
1322 		SOCKBUF_UNLOCK(&so->so_snd);
1323 		goto out;
1324 	}
1325 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1326 		/*
1327 		 * `sendto' and `sendmsg' is allowed on a connection-based
1328 		 * socket if it supports implied connect.  Return ENOTCONN if
1329 		 * not connected and no address is supplied.
1330 		 */
1331 		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1332 		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1333 			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1334 			    !(resid == 0 && clen != 0)) {
1335 				SOCKBUF_UNLOCK(&so->so_snd);
1336 				error = ENOTCONN;
1337 				goto out;
1338 			}
1339 		} else if (addr == NULL) {
1340 			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1341 				error = ENOTCONN;
1342 			else
1343 				error = EDESTADDRREQ;
1344 			SOCKBUF_UNLOCK(&so->so_snd);
1345 			goto out;
1346 		}
1347 	}
1348 
1349 	/*
1350 	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1351 	 * problem and need fixing.
1352 	 */
1353 	space = sbspace(&so->so_snd);
1354 	if (flags & MSG_OOB)
1355 		space += 1024;
1356 	space -= clen;
1357 	SOCKBUF_UNLOCK(&so->so_snd);
1358 	if (resid > space) {
1359 		error = EMSGSIZE;
1360 		goto out;
1361 	}
1362 	if (uio == NULL) {
1363 		resid = 0;
1364 		if (flags & MSG_EOR)
1365 			top->m_flags |= M_EOR;
1366 	} else {
1367 		/*
1368 		 * Copy the data from userland into a mbuf chain.
1369 		 * If no data is to be copied in, a single empty mbuf
1370 		 * is returned.
1371 		 */
1372 		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1373 		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1374 		if (top == NULL) {
1375 			error = EFAULT;	/* only possible error */
1376 			goto out;
1377 		}
1378 		space -= resid - uio->uio_resid;
1379 		resid = uio->uio_resid;
1380 	}
1381 	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1382 	/*
1383 	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1384 	 * than with.
1385 	 */
1386 	if (dontroute) {
1387 		SOCK_LOCK(so);
1388 		so->so_options |= SO_DONTROUTE;
1389 		SOCK_UNLOCK(so);
1390 	}
1391 	/*
1392 	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1393 	 * of date.  We could have received a reset packet in an interrupt or
1394 	 * maybe we slept while doing page faults in uiomove() etc.  We could
1395 	 * probably recheck again inside the locking protection here, but
1396 	 * there are probably other places that this also happens.  We must
1397 	 * rethink this.
1398 	 */
1399 	VNET_SO_ASSERT(so);
1400 	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1401 	    (flags & MSG_OOB) ? PRUS_OOB :
1402 	/*
1403 	 * If the user set MSG_EOF, the protocol understands this flag and
1404 	 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
1405 	 */
1406 	    ((flags & MSG_EOF) &&
1407 	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1408 	     (resid <= 0)) ?
1409 		PRUS_EOF :
1410 		/* If there is more to send set PRUS_MORETOCOME */
1411 		(flags & MSG_MORETOCOME) ||
1412 		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1413 		top, addr, control, td);
1414 	if (dontroute) {
1415 		SOCK_LOCK(so);
1416 		so->so_options &= ~SO_DONTROUTE;
1417 		SOCK_UNLOCK(so);
1418 	}
1419 	clen = 0;
1420 	control = NULL;
1421 	top = NULL;
1422 out:
1423 	if (top != NULL)
1424 		m_freem(top);
1425 	if (control != NULL)
1426 		m_freem(control);
1427 	return (error);
1428 }
1429 
1430 /*
1431  * Send on a socket.  If send must go all at once and message is larger than
1432  * send buffering, then hard error.  Lock against other senders.  If must go
1433  * all at once and not enough room now, then inform user that this would
1434  * block and do nothing.  Otherwise, if nonblocking, send as much as
1435  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1436  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1437  * in mbuf chain must be small enough to send all at once.
1438  *
1439  * Returns nonzero on error, timeout or signal; callers must check for short
1440  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1441  * on return.
1442  */
1443 int
1444 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1445     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1446 {
1447 	long space;
1448 	ssize_t resid;
1449 	int clen = 0, error, dontroute;
1450 	int atomic = sosendallatonce(so) || top;
1451 	int pru_flag;
1452 #ifdef KERN_TLS
1453 	struct ktls_session *tls;
1454 	int tls_enq_cnt, tls_pruflag;
1455 	uint8_t tls_rtype;
1456 
1457 	tls = NULL;
1458 	tls_rtype = TLS_RLTYPE_APP;
1459 #endif
1460 	if (uio != NULL)
1461 		resid = uio->uio_resid;
1462 	else
1463 		resid = top->m_pkthdr.len;
1464 	/*
1465 	 * In theory resid should be unsigned.  However, space must be
1466 	 * signed, as it might be less than 0 if we over-committed, and we
1467 	 * must use a signed comparison of space and resid.  On the other
1468 	 * hand, a negative resid causes us to loop sending 0-length
1469 	 * segments to the protocol.
1470 	 *
1471 	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1472 	 * type sockets since that's an error.
1473 	 */
1474 	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1475 		error = EINVAL;
1476 		goto out;
1477 	}
1478 
1479 	dontroute =
1480 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1481 	    (so->so_proto->pr_flags & PR_ATOMIC);
1482 	if (td != NULL)
1483 		td->td_ru.ru_msgsnd++;
1484 	if (control != NULL)
1485 		clen = control->m_len;
1486 
1487 	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1488 	if (error)
1489 		goto out;
1490 
1491 #ifdef KERN_TLS
1492 	tls_pruflag = 0;
1493 	tls = ktls_hold(so->so_snd.sb_tls_info);
1494 	if (tls != NULL) {
1495 		if (tls->mode == TCP_TLS_MODE_SW)
1496 			tls_pruflag = PRUS_NOTREADY;
1497 
1498 		if (control != NULL) {
1499 			struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1500 
1501 			if (clen >= sizeof(*cm) &&
1502 			    cm->cmsg_type == TLS_SET_RECORD_TYPE) {
1503 				tls_rtype = *((uint8_t *)CMSG_DATA(cm));
1504 				clen = 0;
1505 				m_freem(control);
1506 				control = NULL;
1507 				atomic = 1;
1508 			}
1509 		}
1510 	}
1511 #endif
1512 
1513 restart:
1514 	do {
1515 		SOCKBUF_LOCK(&so->so_snd);
1516 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1517 			SOCKBUF_UNLOCK(&so->so_snd);
1518 			error = EPIPE;
1519 			goto release;
1520 		}
1521 		if (so->so_error) {
1522 			error = so->so_error;
1523 			so->so_error = 0;
1524 			SOCKBUF_UNLOCK(&so->so_snd);
1525 			goto release;
1526 		}
1527 		if ((so->so_state & SS_ISCONNECTED) == 0) {
1528 			/*
1529 			 * `sendto' and `sendmsg' is allowed on a connection-
1530 			 * based socket if it supports implied connect.
1531 			 * Return ENOTCONN if not connected and no address is
1532 			 * supplied.
1533 			 */
1534 			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1535 			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1536 				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1537 				    !(resid == 0 && clen != 0)) {
1538 					SOCKBUF_UNLOCK(&so->so_snd);
1539 					error = ENOTCONN;
1540 					goto release;
1541 				}
1542 			} else if (addr == NULL) {
1543 				SOCKBUF_UNLOCK(&so->so_snd);
1544 				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1545 					error = ENOTCONN;
1546 				else
1547 					error = EDESTADDRREQ;
1548 				goto release;
1549 			}
1550 		}
1551 		space = sbspace(&so->so_snd);
1552 		if (flags & MSG_OOB)
1553 			space += 1024;
1554 		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1555 		    clen > so->so_snd.sb_hiwat) {
1556 			SOCKBUF_UNLOCK(&so->so_snd);
1557 			error = EMSGSIZE;
1558 			goto release;
1559 		}
1560 		if (space < resid + clen &&
1561 		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1562 			if ((so->so_state & SS_NBIO) ||
1563 			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
1564 				SOCKBUF_UNLOCK(&so->so_snd);
1565 				error = EWOULDBLOCK;
1566 				goto release;
1567 			}
1568 			error = sbwait(&so->so_snd);
1569 			SOCKBUF_UNLOCK(&so->so_snd);
1570 			if (error)
1571 				goto release;
1572 			goto restart;
1573 		}
1574 		SOCKBUF_UNLOCK(&so->so_snd);
1575 		space -= clen;
1576 		do {
1577 			if (uio == NULL) {
1578 				resid = 0;
1579 				if (flags & MSG_EOR)
1580 					top->m_flags |= M_EOR;
1581 			} else {
1582 				/*
1583 				 * Copy the data from userland into a mbuf
1584 				 * chain.  If resid is 0, which can happen
1585 				 * only if we have control to send, then
1586 				 * a single empty mbuf is returned.  This
1587 				 * is a workaround to prevent protocol send
1588 				 * methods to panic.
1589 				 */
1590 #ifdef KERN_TLS
1591 				if (tls != NULL) {
1592 					top = m_uiotombuf(uio, M_WAITOK, space,
1593 					    tls->params.max_frame_len,
1594 					    M_NOMAP |
1595 					    ((flags & MSG_EOR) ? M_EOR : 0));
1596 					if (top != NULL) {
1597 						ktls_frame(top, tls,
1598 						    &tls_enq_cnt, tls_rtype);
1599 					}
1600 					tls_rtype = TLS_RLTYPE_APP;
1601 				} else
1602 #endif
1603 					top = m_uiotombuf(uio, M_WAITOK, space,
1604 					    (atomic ? max_hdr : 0),
1605 					    (atomic ? M_PKTHDR : 0) |
1606 					    ((flags & MSG_EOR) ? M_EOR : 0));
1607 				if (top == NULL) {
1608 					error = EFAULT; /* only possible error */
1609 					goto release;
1610 				}
1611 				space -= resid - uio->uio_resid;
1612 				resid = uio->uio_resid;
1613 			}
1614 			if (dontroute) {
1615 				SOCK_LOCK(so);
1616 				so->so_options |= SO_DONTROUTE;
1617 				SOCK_UNLOCK(so);
1618 			}
1619 			/*
1620 			 * XXX all the SBS_CANTSENDMORE checks previously
1621 			 * done could be out of date.  We could have received
1622 			 * a reset packet in an interrupt or maybe we slept
1623 			 * while doing page faults in uiomove() etc.  We
1624 			 * could probably recheck again inside the locking
1625 			 * protection here, but there are probably other
1626 			 * places that this also happens.  We must rethink
1627 			 * this.
1628 			 */
1629 			VNET_SO_ASSERT(so);
1630 
1631 			pru_flag = (flags & MSG_OOB) ? PRUS_OOB :
1632 			/*
1633 			 * If the user set MSG_EOF, the protocol understands
1634 			 * this flag and nothing left to send then use
1635 			 * PRU_SEND_EOF instead of PRU_SEND.
1636 			 */
1637 			    ((flags & MSG_EOF) &&
1638 			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1639 			     (resid <= 0)) ?
1640 				PRUS_EOF :
1641 			/* If there is more to send set PRUS_MORETOCOME. */
1642 			    (flags & MSG_MORETOCOME) ||
1643 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1644 
1645 #ifdef KERN_TLS
1646 			pru_flag |= tls_pruflag;
1647 #endif
1648 
1649 			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1650 			    pru_flag, top, addr, control, td);
1651 
1652 			if (dontroute) {
1653 				SOCK_LOCK(so);
1654 				so->so_options &= ~SO_DONTROUTE;
1655 				SOCK_UNLOCK(so);
1656 			}
1657 
1658 #ifdef KERN_TLS
1659 			if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
1660 				/*
1661 				 * Note that error is intentionally
1662 				 * ignored.
1663 				 *
1664 				 * Like sendfile(), we rely on the
1665 				 * completion routine (pru_ready())
1666 				 * to free the mbufs in the event that
1667 				 * pru_send() encountered an error and
1668 				 * did not append them to the sockbuf.
1669 				 */
1670 				soref(so);
1671 				ktls_enqueue(top, so, tls_enq_cnt);
1672 			}
1673 #endif
1674 			clen = 0;
1675 			control = NULL;
1676 			top = NULL;
1677 			if (error)
1678 				goto release;
1679 		} while (resid && space > 0);
1680 	} while (resid);
1681 
1682 release:
1683 	sbunlock(&so->so_snd);
1684 out:
1685 #ifdef KERN_TLS
1686 	if (tls != NULL)
1687 		ktls_free(tls);
1688 #endif
1689 	if (top != NULL)
1690 		m_freem(top);
1691 	if (control != NULL)
1692 		m_freem(control);
1693 	return (error);
1694 }
1695 
1696 int
1697 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1698     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1699 {
1700 	int error;
1701 
1702 	CURVNET_SET(so->so_vnet);
1703 	if (!SOLISTENING(so))
1704 		error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio,
1705 		    top, control, flags, td);
1706 	else {
1707 		m_freem(top);
1708 		m_freem(control);
1709 		error = ENOTCONN;
1710 	}
1711 	CURVNET_RESTORE();
1712 	return (error);
1713 }
1714 
1715 /*
1716  * The part of soreceive() that implements reading non-inline out-of-band
1717  * data from a socket.  For more complete comments, see soreceive(), from
1718  * which this code originated.
1719  *
1720  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1721  * unable to return an mbuf chain to the caller.
1722  */
1723 static int
1724 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1725 {
1726 	struct protosw *pr = so->so_proto;
1727 	struct mbuf *m;
1728 	int error;
1729 
1730 	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1731 	VNET_SO_ASSERT(so);
1732 
1733 	m = m_get(M_WAITOK, MT_DATA);
1734 	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1735 	if (error)
1736 		goto bad;
1737 	do {
1738 		error = uiomove(mtod(m, void *),
1739 		    (int) min(uio->uio_resid, m->m_len), uio);
1740 		m = m_free(m);
1741 	} while (uio->uio_resid && error == 0 && m);
1742 bad:
1743 	if (m != NULL)
1744 		m_freem(m);
1745 	return (error);
1746 }
1747 
1748 /*
1749  * Following replacement or removal of the first mbuf on the first mbuf chain
1750  * of a socket buffer, push necessary state changes back into the socket
1751  * buffer so that other consumers see the values consistently.  'nextrecord'
1752  * is the callers locally stored value of the original value of
1753  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1754  * NOTE: 'nextrecord' may be NULL.
1755  */
1756 static __inline void
1757 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1758 {
1759 
1760 	SOCKBUF_LOCK_ASSERT(sb);
1761 	/*
1762 	 * First, update for the new value of nextrecord.  If necessary, make
1763 	 * it the first record.
1764 	 */
1765 	if (sb->sb_mb != NULL)
1766 		sb->sb_mb->m_nextpkt = nextrecord;
1767 	else
1768 		sb->sb_mb = nextrecord;
1769 
1770 	/*
1771 	 * Now update any dependent socket buffer fields to reflect the new
1772 	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1773 	 * addition of a second clause that takes care of the case where
1774 	 * sb_mb has been updated, but remains the last record.
1775 	 */
1776 	if (sb->sb_mb == NULL) {
1777 		sb->sb_mbtail = NULL;
1778 		sb->sb_lastrecord = NULL;
1779 	} else if (sb->sb_mb->m_nextpkt == NULL)
1780 		sb->sb_lastrecord = sb->sb_mb;
1781 }
1782 
1783 /*
1784  * Implement receive operations on a socket.  We depend on the way that
1785  * records are added to the sockbuf by sbappend.  In particular, each record
1786  * (mbufs linked through m_next) must begin with an address if the protocol
1787  * so specifies, followed by an optional mbuf or mbufs containing ancillary
1788  * data, and then zero or more mbufs of data.  In order to allow parallelism
1789  * between network receive and copying to user space, as well as avoid
1790  * sleeping with a mutex held, we release the socket buffer mutex during the
1791  * user space copy.  Although the sockbuf is locked, new data may still be
1792  * appended, and thus we must maintain consistency of the sockbuf during that
1793  * time.
1794  *
1795  * The caller may receive the data as a single mbuf chain by supplying an
1796  * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1797  * the count in uio_resid.
1798  */
1799 int
1800 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1801     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1802 {
1803 	struct mbuf *m, **mp;
1804 	int flags, error, offset;
1805 	ssize_t len;
1806 	struct protosw *pr = so->so_proto;
1807 	struct mbuf *nextrecord;
1808 	int moff, type = 0;
1809 	ssize_t orig_resid = uio->uio_resid;
1810 
1811 	mp = mp0;
1812 	if (psa != NULL)
1813 		*psa = NULL;
1814 	if (controlp != NULL)
1815 		*controlp = NULL;
1816 	if (flagsp != NULL)
1817 		flags = *flagsp &~ MSG_EOR;
1818 	else
1819 		flags = 0;
1820 	if (flags & MSG_OOB)
1821 		return (soreceive_rcvoob(so, uio, flags));
1822 	if (mp != NULL)
1823 		*mp = NULL;
1824 	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1825 	    && uio->uio_resid) {
1826 		VNET_SO_ASSERT(so);
1827 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1828 	}
1829 
1830 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1831 	if (error)
1832 		return (error);
1833 
1834 restart:
1835 	SOCKBUF_LOCK(&so->so_rcv);
1836 	m = so->so_rcv.sb_mb;
1837 	/*
1838 	 * If we have less data than requested, block awaiting more (subject
1839 	 * to any timeout) if:
1840 	 *   1. the current count is less than the low water mark, or
1841 	 *   2. MSG_DONTWAIT is not set
1842 	 */
1843 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1844 	    sbavail(&so->so_rcv) < uio->uio_resid) &&
1845 	    sbavail(&so->so_rcv) < so->so_rcv.sb_lowat &&
1846 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1847 		KASSERT(m != NULL || !sbavail(&so->so_rcv),
1848 		    ("receive: m == %p sbavail == %u",
1849 		    m, sbavail(&so->so_rcv)));
1850 		if (so->so_error) {
1851 			if (m != NULL)
1852 				goto dontblock;
1853 			error = so->so_error;
1854 			if ((flags & MSG_PEEK) == 0)
1855 				so->so_error = 0;
1856 			SOCKBUF_UNLOCK(&so->so_rcv);
1857 			goto release;
1858 		}
1859 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1860 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1861 			if (m == NULL) {
1862 				SOCKBUF_UNLOCK(&so->so_rcv);
1863 				goto release;
1864 			} else
1865 				goto dontblock;
1866 		}
1867 		for (; m != NULL; m = m->m_next)
1868 			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1869 				m = so->so_rcv.sb_mb;
1870 				goto dontblock;
1871 			}
1872 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1873 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1874 			SOCKBUF_UNLOCK(&so->so_rcv);
1875 			error = ENOTCONN;
1876 			goto release;
1877 		}
1878 		if (uio->uio_resid == 0) {
1879 			SOCKBUF_UNLOCK(&so->so_rcv);
1880 			goto release;
1881 		}
1882 		if ((so->so_state & SS_NBIO) ||
1883 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1884 			SOCKBUF_UNLOCK(&so->so_rcv);
1885 			error = EWOULDBLOCK;
1886 			goto release;
1887 		}
1888 		SBLASTRECORDCHK(&so->so_rcv);
1889 		SBLASTMBUFCHK(&so->so_rcv);
1890 		error = sbwait(&so->so_rcv);
1891 		SOCKBUF_UNLOCK(&so->so_rcv);
1892 		if (error)
1893 			goto release;
1894 		goto restart;
1895 	}
1896 dontblock:
1897 	/*
1898 	 * From this point onward, we maintain 'nextrecord' as a cache of the
1899 	 * pointer to the next record in the socket buffer.  We must keep the
1900 	 * various socket buffer pointers and local stack versions of the
1901 	 * pointers in sync, pushing out modifications before dropping the
1902 	 * socket buffer mutex, and re-reading them when picking it up.
1903 	 *
1904 	 * Otherwise, we will race with the network stack appending new data
1905 	 * or records onto the socket buffer by using inconsistent/stale
1906 	 * versions of the field, possibly resulting in socket buffer
1907 	 * corruption.
1908 	 *
1909 	 * By holding the high-level sblock(), we prevent simultaneous
1910 	 * readers from pulling off the front of the socket buffer.
1911 	 */
1912 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1913 	if (uio->uio_td)
1914 		uio->uio_td->td_ru.ru_msgrcv++;
1915 	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1916 	SBLASTRECORDCHK(&so->so_rcv);
1917 	SBLASTMBUFCHK(&so->so_rcv);
1918 	nextrecord = m->m_nextpkt;
1919 	if (pr->pr_flags & PR_ADDR) {
1920 		KASSERT(m->m_type == MT_SONAME,
1921 		    ("m->m_type == %d", m->m_type));
1922 		orig_resid = 0;
1923 		if (psa != NULL)
1924 			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1925 			    M_NOWAIT);
1926 		if (flags & MSG_PEEK) {
1927 			m = m->m_next;
1928 		} else {
1929 			sbfree(&so->so_rcv, m);
1930 			so->so_rcv.sb_mb = m_free(m);
1931 			m = so->so_rcv.sb_mb;
1932 			sockbuf_pushsync(&so->so_rcv, nextrecord);
1933 		}
1934 	}
1935 
1936 	/*
1937 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1938 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1939 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1940 	 * perform externalization (or freeing if controlp == NULL).
1941 	 */
1942 	if (m != NULL && m->m_type == MT_CONTROL) {
1943 		struct mbuf *cm = NULL, *cmn;
1944 		struct mbuf **cme = &cm;
1945 
1946 		do {
1947 			if (flags & MSG_PEEK) {
1948 				if (controlp != NULL) {
1949 					*controlp = m_copym(m, 0, m->m_len,
1950 					    M_NOWAIT);
1951 					controlp = &(*controlp)->m_next;
1952 				}
1953 				m = m->m_next;
1954 			} else {
1955 				sbfree(&so->so_rcv, m);
1956 				so->so_rcv.sb_mb = m->m_next;
1957 				m->m_next = NULL;
1958 				*cme = m;
1959 				cme = &(*cme)->m_next;
1960 				m = so->so_rcv.sb_mb;
1961 			}
1962 		} while (m != NULL && m->m_type == MT_CONTROL);
1963 		if ((flags & MSG_PEEK) == 0)
1964 			sockbuf_pushsync(&so->so_rcv, nextrecord);
1965 		while (cm != NULL) {
1966 			cmn = cm->m_next;
1967 			cm->m_next = NULL;
1968 			if (pr->pr_domain->dom_externalize != NULL) {
1969 				SOCKBUF_UNLOCK(&so->so_rcv);
1970 				VNET_SO_ASSERT(so);
1971 				error = (*pr->pr_domain->dom_externalize)
1972 				    (cm, controlp, flags);
1973 				SOCKBUF_LOCK(&so->so_rcv);
1974 			} else if (controlp != NULL)
1975 				*controlp = cm;
1976 			else
1977 				m_freem(cm);
1978 			if (controlp != NULL) {
1979 				orig_resid = 0;
1980 				while (*controlp != NULL)
1981 					controlp = &(*controlp)->m_next;
1982 			}
1983 			cm = cmn;
1984 		}
1985 		if (m != NULL)
1986 			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1987 		else
1988 			nextrecord = so->so_rcv.sb_mb;
1989 		orig_resid = 0;
1990 	}
1991 	if (m != NULL) {
1992 		if ((flags & MSG_PEEK) == 0) {
1993 			KASSERT(m->m_nextpkt == nextrecord,
1994 			    ("soreceive: post-control, nextrecord !sync"));
1995 			if (nextrecord == NULL) {
1996 				KASSERT(so->so_rcv.sb_mb == m,
1997 				    ("soreceive: post-control, sb_mb!=m"));
1998 				KASSERT(so->so_rcv.sb_lastrecord == m,
1999 				    ("soreceive: post-control, lastrecord!=m"));
2000 			}
2001 		}
2002 		type = m->m_type;
2003 		if (type == MT_OOBDATA)
2004 			flags |= MSG_OOB;
2005 	} else {
2006 		if ((flags & MSG_PEEK) == 0) {
2007 			KASSERT(so->so_rcv.sb_mb == nextrecord,
2008 			    ("soreceive: sb_mb != nextrecord"));
2009 			if (so->so_rcv.sb_mb == NULL) {
2010 				KASSERT(so->so_rcv.sb_lastrecord == NULL,
2011 				    ("soreceive: sb_lastercord != NULL"));
2012 			}
2013 		}
2014 	}
2015 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2016 	SBLASTRECORDCHK(&so->so_rcv);
2017 	SBLASTMBUFCHK(&so->so_rcv);
2018 
2019 	/*
2020 	 * Now continue to read any data mbufs off of the head of the socket
2021 	 * buffer until the read request is satisfied.  Note that 'type' is
2022 	 * used to store the type of any mbuf reads that have happened so far
2023 	 * such that soreceive() can stop reading if the type changes, which
2024 	 * causes soreceive() to return only one of regular data and inline
2025 	 * out-of-band data in a single socket receive operation.
2026 	 */
2027 	moff = 0;
2028 	offset = 0;
2029 	while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0
2030 	    && error == 0) {
2031 		/*
2032 		 * If the type of mbuf has changed since the last mbuf
2033 		 * examined ('type'), end the receive operation.
2034 		 */
2035 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2036 		if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
2037 			if (type != m->m_type)
2038 				break;
2039 		} else if (type == MT_OOBDATA)
2040 			break;
2041 		else
2042 		    KASSERT(m->m_type == MT_DATA,
2043 			("m->m_type == %d", m->m_type));
2044 		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
2045 		len = uio->uio_resid;
2046 		if (so->so_oobmark && len > so->so_oobmark - offset)
2047 			len = so->so_oobmark - offset;
2048 		if (len > m->m_len - moff)
2049 			len = m->m_len - moff;
2050 		/*
2051 		 * If mp is set, just pass back the mbufs.  Otherwise copy
2052 		 * them out via the uio, then free.  Sockbuf must be
2053 		 * consistent here (points to current mbuf, it points to next
2054 		 * record) when we drop priority; we must note any additions
2055 		 * to the sockbuf when we block interrupts again.
2056 		 */
2057 		if (mp == NULL) {
2058 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2059 			SBLASTRECORDCHK(&so->so_rcv);
2060 			SBLASTMBUFCHK(&so->so_rcv);
2061 			SOCKBUF_UNLOCK(&so->so_rcv);
2062 			if ((m->m_flags & M_NOMAP) != 0)
2063 				error = m_unmappedtouio(m, moff, uio, (int)len);
2064 			else
2065 				error = uiomove(mtod(m, char *) + moff,
2066 				    (int)len, uio);
2067 			SOCKBUF_LOCK(&so->so_rcv);
2068 			if (error) {
2069 				/*
2070 				 * The MT_SONAME mbuf has already been removed
2071 				 * from the record, so it is necessary to
2072 				 * remove the data mbufs, if any, to preserve
2073 				 * the invariant in the case of PR_ADDR that
2074 				 * requires MT_SONAME mbufs at the head of
2075 				 * each record.
2076 				 */
2077 				if (pr->pr_flags & PR_ATOMIC &&
2078 				    ((flags & MSG_PEEK) == 0))
2079 					(void)sbdroprecord_locked(&so->so_rcv);
2080 				SOCKBUF_UNLOCK(&so->so_rcv);
2081 				goto release;
2082 			}
2083 		} else
2084 			uio->uio_resid -= len;
2085 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2086 		if (len == m->m_len - moff) {
2087 			if (m->m_flags & M_EOR)
2088 				flags |= MSG_EOR;
2089 			if (flags & MSG_PEEK) {
2090 				m = m->m_next;
2091 				moff = 0;
2092 			} else {
2093 				nextrecord = m->m_nextpkt;
2094 				sbfree(&so->so_rcv, m);
2095 				if (mp != NULL) {
2096 					m->m_nextpkt = NULL;
2097 					*mp = m;
2098 					mp = &m->m_next;
2099 					so->so_rcv.sb_mb = m = m->m_next;
2100 					*mp = NULL;
2101 				} else {
2102 					so->so_rcv.sb_mb = m_free(m);
2103 					m = so->so_rcv.sb_mb;
2104 				}
2105 				sockbuf_pushsync(&so->so_rcv, nextrecord);
2106 				SBLASTRECORDCHK(&so->so_rcv);
2107 				SBLASTMBUFCHK(&so->so_rcv);
2108 			}
2109 		} else {
2110 			if (flags & MSG_PEEK)
2111 				moff += len;
2112 			else {
2113 				if (mp != NULL) {
2114 					if (flags & MSG_DONTWAIT) {
2115 						*mp = m_copym(m, 0, len,
2116 						    M_NOWAIT);
2117 						if (*mp == NULL) {
2118 							/*
2119 							 * m_copym() couldn't
2120 							 * allocate an mbuf.
2121 							 * Adjust uio_resid back
2122 							 * (it was adjusted
2123 							 * down by len bytes,
2124 							 * which we didn't end
2125 							 * up "copying" over).
2126 							 */
2127 							uio->uio_resid += len;
2128 							break;
2129 						}
2130 					} else {
2131 						SOCKBUF_UNLOCK(&so->so_rcv);
2132 						*mp = m_copym(m, 0, len,
2133 						    M_WAITOK);
2134 						SOCKBUF_LOCK(&so->so_rcv);
2135 					}
2136 				}
2137 				sbcut_locked(&so->so_rcv, len);
2138 			}
2139 		}
2140 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2141 		if (so->so_oobmark) {
2142 			if ((flags & MSG_PEEK) == 0) {
2143 				so->so_oobmark -= len;
2144 				if (so->so_oobmark == 0) {
2145 					so->so_rcv.sb_state |= SBS_RCVATMARK;
2146 					break;
2147 				}
2148 			} else {
2149 				offset += len;
2150 				if (offset == so->so_oobmark)
2151 					break;
2152 			}
2153 		}
2154 		if (flags & MSG_EOR)
2155 			break;
2156 		/*
2157 		 * If the MSG_WAITALL flag is set (for non-atomic socket), we
2158 		 * must not quit until "uio->uio_resid == 0" or an error
2159 		 * termination.  If a signal/timeout occurs, return with a
2160 		 * short count but without error.  Keep sockbuf locked
2161 		 * against other readers.
2162 		 */
2163 		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
2164 		    !sosendallatonce(so) && nextrecord == NULL) {
2165 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2166 			if (so->so_error ||
2167 			    so->so_rcv.sb_state & SBS_CANTRCVMORE)
2168 				break;
2169 			/*
2170 			 * Notify the protocol that some data has been
2171 			 * drained before blocking.
2172 			 */
2173 			if (pr->pr_flags & PR_WANTRCVD) {
2174 				SOCKBUF_UNLOCK(&so->so_rcv);
2175 				VNET_SO_ASSERT(so);
2176 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
2177 				SOCKBUF_LOCK(&so->so_rcv);
2178 			}
2179 			SBLASTRECORDCHK(&so->so_rcv);
2180 			SBLASTMBUFCHK(&so->so_rcv);
2181 			/*
2182 			 * We could receive some data while was notifying
2183 			 * the protocol. Skip blocking in this case.
2184 			 */
2185 			if (so->so_rcv.sb_mb == NULL) {
2186 				error = sbwait(&so->so_rcv);
2187 				if (error) {
2188 					SOCKBUF_UNLOCK(&so->so_rcv);
2189 					goto release;
2190 				}
2191 			}
2192 			m = so->so_rcv.sb_mb;
2193 			if (m != NULL)
2194 				nextrecord = m->m_nextpkt;
2195 		}
2196 	}
2197 
2198 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2199 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
2200 		flags |= MSG_TRUNC;
2201 		if ((flags & MSG_PEEK) == 0)
2202 			(void) sbdroprecord_locked(&so->so_rcv);
2203 	}
2204 	if ((flags & MSG_PEEK) == 0) {
2205 		if (m == NULL) {
2206 			/*
2207 			 * First part is an inline SB_EMPTY_FIXUP().  Second
2208 			 * part makes sure sb_lastrecord is up-to-date if
2209 			 * there is still data in the socket buffer.
2210 			 */
2211 			so->so_rcv.sb_mb = nextrecord;
2212 			if (so->so_rcv.sb_mb == NULL) {
2213 				so->so_rcv.sb_mbtail = NULL;
2214 				so->so_rcv.sb_lastrecord = NULL;
2215 			} else if (nextrecord->m_nextpkt == NULL)
2216 				so->so_rcv.sb_lastrecord = nextrecord;
2217 		}
2218 		SBLASTRECORDCHK(&so->so_rcv);
2219 		SBLASTMBUFCHK(&so->so_rcv);
2220 		/*
2221 		 * If soreceive() is being done from the socket callback,
2222 		 * then don't need to generate ACK to peer to update window,
2223 		 * since ACK will be generated on return to TCP.
2224 		 */
2225 		if (!(flags & MSG_SOCALLBCK) &&
2226 		    (pr->pr_flags & PR_WANTRCVD)) {
2227 			SOCKBUF_UNLOCK(&so->so_rcv);
2228 			VNET_SO_ASSERT(so);
2229 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
2230 			SOCKBUF_LOCK(&so->so_rcv);
2231 		}
2232 	}
2233 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2234 	if (orig_resid == uio->uio_resid && orig_resid &&
2235 	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
2236 		SOCKBUF_UNLOCK(&so->so_rcv);
2237 		goto restart;
2238 	}
2239 	SOCKBUF_UNLOCK(&so->so_rcv);
2240 
2241 	if (flagsp != NULL)
2242 		*flagsp |= flags;
2243 release:
2244 	sbunlock(&so->so_rcv);
2245 	return (error);
2246 }
2247 
/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 *
 * Delivers up to uio->uio_resid bytes from the receive buffer so->so_rcv,
 * either by copying them out through 'uio' or, when 'mp0' is supplied, by
 * handing back an mbuf chain.
 *
 * Parameters:
 *   so       - socket to read from; anything other than SOCK_STREAM is
 *              rejected with EINVAL.
 *   psa      - if non-NULL, *psa is set to NULL.
 *   uio      - describes the request; uio_resid is reduced by the amount
 *              delivered.  A zero uio_resid on entry yields EINVAL.
 *   mp0      - if non-NULL, receives the data as an mbuf chain instead of
 *              the data being copied through 'uio'.
 *   controlp - if non-NULL, *controlp is set to NULL.
 *   flagsp   - if non-NULL, supplies the MSG_* flags (MSG_EOR is masked
 *              off).  This function never writes through flagsp.
 *
 * Returns 0 on success or an errno value: EINVAL, ENOTCONN, EAGAIN,
 * ENOBUFS, the pending so->so_error, or whatever sblock(), sbwait(),
 * m_mbuftouio() or soreceive_rcvoob() return.
 */
int
soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (controlp != NULL)
		*controlp = NULL;
	/* Out-of-band reads are handled entirely by soreceive_rcvoob(). */
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error)
		return (error);
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	/* Remember the request size so progress can be detected later. */
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are or were connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		error = ENOTCONN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/*
	 * Abort if socket has reported problems.  Buffered data is still
	 * delivered first.  If this call already delivered something, return
	 * success and leave so_error in place; otherwise report the error
	 * and clear it unless we are only peeking.
	 */
	if (so->so_error) {
		if (sbavail(sb) > 0)
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb) > 0)
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat) ) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/*
		 * Dequeue as many mbufs as possible.  Whole mbufs are
		 * unlinked from the socket buffer and handed to the caller
		 * without copying; 'len' is reduced to the bytes that remain
		 * to be delivered from a partially consumed mbuf.
		 */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			if (*mp0 == NULL)
				*mp0 = sb->sb_mb;
			else
				m_cat(*mp0, sb->sb_mb);
			for (m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				KASSERT(!(m->m_flags & M_NOTAVAIL),
				    ("%s: m %p not available", __func__, m));
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			/*
			 * 'n' is the last mbuf handed over; it is non-NULL
			 * because the enclosing condition guarantees the
			 * first mbuf passes the loop test.  Cut the chain
			 * after it and leave the rest in the socket buffer.
			 */
			n->m_next = NULL;
			sb->sb_mb = m;
			sb->sb_lastrecord = sb->sb_mb;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= len;
			/*
			 * NOTE(review): 'm' may be NULL here after a failed
			 * copy; this relies on m_cat() tolerating a NULL
			 * second argument -- confirm.
			 */
			if (*mp0 != NULL)
				m_cat(*mp0, m);
			else
				*mp0 = m;
			/* Nothing was dequeued and the copy failed. */
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/*
		 * Notify protocol that we drained some data.  The call is
		 * made with the sockbuf mutex dropped.  It is skipped for
		 * MSG_SOCALLBCK callers unless MSG_WAITALL still has data
		 * outstanding.
		 */
		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
		     !(flags & MSG_SOCALLBCK))) {
			SOCKBUF_UNLOCK(sb);
			VNET_SO_ASSERT(so);
			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(sb);
		}
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	/* Common exit: sockbuf mutex is held here; release it and the sblock. */
	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);
	return (error);
}
2447 
/*
 * Optimized version of soreceive() for simple datagram cases from userspace.
 * Unlike in the stream case, we're able to drop a datagram if copyout()
 * fails, and because we handle datagrams atomically, we don't need to use a
 * sleep lock to prevent I/O interlacing.
 *
 * Parameters:
 *   so       - socket to read from.
 *   psa      - if non-NULL, receives a copy of the sender's address when
 *              the protocol has PR_ADDR set; otherwise left NULL.
 *   uio      - destination for the datagram payload.
 *   mp0      - must be NULL for the fast path; a non-NULL value diverts
 *              the call to soreceive_generic().
 *   controlp - if non-NULL, receives any control mbufs of the datagram.
 *   flagsp   - if non-NULL, supplies MSG_* flags (MSG_EOR masked off) and
 *              has the resulting flags ORed in on the normal return path.
 *
 * MSG_PEEK and MSG_OOB requests are also diverted to soreceive_generic().
 *
 * Returns 0 on success (including when the payload was truncated, in which
 * case MSG_TRUNC is reported), or an errno value: the pending so->so_error,
 * EWOULDBLOCK, or an error from sbwait() or uiomove().
 */
int
soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *m2;
	int flags, error;
	ssize_t len;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;

	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	/*
	 * For any complicated cases, fall back to the full
	 * soreceive_generic().
	 */
	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));

	/*
	 * Enforce restrictions on use.
	 */
	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
	    ("soreceive_dgram: wantrcvd"));
	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
	    ("soreceive_dgram: SBS_RCVATMARK"));
	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
	    ("soreceive_dgram: P_CONNREQUIRED"));

	/*
	 * Loop blocking while waiting for a datagram.  Every early return
	 * inside the loop drops the sockbuf mutex first.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	while ((m = so->so_rcv.sb_mb) == NULL) {
		KASSERT(sbavail(&so->so_rcv) == 0,
		    ("soreceive_dgram: sb_mb NULL but sbavail %u",
		    sbavail(&so->so_rcv)));
		/* A pending socket error is reported once and then cleared. */
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
		/* Nothing more will arrive, or nothing was asked for. */
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
		    uio->uio_resid == 0) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (0);
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (EWOULDBLOCK);
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (nextrecord == NULL) {
		KASSERT(so->so_rcv.sb_lastrecord == m,
		    ("soreceive_dgram: lastrecord != m"));
	}

	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
	    ("soreceive_dgram: m_nextpkt != nextrecord"));

	/*
	 * Pull 'm' and its chain off the front of the packet queue.
	 */
	so->so_rcv.sb_mb = NULL;
	sockbuf_pushsync(&so->so_rcv, nextrecord);

	/*
	 * Walk 'm's chain and free that many bytes from the socket buffer.
	 */
	for (m2 = m; m2 != NULL; m2 = m2->m_next)
		sbfree(&so->so_rcv, m2);

	/*
	 * Do a few last checks before we let go of the lock.
	 */
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);

	/*
	 * From here on the record is private to this thread; the sockbuf
	 * mutex is no longer held.
	 */
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		/*
		 * NOTE(review): the copy is made with M_NOWAIT, so *psa
		 * presumably stays NULL if the allocation fails -- confirm.
		 */
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		/* Drop the address mbuf and advance to the rest of the record. */
		m = m_free(m);
	}
	if (m == NULL) {
		/* XXXRW: Can this happen? */
		return (0);
	}

	/*
	 * Packet to copyout() is now in 'm' and it is disconnected from the
	 * queue.
	 *
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  We call into the
	 * protocol to perform externalization (or freeing if controlp ==
	 * NULL). In some cases there can be only MT_CONTROL mbufs without
	 * MT_DATA mbufs.
	 */
	if (m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		/* Split the leading control mbufs off into the 'cm' list. */
		do {
			m2 = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = m2;
		} while (m != NULL && m->m_type == MT_CONTROL);
		/*
		 * Hand each control mbuf to the domain's externalize routine
		 * if there is one; otherwise pass it to the caller or free
		 * it.
		 *
		 * NOTE(review): the value stored in 'error' by
		 * dom_externalize is never examined in this function; it is
		 * either overwritten by uiomove() below or discarded by the
		 * final return (0).
		 */
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp, flags);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			/* Advance controlp to the tail of what was appended. */
			if (controlp != NULL) {
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
	}
	KASSERT(m == NULL || m->m_type == MT_DATA,
	    ("soreceive_dgram: !data"));
	/* Copy the payload out, freeing each mbuf once it is fully consumed. */
	while (m != NULL && uio->uio_resid > 0) {
		len = uio->uio_resid;
		if (len > m->m_len)
			len = m->m_len;
		error = uiomove(mtod(m, char *), (int)len, uio);
		if (error) {
			/* The datagram is dropped on copyout failure. */
			m_freem(m);
			return (error);
		}
		if (len == m->m_len)
			m = m_free(m);
		else {
			m->m_data += len;
			m->m_len -= len;
		}
	}
	/* Leftover data did not fit: report truncation and discard it. */
	if (m != NULL) {
		flags |= MSG_TRUNC;
		m_freem(m);
	}
	if (flagsp != NULL)
		*flagsp |= flags;
	return (0);
}
2635 
2636 int
2637 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2638     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2639 {
2640 	int error;
2641 
2642 	CURVNET_SET(so->so_vnet);
2643 	if (!SOLISTENING(so))
2644 		error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio,
2645 		    mp0, controlp, flagsp));
2646 	else
2647 		error = ENOTCONN;
2648 	CURVNET_RESTORE();
2649 	return (error);
2650 }
2651 
2652 int
2653 soshutdown(struct socket *so, int how)
2654 {
2655 	struct protosw *pr = so->so_proto;
2656 	int error, soerror_enotconn;
2657 
2658 	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2659 		return (EINVAL);
2660 
2661 	soerror_enotconn = 0;
2662 	if ((so->so_state &
2663 	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
2664 		/*
2665 		 * POSIX mandates us to return ENOTCONN when shutdown(2) is
2666 		 * invoked on a datagram sockets, however historically we would
2667 		 * actually tear socket down. This is known to be leveraged by
2668 		 * some applications to unblock process waiting in recvXXX(2)
2669 		 * by other process that it shares that socket with. Try to meet
2670 		 * both backward-compatibility and POSIX requirements by forcing
2671 		 * ENOTCONN but still asking protocol to perform pru_shutdown().
2672 		 */
2673 		if (so->so_type != SOCK_DGRAM && !SOLISTENING(so))
2674 			return (ENOTCONN);
2675 		soerror_enotconn = 1;
2676 	}
2677 
2678 	if (SOLISTENING(so)) {
2679 		if (how != SHUT_WR) {
2680 			SOLISTEN_LOCK(so);
2681 			so->so_error = ECONNABORTED;
2682 			solisten_wakeup(so);	/* unlocks so */
2683 		}
2684 		goto done;
2685 	}
2686 
2687 	CURVNET_SET(so->so_vnet);
2688 	if (pr->pr_usrreqs->pru_flush != NULL)
2689 		(*pr->pr_usrreqs->pru_flush)(so, how);
2690 	if (how != SHUT_WR)
2691 		sorflush(so);
2692 	if (how != SHUT_RD) {
2693 		error = (*pr->pr_usrreqs->pru_shutdown)(so);
2694 		wakeup(&so->so_timeo);
2695 		CURVNET_RESTORE();
2696 		return ((error == 0 && soerror_enotconn) ? ENOTCONN : error);
2697 	}
2698 	wakeup(&so->so_timeo);
2699 	CURVNET_RESTORE();
2700 
2701 done:
2702 	return (soerror_enotconn ? ENOTCONN : 0);
2703 }
2704 
2705 void
2706 sorflush(struct socket *so)
2707 {
2708 	struct sockbuf *sb = &so->so_rcv;
2709 	struct protosw *pr = so->so_proto;
2710 	struct socket aso;
2711 
2712 	VNET_SO_ASSERT(so);
2713 
2714 	/*
2715 	 * In order to avoid calling dom_dispose with the socket buffer mutex
2716 	 * held, and in order to generally avoid holding the lock for a long
2717 	 * time, we make a copy of the socket buffer and clear the original
2718 	 * (except locks, state).  The new socket buffer copy won't have
2719 	 * initialized locks so we can only call routines that won't use or
2720 	 * assert those locks.
2721 	 *
2722 	 * Dislodge threads currently blocked in receive and wait to acquire
2723 	 * a lock against other simultaneous readers before clearing the
2724 	 * socket buffer.  Don't let our acquire be interrupted by a signal
2725 	 * despite any existing socket disposition on interruptable waiting.
2726 	 */
2727 	socantrcvmore(so);
2728 	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2729 
2730 	/*
2731 	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2732 	 * and mutex data unchanged.
2733 	 */
2734 	SOCKBUF_LOCK(sb);
2735 	bzero(&aso, sizeof(aso));
2736 	aso.so_pcb = so->so_pcb;
2737 	bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero,
2738 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2739 	bzero(&sb->sb_startzero,
2740 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2741 	SOCKBUF_UNLOCK(sb);
2742 	sbunlock(sb);
2743 
2744 	/*
2745 	 * Dispose of special rights and flush the copied socket.  Don't call
2746 	 * any unsafe routines (that rely on locks being initialized) on aso.
2747 	 */
2748 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2749 		(*pr->pr_domain->dom_dispose)(&aso);
2750 	sbrelease_internal(&aso.so_rcv, so);
2751 }
2752 
2753 /*
2754  * Wrapper for Socket established helper hook.
2755  * Parameters: socket, context of the hook point, hook id.
2756  */
2757 static int inline
2758 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
2759 {
2760 	struct socket_hhook_data hhook_data = {
2761 		.so = so,
2762 		.hctx = hctx,
2763 		.m = NULL,
2764 		.status = 0
2765 	};
2766 
2767 	CURVNET_SET(so->so_vnet);
2768 	HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd);
2769 	CURVNET_RESTORE();
2770 
2771 	/* Ugly but needed, since hhooks return void for now */
2772 	return (hhook_data.status);
2773 }
2774 
2775 /*
2776  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2777  * additional variant to handle the case where the option value needs to be
2778  * some kind of integer, but not a specific size.  In addition to their use
2779  * here, these functions are also called by the protocol-level pr_ctloutput()
2780  * routines.
2781  */
2782 int
2783 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2784 {
2785 	size_t	valsize;
2786 
2787 	/*
2788 	 * If the user gives us more than we wanted, we ignore it, but if we
2789 	 * don't get the minimum length the caller wants, we return EINVAL.
2790 	 * On success, sopt->sopt_valsize is set to however much we actually
2791 	 * retrieved.
2792 	 */
2793 	if ((valsize = sopt->sopt_valsize) < minlen)
2794 		return EINVAL;
2795 	if (valsize > len)
2796 		sopt->sopt_valsize = valsize = len;
2797 
2798 	if (sopt->sopt_td != NULL)
2799 		return (copyin(sopt->sopt_val, buf, valsize));
2800 
2801 	bcopy(sopt->sopt_val, buf, valsize);
2802 	return (0);
2803 }
2804 
2805 /*
2806  * Kernel version of setsockopt(2).
2807  *
2808  * XXX: optlen is size_t, not socklen_t
2809  */
2810 int
2811 so_setsockopt(struct socket *so, int level, int optname, void *optval,
2812     size_t optlen)
2813 {
2814 	struct sockopt sopt;
2815 
2816 	sopt.sopt_level = level;
2817 	sopt.sopt_name = optname;
2818 	sopt.sopt_dir = SOPT_SET;
2819 	sopt.sopt_val = optval;
2820 	sopt.sopt_valsize = optlen;
2821 	sopt.sopt_td = NULL;
2822 	return (sosetopt(so, &sopt));
2823 }
2824 
2825 int
2826 sosetopt(struct socket *so, struct sockopt *sopt)
2827 {
2828 	int	error, optval;
2829 	struct	linger l;
2830 	struct	timeval tv;
2831 	sbintime_t val;
2832 	uint32_t val32;
2833 #ifdef MAC
2834 	struct mac extmac;
2835 #endif
2836 
2837 	CURVNET_SET(so->so_vnet);
2838 	error = 0;
2839 	if (sopt->sopt_level != SOL_SOCKET) {
2840 		if (so->so_proto->pr_ctloutput != NULL)
2841 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
2842 		else
2843 			error = ENOPROTOOPT;
2844 	} else {
2845 		switch (sopt->sopt_name) {
2846 		case SO_ACCEPTFILTER:
2847 			error = accept_filt_setopt(so, sopt);
2848 			if (error)
2849 				goto bad;
2850 			break;
2851 
2852 		case SO_LINGER:
2853 			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2854 			if (error)
2855 				goto bad;
2856 			if (l.l_linger < 0 ||
2857 			    l.l_linger > USHRT_MAX ||
2858 			    l.l_linger > (INT_MAX / hz)) {
2859 				error = EDOM;
2860 				goto bad;
2861 			}
2862 			SOCK_LOCK(so);
2863 			so->so_linger = l.l_linger;
2864 			if (l.l_onoff)
2865 				so->so_options |= SO_LINGER;
2866 			else
2867 				so->so_options &= ~SO_LINGER;
2868 			SOCK_UNLOCK(so);
2869 			break;
2870 
2871 		case SO_DEBUG:
2872 		case SO_KEEPALIVE:
2873 		case SO_DONTROUTE:
2874 		case SO_USELOOPBACK:
2875 		case SO_BROADCAST:
2876 		case SO_REUSEADDR:
2877 		case SO_REUSEPORT:
2878 		case SO_REUSEPORT_LB:
2879 		case SO_OOBINLINE:
2880 		case SO_TIMESTAMP:
2881 		case SO_BINTIME:
2882 		case SO_NOSIGPIPE:
2883 		case SO_NO_DDP:
2884 		case SO_NO_OFFLOAD:
2885 			error = sooptcopyin(sopt, &optval, sizeof optval,
2886 			    sizeof optval);
2887 			if (error)
2888 				goto bad;
2889 			SOCK_LOCK(so);
2890 			if (optval)
2891 				so->so_options |= sopt->sopt_name;
2892 			else
2893 				so->so_options &= ~sopt->sopt_name;
2894 			SOCK_UNLOCK(so);
2895 			break;
2896 
2897 		case SO_SETFIB:
2898 			error = sooptcopyin(sopt, &optval, sizeof optval,
2899 			    sizeof optval);
2900 			if (error)
2901 				goto bad;
2902 
2903 			if (optval < 0 || optval >= rt_numfibs) {
2904 				error = EINVAL;
2905 				goto bad;
2906 			}
2907 			if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
2908 			   (so->so_proto->pr_domain->dom_family == PF_INET6) ||
2909 			   (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
2910 				so->so_fibnum = optval;
2911 			else
2912 				so->so_fibnum = 0;
2913 			break;
2914 
2915 		case SO_USER_COOKIE:
2916 			error = sooptcopyin(sopt, &val32, sizeof val32,
2917 			    sizeof val32);
2918 			if (error)
2919 				goto bad;
2920 			so->so_user_cookie = val32;
2921 			break;
2922 
2923 		case SO_SNDBUF:
2924 		case SO_RCVBUF:
2925 		case SO_SNDLOWAT:
2926 		case SO_RCVLOWAT:
2927 			error = sooptcopyin(sopt, &optval, sizeof optval,
2928 			    sizeof optval);
2929 			if (error)
2930 				goto bad;
2931 
2932 			/*
2933 			 * Values < 1 make no sense for any of these options,
2934 			 * so disallow them.
2935 			 */
2936 			if (optval < 1) {
2937 				error = EINVAL;
2938 				goto bad;
2939 			}
2940 
2941 			error = sbsetopt(so, sopt->sopt_name, optval);
2942 			break;
2943 
2944 		case SO_SNDTIMEO:
2945 		case SO_RCVTIMEO:
2946 #ifdef COMPAT_FREEBSD32
2947 			if (SV_CURPROC_FLAG(SV_ILP32)) {
2948 				struct timeval32 tv32;
2949 
2950 				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2951 				    sizeof tv32);
2952 				CP(tv32, tv, tv_sec);
2953 				CP(tv32, tv, tv_usec);
2954 			} else
2955 #endif
2956 				error = sooptcopyin(sopt, &tv, sizeof tv,
2957 				    sizeof tv);
2958 			if (error)
2959 				goto bad;
2960 			if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
2961 			    tv.tv_usec >= 1000000) {
2962 				error = EDOM;
2963 				goto bad;
2964 			}
2965 			if (tv.tv_sec > INT32_MAX)
2966 				val = SBT_MAX;
2967 			else
2968 				val = tvtosbt(tv);
2969 			switch (sopt->sopt_name) {
2970 			case SO_SNDTIMEO:
2971 				so->so_snd.sb_timeo = val;
2972 				break;
2973 			case SO_RCVTIMEO:
2974 				so->so_rcv.sb_timeo = val;
2975 				break;
2976 			}
2977 			break;
2978 
2979 		case SO_LABEL:
2980 #ifdef MAC
2981 			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2982 			    sizeof extmac);
2983 			if (error)
2984 				goto bad;
2985 			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2986 			    so, &extmac);
2987 #else
2988 			error = EOPNOTSUPP;
2989 #endif
2990 			break;
2991 
2992 		case SO_TS_CLOCK:
2993 			error = sooptcopyin(sopt, &optval, sizeof optval,
2994 			    sizeof optval);
2995 			if (error)
2996 				goto bad;
2997 			if (optval < 0 || optval > SO_TS_CLOCK_MAX) {
2998 				error = EINVAL;
2999 				goto bad;
3000 			}
3001 			so->so_ts_clock = optval;
3002 			break;
3003 
3004 		case SO_MAX_PACING_RATE:
3005 			error = sooptcopyin(sopt, &val32, sizeof(val32),
3006 			    sizeof(val32));
3007 			if (error)
3008 				goto bad;
3009 			so->so_max_pacing_rate = val32;
3010 			break;
3011 
3012 		default:
3013 			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
3014 				error = hhook_run_socket(so, sopt,
3015 				    HHOOK_SOCKET_OPT);
3016 			else
3017 				error = ENOPROTOOPT;
3018 			break;
3019 		}
3020 		if (error == 0 && so->so_proto->pr_ctloutput != NULL)
3021 			(void)(*so->so_proto->pr_ctloutput)(so, sopt);
3022 	}
3023 bad:
3024 	CURVNET_RESTORE();
3025 	return (error);
3026 }
3027 
3028 /*
3029  * Helper routine for getsockopt.
3030  */
3031 int
3032 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
3033 {
3034 	int	error;
3035 	size_t	valsize;
3036 
3037 	error = 0;
3038 
3039 	/*
3040 	 * Documented get behavior is that we always return a value, possibly
3041 	 * truncated to fit in the user's buffer.  Traditional behavior is
3042 	 * that we always tell the user precisely how much we copied, rather
3043 	 * than something useful like the total amount we had available for
3044 	 * her.  Note that this interface is not idempotent; the entire
3045 	 * answer must be generated ahead of time.
3046 	 */
3047 	valsize = min(len, sopt->sopt_valsize);
3048 	sopt->sopt_valsize = valsize;
3049 	if (sopt->sopt_val != NULL) {
3050 		if (sopt->sopt_td != NULL)
3051 			error = copyout(buf, sopt->sopt_val, valsize);
3052 		else
3053 			bcopy(buf, sopt->sopt_val, valsize);
3054 	}
3055 	return (error);
3056 }
3057 
3058 int
3059 sogetopt(struct socket *so, struct sockopt *sopt)
3060 {
3061 	int	error, optval;
3062 	struct	linger l;
3063 	struct	timeval tv;
3064 #ifdef MAC
3065 	struct mac extmac;
3066 #endif
3067 
3068 	CURVNET_SET(so->so_vnet);
3069 	error = 0;
3070 	if (sopt->sopt_level != SOL_SOCKET) {
3071 		if (so->so_proto->pr_ctloutput != NULL)
3072 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
3073 		else
3074 			error = ENOPROTOOPT;
3075 		CURVNET_RESTORE();
3076 		return (error);
3077 	} else {
3078 		switch (sopt->sopt_name) {
3079 		case SO_ACCEPTFILTER:
3080 			error = accept_filt_getopt(so, sopt);
3081 			break;
3082 
3083 		case SO_LINGER:
3084 			SOCK_LOCK(so);
3085 			l.l_onoff = so->so_options & SO_LINGER;
3086 			l.l_linger = so->so_linger;
3087 			SOCK_UNLOCK(so);
3088 			error = sooptcopyout(sopt, &l, sizeof l);
3089 			break;
3090 
3091 		case SO_USELOOPBACK:
3092 		case SO_DONTROUTE:
3093 		case SO_DEBUG:
3094 		case SO_KEEPALIVE:
3095 		case SO_REUSEADDR:
3096 		case SO_REUSEPORT:
3097 		case SO_REUSEPORT_LB:
3098 		case SO_BROADCAST:
3099 		case SO_OOBINLINE:
3100 		case SO_ACCEPTCONN:
3101 		case SO_TIMESTAMP:
3102 		case SO_BINTIME:
3103 		case SO_NOSIGPIPE:
3104 			optval = so->so_options & sopt->sopt_name;
3105 integer:
3106 			error = sooptcopyout(sopt, &optval, sizeof optval);
3107 			break;
3108 
3109 		case SO_DOMAIN:
3110 			optval = so->so_proto->pr_domain->dom_family;
3111 			goto integer;
3112 
3113 		case SO_TYPE:
3114 			optval = so->so_type;
3115 			goto integer;
3116 
3117 		case SO_PROTOCOL:
3118 			optval = so->so_proto->pr_protocol;
3119 			goto integer;
3120 
3121 		case SO_ERROR:
3122 			SOCK_LOCK(so);
3123 			optval = so->so_error;
3124 			so->so_error = 0;
3125 			SOCK_UNLOCK(so);
3126 			goto integer;
3127 
3128 		case SO_SNDBUF:
3129 			optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
3130 			    so->so_snd.sb_hiwat;
3131 			goto integer;
3132 
3133 		case SO_RCVBUF:
3134 			optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
3135 			    so->so_rcv.sb_hiwat;
3136 			goto integer;
3137 
3138 		case SO_SNDLOWAT:
3139 			optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
3140 			    so->so_snd.sb_lowat;
3141 			goto integer;
3142 
3143 		case SO_RCVLOWAT:
3144 			optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
3145 			    so->so_rcv.sb_lowat;
3146 			goto integer;
3147 
3148 		case SO_SNDTIMEO:
3149 		case SO_RCVTIMEO:
3150 			tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
3151 			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
3152 #ifdef COMPAT_FREEBSD32
3153 			if (SV_CURPROC_FLAG(SV_ILP32)) {
3154 				struct timeval32 tv32;
3155 
3156 				CP(tv, tv32, tv_sec);
3157 				CP(tv, tv32, tv_usec);
3158 				error = sooptcopyout(sopt, &tv32, sizeof tv32);
3159 			} else
3160 #endif
3161 				error = sooptcopyout(sopt, &tv, sizeof tv);
3162 			break;
3163 
3164 		case SO_LABEL:
3165 #ifdef MAC
3166 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
3167 			    sizeof(extmac));
3168 			if (error)
3169 				goto bad;
3170 			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
3171 			    so, &extmac);
3172 			if (error)
3173 				goto bad;
3174 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
3175 #else
3176 			error = EOPNOTSUPP;
3177 #endif
3178 			break;
3179 
3180 		case SO_PEERLABEL:
3181 #ifdef MAC
3182 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
3183 			    sizeof(extmac));
3184 			if (error)
3185 				goto bad;
3186 			error = mac_getsockopt_peerlabel(
3187 			    sopt->sopt_td->td_ucred, so, &extmac);
3188 			if (error)
3189 				goto bad;
3190 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
3191 #else
3192 			error = EOPNOTSUPP;
3193 #endif
3194 			break;
3195 
3196 		case SO_LISTENQLIMIT:
3197 			optval = SOLISTENING(so) ? so->sol_qlimit : 0;
3198 			goto integer;
3199 
3200 		case SO_LISTENQLEN:
3201 			optval = SOLISTENING(so) ? so->sol_qlen : 0;
3202 			goto integer;
3203 
3204 		case SO_LISTENINCQLEN:
3205 			optval = SOLISTENING(so) ? so->sol_incqlen : 0;
3206 			goto integer;
3207 
3208 		case SO_TS_CLOCK:
3209 			optval = so->so_ts_clock;
3210 			goto integer;
3211 
3212 		case SO_MAX_PACING_RATE:
3213 			optval = so->so_max_pacing_rate;
3214 			goto integer;
3215 
3216 		default:
3217 			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
3218 				error = hhook_run_socket(so, sopt,
3219 				    HHOOK_SOCKET_OPT);
3220 			else
3221 				error = ENOPROTOOPT;
3222 			break;
3223 		}
3224 	}
3225 #ifdef MAC
3226 bad:
3227 #endif
3228 	CURVNET_RESTORE();
3229 	return (error);
3230 }
3231 
3232 int
3233 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
3234 {
3235 	struct mbuf *m, *m_prev;
3236 	int sopt_size = sopt->sopt_valsize;
3237 
3238 	MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
3239 	if (m == NULL)
3240 		return ENOBUFS;
3241 	if (sopt_size > MLEN) {
3242 		MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
3243 		if ((m->m_flags & M_EXT) == 0) {
3244 			m_free(m);
3245 			return ENOBUFS;
3246 		}
3247 		m->m_len = min(MCLBYTES, sopt_size);
3248 	} else {
3249 		m->m_len = min(MLEN, sopt_size);
3250 	}
3251 	sopt_size -= m->m_len;
3252 	*mp = m;
3253 	m_prev = m;
3254 
3255 	while (sopt_size) {
3256 		MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
3257 		if (m == NULL) {
3258 			m_freem(*mp);
3259 			return ENOBUFS;
3260 		}
3261 		if (sopt_size > MLEN) {
3262 			MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
3263 			    M_NOWAIT);
3264 			if ((m->m_flags & M_EXT) == 0) {
3265 				m_freem(m);
3266 				m_freem(*mp);
3267 				return ENOBUFS;
3268 			}
3269 			m->m_len = min(MCLBYTES, sopt_size);
3270 		} else {
3271 			m->m_len = min(MLEN, sopt_size);
3272 		}
3273 		sopt_size -= m->m_len;
3274 		m_prev->m_next = m;
3275 		m_prev = m;
3276 	}
3277 	return (0);
3278 }
3279 
3280 int
3281 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
3282 {
3283 	struct mbuf *m0 = m;
3284 
3285 	if (sopt->sopt_val == NULL)
3286 		return (0);
3287 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3288 		if (sopt->sopt_td != NULL) {
3289 			int error;
3290 
3291 			error = copyin(sopt->sopt_val, mtod(m, char *),
3292 			    m->m_len);
3293 			if (error != 0) {
3294 				m_freem(m0);
3295 				return(error);
3296 			}
3297 		} else
3298 			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
3299 		sopt->sopt_valsize -= m->m_len;
3300 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
3301 		m = m->m_next;
3302 	}
3303 	if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
3304 		panic("ip6_sooptmcopyin");
3305 	return (0);
3306 }
3307 
3308 int
3309 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
3310 {
3311 	struct mbuf *m0 = m;
3312 	size_t valsize = 0;
3313 
3314 	if (sopt->sopt_val == NULL)
3315 		return (0);
3316 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3317 		if (sopt->sopt_td != NULL) {
3318 			int error;
3319 
3320 			error = copyout(mtod(m, char *), sopt->sopt_val,
3321 			    m->m_len);
3322 			if (error != 0) {
3323 				m_freem(m0);
3324 				return(error);
3325 			}
3326 		} else
3327 			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
3328 		sopt->sopt_valsize -= m->m_len;
3329 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
3330 		valsize += m->m_len;
3331 		m = m->m_next;
3332 	}
3333 	if (m != NULL) {
3334 		/* enough soopt buffer should be given from user-land */
3335 		m_freem(m0);
3336 		return(EINVAL);
3337 	}
3338 	sopt->sopt_valsize = valsize;
3339 	return (0);
3340 }
3341 
3342 /*
3343  * sohasoutofband(): protocol notifies socket layer of the arrival of new
3344  * out-of-band data, which will then notify socket consumers.
3345  */
3346 void
3347 sohasoutofband(struct socket *so)
3348 {
3349 
3350 	if (so->so_sigio != NULL)
3351 		pgsigio(&so->so_sigio, SIGURG, 0);
3352 	selwakeuppri(&so->so_rdsel, PSOCK);
3353 }
3354 
3355 int
3356 sopoll(struct socket *so, int events, struct ucred *active_cred,
3357     struct thread *td)
3358 {
3359 
3360 	/*
3361 	 * We do not need to set or assert curvnet as long as everyone uses
3362 	 * sopoll_generic().
3363 	 */
3364 	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
3365 	    td));
3366 }
3367 
3368 int
3369 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
3370     struct thread *td)
3371 {
3372 	int revents;
3373 
3374 	SOCK_LOCK(so);
3375 	if (SOLISTENING(so)) {
3376 		if (!(events & (POLLIN | POLLRDNORM)))
3377 			revents = 0;
3378 		else if (!TAILQ_EMPTY(&so->sol_comp))
3379 			revents = events & (POLLIN | POLLRDNORM);
3380 		else if ((events & POLLINIGNEOF) == 0 && so->so_error)
3381 			revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP;
3382 		else {
3383 			selrecord(td, &so->so_rdsel);
3384 			revents = 0;
3385 		}
3386 	} else {
3387 		revents = 0;
3388 		SOCKBUF_LOCK(&so->so_snd);
3389 		SOCKBUF_LOCK(&so->so_rcv);
3390 		if (events & (POLLIN | POLLRDNORM))
3391 			if (soreadabledata(so))
3392 				revents |= events & (POLLIN | POLLRDNORM);
3393 		if (events & (POLLOUT | POLLWRNORM))
3394 			if (sowriteable(so))
3395 				revents |= events & (POLLOUT | POLLWRNORM);
3396 		if (events & (POLLPRI | POLLRDBAND))
3397 			if (so->so_oobmark ||
3398 			    (so->so_rcv.sb_state & SBS_RCVATMARK))
3399 				revents |= events & (POLLPRI | POLLRDBAND);
3400 		if ((events & POLLINIGNEOF) == 0) {
3401 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3402 				revents |= events & (POLLIN | POLLRDNORM);
3403 				if (so->so_snd.sb_state & SBS_CANTSENDMORE)
3404 					revents |= POLLHUP;
3405 			}
3406 		}
3407 		if (revents == 0) {
3408 			if (events &
3409 			    (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
3410 				selrecord(td, &so->so_rdsel);
3411 				so->so_rcv.sb_flags |= SB_SEL;
3412 			}
3413 			if (events & (POLLOUT | POLLWRNORM)) {
3414 				selrecord(td, &so->so_wrsel);
3415 				so->so_snd.sb_flags |= SB_SEL;
3416 			}
3417 		}
3418 		SOCKBUF_UNLOCK(&so->so_rcv);
3419 		SOCKBUF_UNLOCK(&so->so_snd);
3420 	}
3421 	SOCK_UNLOCK(so);
3422 	return (revents);
3423 }
3424 
3425 int
3426 soo_kqfilter(struct file *fp, struct knote *kn)
3427 {
3428 	struct socket *so = kn->kn_fp->f_data;
3429 	struct sockbuf *sb;
3430 	struct knlist *knl;
3431 
3432 	switch (kn->kn_filter) {
3433 	case EVFILT_READ:
3434 		kn->kn_fop = &soread_filtops;
3435 		knl = &so->so_rdsel.si_note;
3436 		sb = &so->so_rcv;
3437 		break;
3438 	case EVFILT_WRITE:
3439 		kn->kn_fop = &sowrite_filtops;
3440 		knl = &so->so_wrsel.si_note;
3441 		sb = &so->so_snd;
3442 		break;
3443 	case EVFILT_EMPTY:
3444 		kn->kn_fop = &soempty_filtops;
3445 		knl = &so->so_wrsel.si_note;
3446 		sb = &so->so_snd;
3447 		break;
3448 	default:
3449 		return (EINVAL);
3450 	}
3451 
3452 	SOCK_LOCK(so);
3453 	if (SOLISTENING(so)) {
3454 		knlist_add(knl, kn, 1);
3455 	} else {
3456 		SOCKBUF_LOCK(sb);
3457 		knlist_add(knl, kn, 1);
3458 		sb->sb_flags |= SB_KNOTE;
3459 		SOCKBUF_UNLOCK(sb);
3460 	}
3461 	SOCK_UNLOCK(so);
3462 	return (0);
3463 }
3464 
3465 /*
3466  * Some routines that return EOPNOTSUPP for entry points that are not
3467  * supported by a protocol.  Fill in as needed.
3468  */
/* Stub accept entry point: always fails with EOPNOTSUPP. */
int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{
	return (EOPNOTSUPP);
}
3475 
/* Stub aio_queue entry point: always fails with EOPNOTSUPP. */
int
pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job)
{
	return (EOPNOTSUPP);
}
3482 
/* Stub attach entry point: always fails with EOPNOTSUPP. */
int
pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
{
	return (EOPNOTSUPP);
}
3489 
/* Stub bind entry point: always fails with EOPNOTSUPP. */
int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	return (EOPNOTSUPP);
}
3496 
/* Stub bindat entry point: always fails with EOPNOTSUPP. */
int
pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td)
{
	return (EOPNOTSUPP);
}
3504 
/* Stub connect entry point: always fails with EOPNOTSUPP. */
int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	return (EOPNOTSUPP);
}
3511 
/* Stub connectat entry point: always fails with EOPNOTSUPP. */
int
pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td)
{
	return (EOPNOTSUPP);
}
3519 
/* Stub connect2 entry point: always fails with EOPNOTSUPP. */
int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{
	return (EOPNOTSUPP);
}
3526 
3527 int
3528 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3529     struct ifnet *ifp, struct thread *td)
3530 {
3531 
3532 	return EOPNOTSUPP;
3533 }
3534 
/* Stub disconnect entry point: always fails with EOPNOTSUPP. */
int
pru_disconnect_notsupp(struct socket *so)
{
	return (EOPNOTSUPP);
}
3541 
/* Stub listen entry point: always fails with EOPNOTSUPP. */
int
pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
{
	return (EOPNOTSUPP);
}
3548 
/* Stub peeraddr entry point: always fails with EOPNOTSUPP. */
int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return (EOPNOTSUPP);
}
3555 
/* Stub rcvd entry point: always fails with EOPNOTSUPP. */
int
pru_rcvd_notsupp(struct socket *so, int flags)
{
	return (EOPNOTSUPP);
}
3562 
/* Stub rcvoob entry point: always fails with EOPNOTSUPP. */
int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{
	return (EOPNOTSUPP);
}
3569 
/* Stub send entry point: always fails with EOPNOTSUPP. */
int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *addr, struct mbuf *control, struct thread *td)
{
	return (EOPNOTSUPP);
}
3577 
/* Stub ready entry point: always fails with EOPNOTSUPP. */
int
pru_ready_notsupp(struct socket *so, struct mbuf *m, int count)
{
	return (EOPNOTSUPP);
}
3584 
3585 /*
3586  * This isn't really a ``null'' operation, but it's the default one and
3587  * doesn't do anything destructive.
3588  */
3589 int
3590 pru_sense_null(struct socket *so, struct stat *sb)
3591 {
3592 
3593 	sb->st_blksize = so->so_snd.sb_hiwat;
3594 	return 0;
3595 }
3596 
/* Stub shutdown entry point: always fails with EOPNOTSUPP. */
int
pru_shutdown_notsupp(struct socket *so)
{
	return (EOPNOTSUPP);
}
3603 
/* Stub sockaddr entry point: always fails with EOPNOTSUPP. */
int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return (EOPNOTSUPP);
}
3610 
/* Stub sosend entry point: always fails with EOPNOTSUPP. */
int
pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	return (EOPNOTSUPP);
}
3618 
/* Stub soreceive entry point: always fails with EOPNOTSUPP. */
int
pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	return (EOPNOTSUPP);
}
3626 
/* Stub sopoll entry point: always fails with EOPNOTSUPP. */
int
pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
    struct thread *td)
{
	return (EOPNOTSUPP);
}
3634 
3635 static void
3636 filt_sordetach(struct knote *kn)
3637 {
3638 	struct socket *so = kn->kn_fp->f_data;
3639 
3640 	so_rdknl_lock(so);
3641 	knlist_remove(&so->so_rdsel.si_note, kn, 1);
3642 	if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note))
3643 		so->so_rcv.sb_flags &= ~SB_KNOTE;
3644 	so_rdknl_unlock(so);
3645 }
3646 
3647 /*ARGSUSED*/
3648 static int
3649 filt_soread(struct knote *kn, long hint)
3650 {
3651 	struct socket *so;
3652 
3653 	so = kn->kn_fp->f_data;
3654 
3655 	if (SOLISTENING(so)) {
3656 		SOCK_LOCK_ASSERT(so);
3657 		kn->kn_data = so->sol_qlen;
3658 		if (so->so_error) {
3659 			kn->kn_flags |= EV_EOF;
3660 			kn->kn_fflags = so->so_error;
3661 			return (1);
3662 		}
3663 		return (!TAILQ_EMPTY(&so->sol_comp));
3664 	}
3665 
3666 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3667 
3668 	kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
3669 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3670 		kn->kn_flags |= EV_EOF;
3671 		kn->kn_fflags = so->so_error;
3672 		return (1);
3673 	} else if (so->so_error)	/* temporary udp error */
3674 		return (1);
3675 
3676 	if (kn->kn_sfflags & NOTE_LOWAT) {
3677 		if (kn->kn_data >= kn->kn_sdata)
3678 			return (1);
3679 	} else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat)
3680 		return (1);
3681 
3682 	/* This hook returning non-zero indicates an event, not error */
3683 	return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD));
3684 }
3685 
3686 static void
3687 filt_sowdetach(struct knote *kn)
3688 {
3689 	struct socket *so = kn->kn_fp->f_data;
3690 
3691 	so_wrknl_lock(so);
3692 	knlist_remove(&so->so_wrsel.si_note, kn, 1);
3693 	if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note))
3694 		so->so_snd.sb_flags &= ~SB_KNOTE;
3695 	so_wrknl_unlock(so);
3696 }
3697 
3698 /*ARGSUSED*/
3699 static int
3700 filt_sowrite(struct knote *kn, long hint)
3701 {
3702 	struct socket *so;
3703 
3704 	so = kn->kn_fp->f_data;
3705 
3706 	if (SOLISTENING(so))
3707 		return (0);
3708 
3709 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
3710 	kn->kn_data = sbspace(&so->so_snd);
3711 
3712 	hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE);
3713 
3714 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3715 		kn->kn_flags |= EV_EOF;
3716 		kn->kn_fflags = so->so_error;
3717 		return (1);
3718 	} else if (so->so_error)	/* temporary udp error */
3719 		return (1);
3720 	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3721 	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
3722 		return (0);
3723 	else if (kn->kn_sfflags & NOTE_LOWAT)
3724 		return (kn->kn_data >= kn->kn_sdata);
3725 	else
3726 		return (kn->kn_data >= so->so_snd.sb_lowat);
3727 }
3728 
3729 static int
3730 filt_soempty(struct knote *kn, long hint)
3731 {
3732 	struct socket *so;
3733 
3734 	so = kn->kn_fp->f_data;
3735 
3736 	if (SOLISTENING(so))
3737 		return (1);
3738 
3739 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
3740 	kn->kn_data = sbused(&so->so_snd);
3741 
3742 	if (kn->kn_data == 0)
3743 		return (1);
3744 	else
3745 		return (0);
3746 }
3747 
3748 int
3749 socheckuid(struct socket *so, uid_t uid)
3750 {
3751 
3752 	if (so == NULL)
3753 		return (EPERM);
3754 	if (so->so_cred->cr_uid != uid)
3755 		return (EPERM);
3756 	return (0);
3757 }
3758 
3759 /*
3760  * These functions are used by protocols to notify the socket layer (and its
3761  * consumers) of state changes in the sockets driven by protocol-side events.
3762  */
3763 
3764 /*
3765  * Procedures to manipulate state flags of socket and do appropriate wakeups.
3766  *
3767  * Normal sequence from the active (originating) side is that
3768  * soisconnecting() is called during processing of connect() call, resulting
3769  * in an eventual call to soisconnected() if/when the connection is
3770  * established.  When the connection is torn down soisdisconnecting() is
3771  * called during processing of disconnect() call, and soisdisconnected() is
3772  * called when the connection to the peer is totally severed.  The semantics
3773  * of these routines are such that connectionless protocols can call
3774  * soisconnected() and soisdisconnected() only, bypassing the in-progress
3775  * calls when setting up a ``connection'' takes no time.
3776  *
 * From the passive side, a listening socket is created with two queues of
 * sockets: sol_incomp for connections in progress and sol_comp for
 * connections already made and awaiting user acceptance.  As a protocol is
 * preparing incoming connections, it creates a socket structure queued on
 * sol_incomp by calling sonewconn().  When the connection is established,
 * soisconnected() is called, and transfers the socket structure to sol_comp,
 * making it available to accept().
 *
 * If a socket is closed with sockets on either sol_incomp or sol_comp, these
 * sockets are dropped.
3788  * If higher-level protocols are implemented in the kernel, the wakeups done
3789  * here will sometimes cause software-interrupt process scheduling.
3790  */
3791 void
3792 soisconnecting(struct socket *so)
3793 {
3794 
3795 	SOCK_LOCK(so);
3796 	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3797 	so->so_state |= SS_ISCONNECTING;
3798 	SOCK_UNLOCK(so);
3799 }
3800 
3801 void
3802 soisconnected(struct socket *so)
3803 {
3804 
3805 	SOCK_LOCK(so);
3806 	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3807 	so->so_state |= SS_ISCONNECTED;
3808 
3809 	if (so->so_qstate == SQ_INCOMP) {
3810 		struct socket *head = so->so_listen;
3811 		int ret;
3812 
3813 		KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
3814 		/*
3815 		 * Promoting a socket from incomplete queue to complete, we
3816 		 * need to go through reverse order of locking.  We first do
3817 		 * trylock, and if that doesn't succeed, we go the hard way
3818 		 * leaving a reference and rechecking consistency after proper
3819 		 * locking.
3820 		 */
3821 		if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
3822 			soref(head);
3823 			SOCK_UNLOCK(so);
3824 			SOLISTEN_LOCK(head);
3825 			SOCK_LOCK(so);
3826 			if (__predict_false(head != so->so_listen)) {
3827 				/*
3828 				 * The socket went off the listen queue,
3829 				 * should be lost race to close(2) of sol.
3830 				 * The socket is about to soabort().
3831 				 */
3832 				SOCK_UNLOCK(so);
3833 				sorele(head);
3834 				return;
3835 			}
3836 			/* Not the last one, as so holds a ref. */
3837 			refcount_release(&head->so_count);
3838 		}
3839 again:
3840 		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3841 			TAILQ_REMOVE(&head->sol_incomp, so, so_list);
3842 			head->sol_incqlen--;
3843 			TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
3844 			head->sol_qlen++;
3845 			so->so_qstate = SQ_COMP;
3846 			SOCK_UNLOCK(so);
3847 			solisten_wakeup(head);	/* unlocks */
3848 		} else {
3849 			SOCKBUF_LOCK(&so->so_rcv);
3850 			soupcall_set(so, SO_RCV,
3851 			    head->sol_accept_filter->accf_callback,
3852 			    head->sol_accept_filter_arg);
3853 			so->so_options &= ~SO_ACCEPTFILTER;
3854 			ret = head->sol_accept_filter->accf_callback(so,
3855 			    head->sol_accept_filter_arg, M_NOWAIT);
3856 			if (ret == SU_ISCONNECTED) {
3857 				soupcall_clear(so, SO_RCV);
3858 				SOCKBUF_UNLOCK(&so->so_rcv);
3859 				goto again;
3860 			}
3861 			SOCKBUF_UNLOCK(&so->so_rcv);
3862 			SOCK_UNLOCK(so);
3863 			SOLISTEN_UNLOCK(head);
3864 		}
3865 		return;
3866 	}
3867 	SOCK_UNLOCK(so);
3868 	wakeup(&so->so_timeo);
3869 	sorwakeup(so);
3870 	sowwakeup(so);
3871 }
3872 
3873 void
3874 soisdisconnecting(struct socket *so)
3875 {
3876 
3877 	SOCK_LOCK(so);
3878 	so->so_state &= ~SS_ISCONNECTING;
3879 	so->so_state |= SS_ISDISCONNECTING;
3880 
3881 	if (!SOLISTENING(so)) {
3882 		SOCKBUF_LOCK(&so->so_rcv);
3883 		socantrcvmore_locked(so);
3884 		SOCKBUF_LOCK(&so->so_snd);
3885 		socantsendmore_locked(so);
3886 	}
3887 	SOCK_UNLOCK(so);
3888 	wakeup(&so->so_timeo);
3889 }
3890 
3891 void
3892 soisdisconnected(struct socket *so)
3893 {
3894 
3895 	SOCK_LOCK(so);
3896 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3897 	so->so_state |= SS_ISDISCONNECTED;
3898 
3899 	if (!SOLISTENING(so)) {
3900 		SOCK_UNLOCK(so);
3901 		SOCKBUF_LOCK(&so->so_rcv);
3902 		socantrcvmore_locked(so);
3903 		SOCKBUF_LOCK(&so->so_snd);
3904 		sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
3905 		socantsendmore_locked(so);
3906 	} else
3907 		SOCK_UNLOCK(so);
3908 	wakeup(&so->so_timeo);
3909 }
3910 
3911 /*
3912  * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3913  */
3914 struct sockaddr *
3915 sodupsockaddr(const struct sockaddr *sa, int mflags)
3916 {
3917 	struct sockaddr *sa2;
3918 
3919 	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3920 	if (sa2)
3921 		bcopy(sa, sa2, sa->sa_len);
3922 	return sa2;
3923 }
3924 
3925 /*
3926  * Register per-socket destructor.
3927  */
3928 void
3929 sodtor_set(struct socket *so, so_dtor_t *func)
3930 {
3931 
3932 	SOCK_LOCK_ASSERT(so);
3933 	so->so_dtor = func;
3934 }
3935 
3936 /*
3937  * Register per-socket buffer upcalls.
3938  */
3939 void
3940 soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg)
3941 {
3942 	struct sockbuf *sb;
3943 
3944 	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
3945 
3946 	switch (which) {
3947 	case SO_RCV:
3948 		sb = &so->so_rcv;
3949 		break;
3950 	case SO_SND:
3951 		sb = &so->so_snd;
3952 		break;
3953 	default:
3954 		panic("soupcall_set: bad which");
3955 	}
3956 	SOCKBUF_LOCK_ASSERT(sb);
3957 	sb->sb_upcall = func;
3958 	sb->sb_upcallarg = arg;
3959 	sb->sb_flags |= SB_UPCALL;
3960 }
3961 
3962 void
3963 soupcall_clear(struct socket *so, int which)
3964 {
3965 	struct sockbuf *sb;
3966 
3967 	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
3968 
3969 	switch (which) {
3970 	case SO_RCV:
3971 		sb = &so->so_rcv;
3972 		break;
3973 	case SO_SND:
3974 		sb = &so->so_snd;
3975 		break;
3976 	default:
3977 		panic("soupcall_clear: bad which");
3978 	}
3979 	SOCKBUF_LOCK_ASSERT(sb);
3980 	KASSERT(sb->sb_upcall != NULL,
3981 	    ("%s: so %p no upcall to clear", __func__, so));
3982 	sb->sb_upcall = NULL;
3983 	sb->sb_upcallarg = NULL;
3984 	sb->sb_flags &= ~SB_UPCALL;
3985 }
3986 
3987 void
3988 solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
3989 {
3990 
3991 	SOLISTEN_LOCK_ASSERT(so);
3992 	so->sol_upcall = func;
3993 	so->sol_upcallarg = arg;
3994 }
3995 
3996 static void
3997 so_rdknl_lock(void *arg)
3998 {
3999 	struct socket *so = arg;
4000 
4001 	if (SOLISTENING(so))
4002 		SOCK_LOCK(so);
4003 	else
4004 		SOCKBUF_LOCK(&so->so_rcv);
4005 }
4006 
4007 static void
4008 so_rdknl_unlock(void *arg)
4009 {
4010 	struct socket *so = arg;
4011 
4012 	if (SOLISTENING(so))
4013 		SOCK_UNLOCK(so);
4014 	else
4015 		SOCKBUF_UNLOCK(&so->so_rcv);
4016 }
4017 
4018 static void
4019 so_rdknl_assert_locked(void *arg)
4020 {
4021 	struct socket *so = arg;
4022 
4023 	if (SOLISTENING(so))
4024 		SOCK_LOCK_ASSERT(so);
4025 	else
4026 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
4027 }
4028 
4029 static void
4030 so_rdknl_assert_unlocked(void *arg)
4031 {
4032 	struct socket *so = arg;
4033 
4034 	if (SOLISTENING(so))
4035 		SOCK_UNLOCK_ASSERT(so);
4036 	else
4037 		SOCKBUF_UNLOCK_ASSERT(&so->so_rcv);
4038 }
4039 
4040 static void
4041 so_wrknl_lock(void *arg)
4042 {
4043 	struct socket *so = arg;
4044 
4045 	if (SOLISTENING(so))
4046 		SOCK_LOCK(so);
4047 	else
4048 		SOCKBUF_LOCK(&so->so_snd);
4049 }
4050 
4051 static void
4052 so_wrknl_unlock(void *arg)
4053 {
4054 	struct socket *so = arg;
4055 
4056 	if (SOLISTENING(so))
4057 		SOCK_UNLOCK(so);
4058 	else
4059 		SOCKBUF_UNLOCK(&so->so_snd);
4060 }
4061 
4062 static void
4063 so_wrknl_assert_locked(void *arg)
4064 {
4065 	struct socket *so = arg;
4066 
4067 	if (SOLISTENING(so))
4068 		SOCK_LOCK_ASSERT(so);
4069 	else
4070 		SOCKBUF_LOCK_ASSERT(&so->so_snd);
4071 }
4072 
4073 static void
4074 so_wrknl_assert_unlocked(void *arg)
4075 {
4076 	struct socket *so = arg;
4077 
4078 	if (SOLISTENING(so))
4079 		SOCK_UNLOCK_ASSERT(so);
4080 	else
4081 		SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
4082 }
4083 
4084 /*
4085  * Create an external-format (``xsocket'') structure using the information in
4086  * the kernel-format socket structure pointed to by so.  This is done to
4087  * reduce the spew of irrelevant information over this interface, to isolate
4088  * user code from changes in the kernel structure, and potentially to provide
4089  * information-hiding if we decide that some of this information should be
4090  * hidden from users.
4091  */
4092 void
4093 sotoxsocket(struct socket *so, struct xsocket *xso)
4094 {
4095 
4096 	bzero(xso, sizeof(*xso));
4097 	xso->xso_len = sizeof *xso;
4098 	xso->xso_so = (uintptr_t)so;
4099 	xso->so_type = so->so_type;
4100 	xso->so_options = so->so_options;
4101 	xso->so_linger = so->so_linger;
4102 	xso->so_state = so->so_state;
4103 	xso->so_pcb = (uintptr_t)so->so_pcb;
4104 	xso->xso_protocol = so->so_proto->pr_protocol;
4105 	xso->xso_family = so->so_proto->pr_domain->dom_family;
4106 	xso->so_timeo = so->so_timeo;
4107 	xso->so_error = so->so_error;
4108 	xso->so_uid = so->so_cred->cr_uid;
4109 	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
4110 	if (SOLISTENING(so)) {
4111 		xso->so_qlen = so->sol_qlen;
4112 		xso->so_incqlen = so->sol_incqlen;
4113 		xso->so_qlimit = so->sol_qlimit;
4114 		xso->so_oobmark = 0;
4115 	} else {
4116 		xso->so_state |= so->so_qstate;
4117 		xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
4118 		xso->so_oobmark = so->so_oobmark;
4119 		sbtoxsockbuf(&so->so_snd, &xso->so_snd);
4120 		sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
4121 	}
4122 }
4123 
4124 struct sockbuf *
4125 so_sockbuf_rcv(struct socket *so)
4126 {
4127 
4128 	return (&so->so_rcv);
4129 }
4130 
4131 struct sockbuf *
4132 so_sockbuf_snd(struct socket *so)
4133 {
4134 
4135 	return (&so->so_snd);
4136 }
4137 
4138 int
4139 so_state_get(const struct socket *so)
4140 {
4141 
4142 	return (so->so_state);
4143 }
4144 
4145 void
4146 so_state_set(struct socket *so, int val)
4147 {
4148 
4149 	so->so_state = val;
4150 }
4151 
4152 int
4153 so_options_get(const struct socket *so)
4154 {
4155 
4156 	return (so->so_options);
4157 }
4158 
4159 void
4160 so_options_set(struct socket *so, int val)
4161 {
4162 
4163 	so->so_options = val;
4164 }
4165 
4166 int
4167 so_error_get(const struct socket *so)
4168 {
4169 
4170 	return (so->so_error);
4171 }
4172 
4173 void
4174 so_error_set(struct socket *so, int val)
4175 {
4176 
4177 	so->so_error = val;
4178 }
4179 
4180 int
4181 so_linger_get(const struct socket *so)
4182 {
4183 
4184 	return (so->so_linger);
4185 }
4186 
4187 void
4188 so_linger_set(struct socket *so, int val)
4189 {
4190 
4191 	KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz),
4192 	    ("%s: val %d out of range", __func__, val));
4193 
4194 	so->so_linger = val;
4195 }
4196 
4197 struct protosw *
4198 so_protosw_get(const struct socket *so)
4199 {
4200 
4201 	return (so->so_proto);
4202 }
4203 
4204 void
4205 so_protosw_set(struct socket *so, struct protosw *val)
4206 {
4207 
4208 	so->so_proto = val;
4209 }
4210 
/* Function wrapper around sorwakeup() for socket so. */
void
so_sorwakeup(struct socket *so)
{
	sorwakeup(so);
}
4217 
/* Function wrapper around sowwakeup() for socket so. */
void
so_sowwakeup(struct socket *so)
{
	sowwakeup(so);
}
4224 
/* Function wrapper around sorwakeup_locked() for socket so. */
void
so_sorwakeup_locked(struct socket *so)
{
	sorwakeup_locked(so);
}
4231 
/* Function wrapper around sowwakeup_locked() for socket so. */
void
so_sowwakeup_locked(struct socket *so)
{
	sowwakeup_locked(so);
}
4238 
/* Function wrapper around SOCK_LOCK(): acquire the socket lock of so. */
void
so_lock(struct socket *so)
{
	SOCK_LOCK(so);
}
4245 
/* Function wrapper around SOCK_UNLOCK(): release the socket lock of so. */
void
so_unlock(struct socket *so)
{
	SOCK_UNLOCK(so);
}
4252