xref: /freebsd/sys/kern/uipc_socket.c (revision 830940567b49bb0c08dfaed40418999e76616909)
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *	The Regents of the University of California.
4  * Copyright (c) 2004 The FreeBSD Foundation
5  * Copyright (c) 2004-2008 Robert N. M. Watson
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
33  */
34 
35 /*
36  * Comments on the socket life cycle:
37  *
38  * soalloc() sets up socket layer state for a socket, called only by
39  * socreate() and sonewconn().  Socket layer private.
40  *
41  * sodealloc() tears down socket layer state for a socket; called only by
42  * sofree(), socreate(), and sonewconn().  Socket layer private.
43  *
44  * pru_attach() associates protocol layer state with an allocated socket;
45  * called only once, may fail, aborting socket allocation.  This is called
46  * from socreate() and sonewconn().  Socket layer private.
47  *
48  * pru_detach() disassociates protocol layer state from an attached socket,
49  * and will be called exactly once for sockets in which pru_attach() has
50  * been successfully called.  If pru_attach() returned an error,
51  * pru_detach() will not be called.  Socket layer private.
52  *
53  * pru_abort() and pru_close() notify the protocol layer that the last
54  * consumer of a socket is starting to tear down the socket, and that the
55  * protocol should terminate the connection.  Historically, pru_abort() also
56  * detached protocol state from the socket state, but this is no longer the
57  * case.
58  *
59  * socreate() creates a socket and attaches protocol state.  This is a public
60  * interface that may be used by socket layer consumers to create new
61  * sockets.
62  *
63  * sonewconn() creates a socket and attaches protocol state.  This is a
64  * public interface that may be used by protocols to create new sockets when
65  * a new connection is received; the new socket will then be available for
66  * accept() on the listen socket.
67  *
68  * soclose() destroys a socket after possibly waiting for it to disconnect.
69  * This is a public interface that socket consumers should use to close and
70  * release a socket when done with it.
71  *
72  * soabort() destroys a socket without waiting for it to disconnect (used
73  * only for incoming connections that are already partially or fully
74  * connected).  This is used internally by the socket layer when clearing
75  * listen socket queues (due to overflow or close on the listen socket), but
76  * is also a public interface protocols may use to abort connections in
77  * their incomplete listen queues should they no longer be required.  Sockets
78  * placed in completed connection listen queues should not be aborted for
79  * reasons described in the comment above the soclose() implementation.  This
80  * is not a general purpose close routine, and except in the specific
81  * circumstances described here, should not be used.
82  *
83  * sofree() will free a socket and its protocol state if all references on
84  * the socket have been released, and is the interface used to attempt to
85  * free a socket when a reference is removed.  This is a socket layer
86  * private interface.
87  *
88  * NOTE: In addition to socreate() and soclose(), which provide a single
89  * socket reference to the consumer to be managed as required, there are two
90  * calls to explicitly manage socket references, soref() and sorele().
91  * Currently, these are generally required only when transitioning a socket
92  * from a listen queue to a file descriptor, in order to prevent garbage
93  * collection of the socket at an untimely moment.  For a number of reasons,
94  * these interfaces are not preferred, and should be avoided.
95  */
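/*
 * As a quick orientation, the life cycle above reduces to two creation
 * paths and one destruction path (a sketch of this file's own call
 * relationships, summarizing the comments above):
 *
 *	socreate()  -> soalloc() -> pru_attach()	(consumer-created)
 *	sonewconn() -> soalloc() -> pru_attach()	(protocol-created)
 *	soclose() -> sorele() -> sofree() -> pru_detach() -> sodealloc()
 *	soabort() -----------> sofree() -> ...		(abort path)
 */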
96 
97 #include <sys/cdefs.h>
98 __FBSDID("$FreeBSD$");
99 
100 #include "opt_inet.h"
101 #include "opt_inet6.h"
102 #include "opt_zero.h"
103 #include "opt_compat.h"
104 
105 #include <sys/param.h>
106 #include <sys/systm.h>
107 #include <sys/fcntl.h>
108 #include <sys/limits.h>
109 #include <sys/lock.h>
110 #include <sys/mac.h>
111 #include <sys/malloc.h>
112 #include <sys/mbuf.h>
113 #include <sys/mutex.h>
114 #include <sys/domain.h>
115 #include <sys/file.h>			/* for struct knote */
116 #include <sys/kernel.h>
117 #include <sys/event.h>
118 #include <sys/eventhandler.h>
119 #include <sys/poll.h>
120 #include <sys/proc.h>
121 #include <sys/protosw.h>
122 #include <sys/socket.h>
123 #include <sys/socketvar.h>
124 #include <sys/resourcevar.h>
125 #include <net/route.h>
126 #include <sys/signalvar.h>
127 #include <sys/stat.h>
128 #include <sys/sx.h>
129 #include <sys/sysctl.h>
130 #include <sys/uio.h>
131 #include <sys/jail.h>
132 
133 #include <net/vnet.h>
134 
135 #include <security/mac/mac_framework.h>
136 
137 #include <vm/uma.h>
138 
139 #ifdef COMPAT_IA32
140 #include <sys/mount.h>
141 #include <sys/sysent.h>
142 #include <compat/freebsd32/freebsd32.h>
143 #endif
144 
145 static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
146 		    int flags);
147 
148 static void	filt_sordetach(struct knote *kn);
149 static int	filt_soread(struct knote *kn, long hint);
150 static void	filt_sowdetach(struct knote *kn);
151 static int	filt_sowrite(struct knote *kn, long hint);
152 static int	filt_solisten(struct knote *kn, long hint);
153 
154 static struct filterops solisten_filtops =
155 	{ 1, NULL, filt_sordetach, filt_solisten };
156 static struct filterops soread_filtops =
157 	{ 1, NULL, filt_sordetach, filt_soread };
158 static struct filterops sowrite_filtops =
159 	{ 1, NULL, filt_sowdetach, filt_sowrite };
160 
161 uma_zone_t socket_zone;
162 so_gen_t	so_gencnt;	/* generation count for sockets */
163 
164 int	maxsockets;
165 
166 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
167 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
168 
169 static int somaxconn = SOMAXCONN;
170 static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
171 /* XXX: we don't have SYSCTL_USHORT */
172 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
173     0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
174     "queue size");
175 static int numopensockets;
176 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
177     &numopensockets, 0, "Number of open sockets");
178 #ifdef ZERO_COPY_SOCKETS
179 /* These aren't static because they're used in other files. */
180 int so_zero_copy_send = 1;
181 int so_zero_copy_receive = 1;
182 SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
183     "Zero copy controls");
184 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
185     &so_zero_copy_receive, 0, "Enable zero copy receive");
186 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
187     &so_zero_copy_send, 0, "Enable zero copy send");
188 #endif /* ZERO_COPY_SOCKETS */
189 
190 /*
191  * accept_mtx locks down per-socket fields relating to accept queues.  See
192  * socketvar.h for an annotation of the protected fields of struct socket.
193  */
194 struct mtx accept_mtx;
195 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
196 
197 /*
198  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
199  * so_gencnt field.
200  */
201 static struct mtx so_global_mtx;
202 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
203 
204 /*
205  * General IPC sysctl name space, used by sockets and a variety of other IPC
206  * types.
207  */
208 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
209 
210 /*
211  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
212  * of the change so that they can update their dependent limits as required.
213  */
214 static int
215 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
216 {
217 	int error, newmaxsockets;
218 
219 	newmaxsockets = maxsockets;
220 	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
221 	if (error == 0 && req->newptr) {
222 		if (newmaxsockets > maxsockets) {
223 			maxsockets = newmaxsockets;
224 			if (maxsockets > ((maxfiles / 4) * 3)) {
225 				maxfiles = (maxsockets * 5) / 4;
226 				maxfilesperproc = (maxfiles * 9) / 10;
227 			}
228 			EVENTHANDLER_INVOKE(maxsockets_change);
229 		} else
230 			error = EINVAL;
231 	}
232 	return (error);
233 }
234 
235 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
236     &maxsockets, 0, sysctl_maxsockets, "IU",
237     "Maximum number of sockets avaliable");
238 
239 /*
240  * Initialise maxsockets.  This SYSINIT must be run after
241  * tunable_mbinit().
242  */
243 static void
244 init_maxsockets(void *ignored)
245 {
246 
247 	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
248 	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
249 }
250 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
251 
252 /*
253  * Socket operation routines.  These routines are called by the routines in
254  * sys_socket.c or from a system process, and implement the semantics of
255  * socket operations by switching out to the protocol specific routines.
256  */
257 
258 /*
259  * Get a socket structure from our zone, and initialize it.  Note that it
260  * would probably be better to allocate socket and PCB at the same time, but
261  * I'm not convinced that all the protocols can be easily modified to do
262  * this.
263  *
264  * soalloc() returns a socket with a ref count of 0.
265  */
266 static struct socket *
267 soalloc(struct vnet *vnet)
268 {
269 	struct socket *so;
270 
271 	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
272 	if (so == NULL)
273 		return (NULL);
274 #ifdef MAC
275 	if (mac_socket_init(so, M_NOWAIT) != 0) {
276 		uma_zfree(socket_zone, so);
277 		return (NULL);
278 	}
279 #endif
280 	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
281 	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
282 	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
283 	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
284 	TAILQ_INIT(&so->so_aiojobq);
285 	mtx_lock(&so_global_mtx);
286 	so->so_gencnt = ++so_gencnt;
287 	++numopensockets;
288 #ifdef VIMAGE
289 	vnet->vnet_sockcnt++;
290 	so->so_vnet = vnet;
291 #endif
292 	mtx_unlock(&so_global_mtx);
293 	return (so);
294 }
295 
296 /*
297  * Free the storage associated with a socket at the socket layer, tear down
298  * locks, labels, etc.  All protocol state is assumed already to have been
299  * torn down (and possibly never set up) by the caller.
300  */
301 static void
302 sodealloc(struct socket *so)
303 {
304 
305 	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
306 	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
307 
308 	mtx_lock(&so_global_mtx);
309 	so->so_gencnt = ++so_gencnt;
310 	--numopensockets;	/* Could be below, but faster here. */
311 #ifdef VIMAGE
312 	so->so_vnet->vnet_sockcnt--;
313 #endif
314 	mtx_unlock(&so_global_mtx);
315 	if (so->so_rcv.sb_hiwat)
316 		(void)chgsbsize(so->so_cred->cr_uidinfo,
317 		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
318 	if (so->so_snd.sb_hiwat)
319 		(void)chgsbsize(so->so_cred->cr_uidinfo,
320 		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
321 #ifdef INET
322 	/* Remove accept filter if one is present. */
323 	if (so->so_accf != NULL)
324 		do_setopt_accept_filter(so, NULL);
325 #endif
326 #ifdef MAC
327 	mac_socket_destroy(so);
328 #endif
329 	crfree(so->so_cred);
330 	sx_destroy(&so->so_snd.sb_sx);
331 	sx_destroy(&so->so_rcv.sb_sx);
332 	SOCKBUF_LOCK_DESTROY(&so->so_snd);
333 	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
334 	uma_zfree(socket_zone, so);
335 }
336 
337 /*
338  * socreate returns a socket with a ref count of 1.  The socket should be
339  * closed with soclose().
340  */
341 int
342 socreate(int dom, struct socket **aso, int type, int proto,
343     struct ucred *cred, struct thread *td)
344 {
345 	struct protosw *prp;
346 	struct socket *so;
347 	int error;
348 
349 	if (proto)
350 		prp = pffindproto(dom, proto, type);
351 	else
352 		prp = pffindtype(dom, type);
353 
354 	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
355 	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
356 		return (EPROTONOSUPPORT);
357 
358 	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
359 		return (EPROTONOSUPPORT);
360 
361 	if (prp->pr_type != type)
362 		return (EPROTOTYPE);
363 	so = soalloc(CRED_TO_VNET(cred));
364 	if (so == NULL)
365 		return (ENOBUFS);
366 
367 	TAILQ_INIT(&so->so_incomp);
368 	TAILQ_INIT(&so->so_comp);
369 	so->so_type = type;
370 	so->so_cred = crhold(cred);
371 	if ((prp->pr_domain->dom_family == PF_INET) ||
372 	    (prp->pr_domain->dom_family == PF_ROUTE))
373 		so->so_fibnum = td->td_proc->p_fibnum;
374 	else
375 		so->so_fibnum = 0;
376 	so->so_proto = prp;
377 #ifdef MAC
378 	mac_socket_create(cred, so);
379 #endif
380 	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
381 	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
382 	so->so_count = 1;
383 	/*
384 	 * Auto-sizing of socket buffers is managed by the protocols and
385 	 * the appropriate flags must be set in the pru_attach function.
386 	 */
387 	CURVNET_SET(so->so_vnet);
388 	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
389 	CURVNET_RESTORE();
390 	if (error) {
391 		KASSERT(so->so_count == 1, ("socreate: so_count %d",
392 		    so->so_count));
393 		so->so_count = 0;
394 		sodealloc(so);
395 		return (error);
396 	}
397 	*aso = so;
398 	return (0);
399 }
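/*
 * Example: a minimal in-kernel consumer creating a UDP socket (a sketch;
 * the domain/type/protocol triple is illustrative and error handling is
 * abbreviated):
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
 *	    td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	... use so via sobind()/soconnect()/sosend()/soreceive() ...
 *	soclose(so);
 */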
400 
401 #ifdef REGRESSION
402 static int regression_sonewconn_earlytest = 1;
403 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
404     &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
405 #endif
406 
407 /*
408  * When an attempt at a new connection is noted on a socket which accepts
409  * connections, sonewconn is called.  If the connection is possible (subject
410  * to space constraints, etc.) then we allocate a new structure, properly
411  * linked into the data structure of the original socket, and return it.
412  * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
413  *
414  * Note: the ref count on the socket is 0 on return.
415  */
416 struct socket *
417 sonewconn(struct socket *head, int connstatus)
418 {
419 	struct socket *so;
420 	int over;
421 
422 	ACCEPT_LOCK();
423 	over = (head->so_qlen > 3 * head->so_qlimit / 2);
424 	ACCEPT_UNLOCK();
425 #ifdef REGRESSION
426 	if (regression_sonewconn_earlytest && over)
427 #else
428 	if (over)
429 #endif
430 		return (NULL);
431 	VNET_ASSERT(head->so_vnet);
432 	so = soalloc(head->so_vnet);
433 	if (so == NULL)
434 		return (NULL);
435 	if ((head->so_options & SO_ACCEPTFILTER) != 0)
436 		connstatus = 0;
437 	so->so_head = head;
438 	so->so_type = head->so_type;
439 	so->so_options = head->so_options &~ SO_ACCEPTCONN;
440 	so->so_linger = head->so_linger;
441 	so->so_state = head->so_state | SS_NOFDREF;
442 	so->so_fibnum = head->so_fibnum;
443 	so->so_proto = head->so_proto;
444 	so->so_cred = crhold(head->so_cred);
445 #ifdef MAC
446 	mac_socket_newconn(head, so);
447 #endif
448 	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
449 	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
450 	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
451 	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
452 		sodealloc(so);
453 		return (NULL);
454 	}
455 	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
456 	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
457 	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
458 	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
459 	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
460 	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
461 	so->so_state |= connstatus;
462 	ACCEPT_LOCK();
463 	if (connstatus) {
464 		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
465 		so->so_qstate |= SQ_COMP;
466 		head->so_qlen++;
467 	} else {
468 		/*
469 		 * Keep removing sockets from the head until there's room for
470 		 * us to insert on the tail.  In pre-locking revisions, this
471 		 * was a simple if(), but as we could be racing with other
472 		 * threads and soabort() requires dropping locks, we must
473 		 * loop waiting for the condition to be true.
474 		 */
475 		while (head->so_incqlen > head->so_qlimit) {
476 			struct socket *sp;
477 			sp = TAILQ_FIRST(&head->so_incomp);
478 			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
479 			head->so_incqlen--;
480 			sp->so_qstate &= ~SQ_INCOMP;
481 			sp->so_head = NULL;
482 			ACCEPT_UNLOCK();
483 			soabort(sp);
484 			ACCEPT_LOCK();
485 		}
486 		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
487 		so->so_qstate |= SQ_INCOMP;
488 		head->so_incqlen++;
489 	}
490 	ACCEPT_UNLOCK();
491 	if (connstatus) {
492 		sorwakeup(head);
493 		wakeup_one(&head->so_timeo);
494 	}
495 	return (so);
496 }
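/*
 * Example of protocol-side usage (a sketch loosely modeled on how TCP's
 * syncache completes a handshake; details vary by protocol):
 *
 *	so = sonewconn(head, SS_ISCONNECTED);
 *	if (so == NULL)
 *		... drop the connection: the listen queue was full or
 *		    allocation failed ...
 *	... initialize protocol state reachable via so->so_pcb ...
 *
 * Passing a connstatus of 0 instead queues the socket on so_incomp; the
 * protocol is then expected to promote it to the complete queue later,
 * typically via soisconnected().
 */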
497 
498 int
499 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
500 {
501 	int error;
502 
503 	CURVNET_SET(so->so_vnet);
504 	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
505 	CURVNET_RESTORE();
506 	return (error);
507 }
508 
509 /*
510  * solisten() transitions a socket from a non-listening state to a listening
511  * state, but can also be used to update the listen queue depth on an
512  * existing listen socket.  The protocol will call back into the sockets
513  * layer using solisten_proto_check() and solisten_proto() to check and set
514  * socket-layer listen state.  Call backs are used so that the protocol can
515  * acquire both protocol and socket layer locks in whatever order is required
516  * by the protocol.
517  *
518  * Protocol implementors are advised to hold the socket lock across the
519  * socket-layer test and set to avoid races at the socket layer.
520  */
521 int
522 solisten(struct socket *so, int backlog, struct thread *td)
523 {
524 
525 	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
526 }
527 
528 int
529 solisten_proto_check(struct socket *so)
530 {
531 
532 	SOCK_LOCK_ASSERT(so);
533 
534 	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
535 	    SS_ISDISCONNECTING))
536 		return (EINVAL);
537 	return (0);
538 }
539 
540 void
541 solisten_proto(struct socket *so, int backlog)
542 {
543 
544 	SOCK_LOCK_ASSERT(so);
545 
546 	if (backlog < 0 || backlog > somaxconn)
547 		backlog = somaxconn;
548 	so->so_qlimit = backlog;
549 	so->so_options |= SO_ACCEPTCONN;
550 }
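/*
 * A protocol's pru_listen method is expected to use the two callbacks
 * above roughly as follows (a sketch; XXX_LOCK/XXX_UNLOCK stand in for
 * whatever protocol lock must be acquired before the socket lock):
 *
 *	static int
 *	xxx_listen(struct socket *so, int backlog, struct thread *td)
 *	{
 *		int error;
 *
 *		XXX_LOCK();
 *		SOCK_LOCK(so);
 *		error = solisten_proto_check(so);
 *		if (error == 0) {
 *			... protocol-specific listen setup ...
 *			solisten_proto(so, backlog);
 *		}
 *		SOCK_UNLOCK(so);
 *		XXX_UNLOCK();
 *		return (error);
 *	}
 */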
551 
552 /*
553  * Attempt to free a socket.  This should really be sotryfree().
554  *
555  * sofree() will succeed if:
556  *
557  * - There are no outstanding file descriptor references or related consumers
558  *   (so_count == 0).
559  *
560  * - The socket has been closed by user space, if ever open (SS_NOFDREF).
561  *
562  * - The protocol does not have an outstanding strong reference on the socket
563  *   (SS_PROTOREF).
564  *
565  * - The socket is not in a completed connection queue, where a process may
566  *   have been notified that it is present.  If it were removed there, the
567  *   user process could block in accept() after select() said it was ready.
568  *
569  * Otherwise, it will quietly abort so that a future call to sofree(), when
570  * conditions are right, can succeed.
571  */
572 void
573 sofree(struct socket *so)
574 {
575 	struct protosw *pr = so->so_proto;
576 	struct socket *head;
577 
578 	ACCEPT_LOCK_ASSERT();
579 	SOCK_LOCK_ASSERT(so);
580 
581 	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
582 	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
583 		SOCK_UNLOCK(so);
584 		ACCEPT_UNLOCK();
585 		return;
586 	}
587 
588 	head = so->so_head;
589 	if (head != NULL) {
590 		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
591 		    (so->so_qstate & SQ_INCOMP) != 0,
592 		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
593 		    "SQ_INCOMP"));
594 		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
595 		    (so->so_qstate & SQ_INCOMP) == 0,
596 		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
597 		TAILQ_REMOVE(&head->so_incomp, so, so_list);
598 		head->so_incqlen--;
599 		so->so_qstate &= ~SQ_INCOMP;
600 		so->so_head = NULL;
601 	}
602 	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
603 	    (so->so_qstate & SQ_INCOMP) == 0,
604 	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
605 	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
606 	if (so->so_options & SO_ACCEPTCONN) {
607 		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
608 		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
609 	}
610 	SOCK_UNLOCK(so);
611 	ACCEPT_UNLOCK();
612 
613 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
614 		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
615 	if (pr->pr_usrreqs->pru_detach != NULL)
616 		(*pr->pr_usrreqs->pru_detach)(so);
617 
618 	/*
619 	 * From this point on, we assume that no other references to this
620 	 * socket exist anywhere else in the stack.  Therefore, no locks need
621 	 * to be acquired or held.
622 	 *
623 	 * We used to do a lot of socket buffer and socket locking here, as
624 	 * well as invoke sorflush() and perform wakeups.  The direct calls to
625 	 * dom_dispose() and sbrelease_internal() are an inlining of what was
626 	 * necessary from sorflush().
627 	 *
628 	 * Notice that the socket buffer and kqueue state are torn down
629 	 * before calling pru_detach.  This means that protocols should not
630 	 * assume they can perform socket wakeups, etc., in their detach code.
631 	 */
632 	sbdestroy(&so->so_snd, so);
633 	sbdestroy(&so->so_rcv, so);
634 	knlist_destroy(&so->so_rcv.sb_sel.si_note);
635 	knlist_destroy(&so->so_snd.sb_sel.si_note);
636 	sodealloc(so);
637 }
638 
639 /*
640  * Close a socket on last file table reference removal.  Initiate disconnect
641  * if connected.  Free socket when disconnect complete.
642  *
643  * This function will sorele() the socket.  Note that soclose() may be called
644  * prior to the ref count reaching zero.  The actual socket structure will
645  * not be freed until the ref count reaches zero.
646  */
647 int
648 soclose(struct socket *so)
649 {
650 	int error = 0;
651 
652 	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
653 
654 	CURVNET_SET(so->so_vnet);
655 	funsetown(&so->so_sigio);
656 	if (so->so_state & SS_ISCONNECTED) {
657 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
658 			error = sodisconnect(so);
659 			if (error)
660 				goto drop;
661 		}
662 		if (so->so_options & SO_LINGER) {
663 			if ((so->so_state & SS_ISDISCONNECTING) &&
664 			    (so->so_state & SS_NBIO))
665 				goto drop;
666 			while (so->so_state & SS_ISCONNECTED) {
667 				error = tsleep(&so->so_timeo,
668 				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
669 				if (error)
670 					break;
671 			}
672 		}
673 	}
674 
675 drop:
676 	if (so->so_proto->pr_usrreqs->pru_close != NULL)
677 		(*so->so_proto->pr_usrreqs->pru_close)(so);
678 	if (so->so_options & SO_ACCEPTCONN) {
679 		struct socket *sp;
680 		ACCEPT_LOCK();
681 		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
682 			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
683 			so->so_incqlen--;
684 			sp->so_qstate &= ~SQ_INCOMP;
685 			sp->so_head = NULL;
686 			ACCEPT_UNLOCK();
687 			soabort(sp);
688 			ACCEPT_LOCK();
689 		}
690 		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
691 			TAILQ_REMOVE(&so->so_comp, sp, so_list);
692 			so->so_qlen--;
693 			sp->so_qstate &= ~SQ_COMP;
694 			sp->so_head = NULL;
695 			ACCEPT_UNLOCK();
696 			soabort(sp);
697 			ACCEPT_LOCK();
698 		}
699 		ACCEPT_UNLOCK();
700 	}
701 	ACCEPT_LOCK();
702 	SOCK_LOCK(so);
703 	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
704 	so->so_state |= SS_NOFDREF;
705 	sorele(so);
706 	CURVNET_RESTORE();
707 	return (error);
708 }
709 
710 /*
711  * soabort() is used to abruptly tear down a connection, such as when a
712  * resource limit is reached (listen queue depth exceeded), or if a listen
713  * socket is closed while there are sockets waiting to be accepted.
714  *
715  * This interface is tricky, because it is called on an unreferenced socket,
716  * and must be called only by a thread that has actually removed the socket
717  * from the listen queue it was on, or races with other threads are risked.
718  *
719  * This interface will call into the protocol code, so must not be called
720  * with any socket locks held.  Protocols do call it while holding their own
721  * recursible protocol mutexes, but this is something that should be subject
722  * to review in the future.
723  */
724 void
725 soabort(struct socket *so)
726 {
727 
728 	/*
729 	 * In as much as is possible, assert that no references to this
730 	 * socket are held.  This is not quite the same as asserting that the
731 	 * current thread is responsible for arranging for no references, but
732 	 * is as close as we can get for now.
733 	 */
734 	KASSERT(so->so_count == 0, ("soabort: so_count"));
735 	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
736 	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
737 	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
738 	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
739 
740 	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
741 		(*so->so_proto->pr_usrreqs->pru_abort)(so);
742 	ACCEPT_LOCK();
743 	SOCK_LOCK(so);
744 	sofree(so);
745 }
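/*
 * The drain loops in sonewconn() and soclose() show the calling
 * convention described above: the caller unlinks the socket from its
 * listen queue under ACCEPT_LOCK, clears its queue state, drops the
 * lock, and only then calls soabort().  Excerpted from sonewconn():
 *
 *	sp = TAILQ_FIRST(&head->so_incomp);
 *	TAILQ_REMOVE(&head->so_incomp, sp, so_list);
 *	head->so_incqlen--;
 *	sp->so_qstate &= ~SQ_INCOMP;
 *	sp->so_head = NULL;
 *	ACCEPT_UNLOCK();
 *	soabort(sp);
 */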
746 
747 int
748 soaccept(struct socket *so, struct sockaddr **nam)
749 {
750 	int error;
751 
752 	SOCK_LOCK(so);
753 	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
754 	so->so_state &= ~SS_NOFDREF;
755 	SOCK_UNLOCK(so);
756 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
757 	return (error);
758 }
759 
760 int
761 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
762 {
763 	int error;
764 
765 	if (so->so_options & SO_ACCEPTCONN)
766 		return (EOPNOTSUPP);
767 	/*
768 	 * If protocol is connection-based, can only connect once.
769 	 * Otherwise, if connected, try to disconnect first.  This allows
770 	 * user to disconnect by connecting to, e.g., a null address.
771 	 */
772 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
773 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
774 	    (error = sodisconnect(so)))) {
775 		error = EISCONN;
776 	} else {
777 		/*
778 		 * Prevent accumulated error from previous connection from
779 		 * biting us.
780 		 */
781 		so->so_error = 0;
782 		CURVNET_SET(so->so_vnet);
783 		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
784 		CURVNET_RESTORE();
785 	}
786 
787 	return (error);
788 }
789 
790 int
791 soconnect2(struct socket *so1, struct socket *so2)
792 {
793 
794 	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
795 }
796 
797 int
798 sodisconnect(struct socket *so)
799 {
800 	int error;
801 
802 	if ((so->so_state & SS_ISCONNECTED) == 0)
803 		return (ENOTCONN);
804 	if (so->so_state & SS_ISDISCONNECTING)
805 		return (EALREADY);
806 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
807 	return (error);
808 }
809 
810 #ifdef ZERO_COPY_SOCKETS
811 struct so_zerocopy_stats {
812 	int size_ok;
813 	int align_ok;
814 	int found_ifp;
815 };
816 struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
817 #include <netinet/in.h>
818 #include <net/route.h>
819 #include <netinet/in_pcb.h>
820 #include <vm/vm.h>
821 #include <vm/vm_page.h>
822 #include <vm/vm_object.h>
823 
824 /*
825  * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
826  * sosend_dgram() and sosend_generic() use m_uiotombuf().
827  *
828  * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
829  * all of the data referenced by the uio.  If desired, it uses zero-copy.
830  * *space will be updated to reflect data copied in.
831  *
832  * NB: If atomic I/O is requested, the caller must already have checked that
833  * space can hold resid bytes.
834  *
835  * NB: In the event of an error, the caller may need to free the partial
836  * chain pointed to by *mpp.  The contents of both *uio and *space may be
837  * modified even in the case of an error.
838  */
839 static int
840 sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
841     int flags)
842 {
843 	struct mbuf *m, **mp, *top;
844 	long len, resid;
845 	int error;
846 #ifdef ZERO_COPY_SOCKETS
847 	int cow_send;
848 #endif
849 
850 	*retmp = top = NULL;
851 	mp = &top;
852 	len = 0;
853 	resid = uio->uio_resid;
854 	error = 0;
855 	do {
856 #ifdef ZERO_COPY_SOCKETS
857 		cow_send = 0;
858 #endif /* ZERO_COPY_SOCKETS */
859 		if (resid >= MINCLSIZE) {
860 #ifdef ZERO_COPY_SOCKETS
861 			if (top == NULL) {
862 				m = m_gethdr(M_WAITOK, MT_DATA);
863 				m->m_pkthdr.len = 0;
864 				m->m_pkthdr.rcvif = NULL;
865 			} else
866 				m = m_get(M_WAITOK, MT_DATA);
867 			if (so_zero_copy_send &&
868 			    resid>=PAGE_SIZE &&
869 			    *space>=PAGE_SIZE &&
870 			    uio->uio_iov->iov_len>=PAGE_SIZE) {
871 				so_zerocp_stats.size_ok++;
872 				so_zerocp_stats.align_ok++;
873 				cow_send = socow_setup(m, uio);
874 				len = cow_send;
875 			}
876 			if (!cow_send) {
877 				m_clget(m, M_WAITOK);
878 				len = min(min(MCLBYTES, resid), *space);
879 			}
880 #else /* ZERO_COPY_SOCKETS */
881 			if (top == NULL) {
882 				m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
883 				m->m_pkthdr.len = 0;
884 				m->m_pkthdr.rcvif = NULL;
885 			} else
886 				m = m_getcl(M_WAIT, MT_DATA, 0);
887 			len = min(min(MCLBYTES, resid), *space);
888 #endif /* ZERO_COPY_SOCKETS */
889 		} else {
890 			if (top == NULL) {
891 				m = m_gethdr(M_WAIT, MT_DATA);
892 				m->m_pkthdr.len = 0;
893 				m->m_pkthdr.rcvif = NULL;
894 
895 				len = min(min(MHLEN, resid), *space);
896 				/*
897 				 * For datagram protocols, leave room
898 				 * for protocol headers in first mbuf.
899 				 */
900 				if (atomic && m && len < MHLEN)
901 					MH_ALIGN(m, len);
902 			} else {
903 				m = m_get(M_WAIT, MT_DATA);
904 				len = min(min(MLEN, resid), *space);
905 			}
906 		}
907 		if (m == NULL) {
908 			error = ENOBUFS;
909 			goto out;
910 		}
911 
912 		*space -= len;
913 #ifdef ZERO_COPY_SOCKETS
914 		if (cow_send)
915 			error = 0;
916 		else
917 #endif /* ZERO_COPY_SOCKETS */
918 		error = uiomove(mtod(m, void *), (int)len, uio);
919 		resid = uio->uio_resid;
920 		m->m_len = len;
921 		*mp = m;
922 		top->m_pkthdr.len += len;
923 		if (error)
924 			goto out;
925 		mp = &m->m_next;
926 		if (resid <= 0) {
927 			if (flags & MSG_EOR)
928 				top->m_flags |= M_EOR;
929 			break;
930 		}
931 	} while (*space > 0 && atomic);
932 out:
933 	*retmp = top;
934 	return (error);
935 }
936 #endif /*ZERO_COPY_SOCKETS*/
937 
938 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
939 
940 int
941 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
942     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
943 {
944 	long space, resid;
945 	int clen = 0, error, dontroute;
946 #ifdef ZERO_COPY_SOCKETS
947 	int atomic = sosendallatonce(so) || top;
948 #endif
949 
950 	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
951 	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
952 	    ("sosend_dgram: !PR_ATOMIC"));
953 
954 	if (uio != NULL)
955 		resid = uio->uio_resid;
956 	else
957 		resid = top->m_pkthdr.len;
958 	/*
959 	 * In theory resid should be unsigned.  However, space must be
960 	 * signed, as it might be less than 0 if we over-committed, and we
961 	 * must use a signed comparison of space and resid.  On the other
962 	 * hand, a negative resid causes us to loop sending 0-length
963 	 * segments to the protocol.
964 	 *
965 	 * Unlike sosend_generic(), we need not check for MSG_EOR misuse on
966 	 * SOCK_STREAM sockets here; this function handles only SOCK_DGRAM.
967 	 */
968 	if (resid < 0) {
969 		error = EINVAL;
970 		goto out;
971 	}
972 
973 	dontroute =
974 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
975 	if (td != NULL)
976 		td->td_ru.ru_msgsnd++;
977 	if (control != NULL)
978 		clen = control->m_len;
979 
980 	SOCKBUF_LOCK(&so->so_snd);
981 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
982 		SOCKBUF_UNLOCK(&so->so_snd);
983 		error = EPIPE;
984 		goto out;
985 	}
986 	if (so->so_error) {
987 		error = so->so_error;
988 		so->so_error = 0;
989 		SOCKBUF_UNLOCK(&so->so_snd);
990 		goto out;
991 	}
992 	if ((so->so_state & SS_ISCONNECTED) == 0) {
993 		/*
994 		 * `sendto' and `sendmsg' are allowed on a connection-based
995 		 * socket if it supports implied connect.  Return ENOTCONN if
996 		 * not connected and no address is supplied.
997 		 */
998 		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
999 		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1000 			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1001 			    !(resid == 0 && clen != 0)) {
1002 				SOCKBUF_UNLOCK(&so->so_snd);
1003 				error = ENOTCONN;
1004 				goto out;
1005 			}
1006 		} else if (addr == NULL) {
1007 			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1008 				error = ENOTCONN;
1009 			else
1010 				error = EDESTADDRREQ;
1011 			SOCKBUF_UNLOCK(&so->so_snd);
1012 			goto out;
1013 		}
1014 	}
1015 
1016 	/*
1017 	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1018 	 * problem and need fixing.
1019 	 */
1020 	space = sbspace(&so->so_snd);
1021 	if (flags & MSG_OOB)
1022 		space += 1024;
1023 	space -= clen;
1024 	SOCKBUF_UNLOCK(&so->so_snd);
1025 	if (resid > space) {
1026 		error = EMSGSIZE;
1027 		goto out;
1028 	}
1029 	if (uio == NULL) {
1030 		resid = 0;
1031 		if (flags & MSG_EOR)
1032 			top->m_flags |= M_EOR;
1033 	} else {
1034 #ifdef ZERO_COPY_SOCKETS
1035 		error = sosend_copyin(uio, &top, atomic, &space, flags);
1036 		if (error)
1037 			goto out;
1038 #else
1039 		/*
1040 		 * Copy the data from userland into a mbuf chain.
1041 		 * If no data is to be copied in, a single empty mbuf
1042 		 * is returned.
1043 		 */
1044 		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1045 		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1046 		if (top == NULL) {
1047 			error = EFAULT;	/* only possible error */
1048 			goto out;
1049 		}
1050 		space -= resid - uio->uio_resid;
1051 #endif
1052 		resid = uio->uio_resid;
1053 	}
1054 	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1055 	/*
1056 	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1057 	 * than with.
1058 	 */
1059 	if (dontroute) {
1060 		SOCK_LOCK(so);
1061 		so->so_options |= SO_DONTROUTE;
1062 		SOCK_UNLOCK(so);
1063 	}
1064 	/*
1065 	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1066 	 * of date.  We could have received a reset packet in an interrupt or
1067 	 * maybe we slept while doing page faults in uiomove() etc.  We could
1068 	 * probably recheck again inside the locking protection here, but
1069 	 * there are probably other places that this also happens.  We must
1070 	 * rethink this.
1071 	 */
1072 	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1073 	    (flags & MSG_OOB) ? PRUS_OOB :
1074 	/*
1075 	 * If the user set MSG_EOF, the protocol understands this flag, and
1076 	 * there is nothing left to send, use PRU_SEND_EOF instead of PRU_SEND.
1077 	 */
1078 	    ((flags & MSG_EOF) &&
1079 	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1080 	     (resid <= 0)) ?
1081 		PRUS_EOF :
1082 		/* If there is more to send set PRUS_MORETOCOME */
1083 		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1084 		top, addr, control, td);
1085 	if (dontroute) {
1086 		SOCK_LOCK(so);
1087 		so->so_options &= ~SO_DONTROUTE;
1088 		SOCK_UNLOCK(so);
1089 	}
1090 	clen = 0;
1091 	control = NULL;
1092 	top = NULL;
1093 out:
1094 	if (top != NULL)
1095 		m_freem(top);
1096 	if (control != NULL)
1097 		m_freem(control);
1098 	return (error);
1099 }
1100 
1101 /*
1102  * Send on a socket.  If send must go all at once and message is larger than
1103  * send buffering, then hard error.  Lock against other senders.  If must go
1104  * all at once and not enough room now, then inform user that this would
1105  * block and do nothing.  Otherwise, if nonblocking, send as much as
1106  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1107  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1108  * in mbuf chain must be small enough to send all at once.
1109  *
1110  * Returns nonzero on error, timeout or signal; callers must check for short
1111  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1112  * on return.
1113  */
1114 int
1115 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1116     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1117 {
1118 	long space, resid;
1119 	int clen = 0, error, dontroute;
1120 	int atomic = sosendallatonce(so) || top;
1121 
1122 	if (uio != NULL)
1123 		resid = uio->uio_resid;
1124 	else
1125 		resid = top->m_pkthdr.len;
1126 	/*
1127 	 * In theory resid should be unsigned.  However, space must be
1128 	 * signed, as it might be less than 0 if we over-committed, and we
1129 	 * must use a signed comparison of space and resid.  On the other
1130 	 * hand, a negative resid causes us to loop sending 0-length
1131 	 * segments to the protocol.
1132 	 *
1133 	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1134 	 * type sockets since that's an error.
1135 	 */
1136 	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1137 		error = EINVAL;
1138 		goto out;
1139 	}
1140 
1141 	dontroute =
1142 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1143 	    (so->so_proto->pr_flags & PR_ATOMIC);
1144 	if (td != NULL)
1145 		td->td_ru.ru_msgsnd++;
1146 	if (control != NULL)
1147 		clen = control->m_len;
1148 
1149 	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1150 	if (error)
1151 		goto out;
1152 
1153 restart:
1154 	do {
1155 		SOCKBUF_LOCK(&so->so_snd);
1156 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1157 			SOCKBUF_UNLOCK(&so->so_snd);
1158 			error = EPIPE;
1159 			goto release;
1160 		}
1161 		if (so->so_error) {
1162 			error = so->so_error;
1163 			so->so_error = 0;
1164 			SOCKBUF_UNLOCK(&so->so_snd);
1165 			goto release;
1166 		}
1167 		if ((so->so_state & SS_ISCONNECTED) == 0) {
1168 			/*
1169 			 * `sendto' and `sendmsg' are allowed on a connection-
1170 			 * based socket if it supports implied connect.
1171 			 * Return ENOTCONN if not connected and no address is
1172 			 * supplied.
1173 			 */
1174 			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1175 			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1176 				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1177 				    !(resid == 0 && clen != 0)) {
1178 					SOCKBUF_UNLOCK(&so->so_snd);
1179 					error = ENOTCONN;
1180 					goto release;
1181 				}
1182 			} else if (addr == NULL) {
1183 				SOCKBUF_UNLOCK(&so->so_snd);
1184 				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1185 					error = ENOTCONN;
1186 				else
1187 					error = EDESTADDRREQ;
1188 				goto release;
1189 			}
1190 		}
1191 		space = sbspace(&so->so_snd);
1192 		if (flags & MSG_OOB)
1193 			space += 1024;
1194 		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1195 		    clen > so->so_snd.sb_hiwat) {
1196 			SOCKBUF_UNLOCK(&so->so_snd);
1197 			error = EMSGSIZE;
1198 			goto release;
1199 		}
1200 		if (space < resid + clen &&
1201 		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1202 			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1203 				SOCKBUF_UNLOCK(&so->so_snd);
1204 				error = EWOULDBLOCK;
1205 				goto release;
1206 			}
1207 			error = sbwait(&so->so_snd);
1208 			SOCKBUF_UNLOCK(&so->so_snd);
1209 			if (error)
1210 				goto release;
1211 			goto restart;
1212 		}
1213 		SOCKBUF_UNLOCK(&so->so_snd);
1214 		space -= clen;
1215 		do {
1216 			if (uio == NULL) {
1217 				resid = 0;
1218 				if (flags & MSG_EOR)
1219 					top->m_flags |= M_EOR;
1220 			} else {
1221 #ifdef ZERO_COPY_SOCKETS
1222 				error = sosend_copyin(uio, &top, atomic,
1223 				    &space, flags);
1224 				if (error != 0)
1225 					goto release;
1226 #else
1227 				/*
1228 				 * Copy the data from userland into a mbuf
1229 				 * chain.  If no data is to be copied in,
1230 				 * a single empty mbuf is returned.
1231 				 */
1232 				top = m_uiotombuf(uio, M_WAITOK, space,
1233 				    (atomic ? max_hdr : 0),
1234 				    (atomic ? M_PKTHDR : 0) |
1235 				    ((flags & MSG_EOR) ? M_EOR : 0));
1236 				if (top == NULL) {
1237 					error = EFAULT; /* only possible error */
1238 					goto release;
1239 				}
1240 				space -= resid - uio->uio_resid;
1241 #endif
1242 				resid = uio->uio_resid;
1243 			}
1244 			if (dontroute) {
1245 				SOCK_LOCK(so);
1246 				so->so_options |= SO_DONTROUTE;
1247 				SOCK_UNLOCK(so);
1248 			}
1249 			/*
1250 			 * XXX all the SBS_CANTSENDMORE checks previously
1251 			 * done could be out of date.  We could have received
1252 			 * a reset packet in an interrupt or maybe we slept
1253 			 * while doing page faults in uiomove() etc.  We
1254 			 * could probably recheck again inside the locking
1255 			 * protection here, but there are probably other
1256 			 * places that this also happens.  We must rethink
1257 			 * this.
1258 			 */
1259 			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1260 			    (flags & MSG_OOB) ? PRUS_OOB :
1261 			/*
1262 			 * If the user set MSG_EOF, the protocol understands
1263 			 * this flag, and there is nothing left to send, then use
1264 			 * PRU_SEND_EOF instead of PRU_SEND.
1265 			 */
1266 			    ((flags & MSG_EOF) &&
1267 			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1268 			     (resid <= 0)) ?
1269 				PRUS_EOF :
1270 			/* If there is more to send set PRUS_MORETOCOME. */
1271 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1272 			    top, addr, control, td);
1273 			if (dontroute) {
1274 				SOCK_LOCK(so);
1275 				so->so_options &= ~SO_DONTROUTE;
1276 				SOCK_UNLOCK(so);
1277 			}
1278 			clen = 0;
1279 			control = NULL;
1280 			top = NULL;
1281 			if (error)
1282 				goto release;
1283 		} while (resid && space > 0);
1284 	} while (resid);
1285 
1286 release:
1287 	sbunlock(&so->so_snd);
1288 out:
1289 	if (top != NULL)
1290 		m_freem(top);
1291 	if (control != NULL)
1292 		m_freem(control);
1293 	return (error);
1294 }
1295 
1296 int
1297 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1298     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1299 {
1300 	int error;
1301 
1302 	CURVNET_SET(so->so_vnet);
1303 	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1304 	    control, flags, td);
1305 	CURVNET_RESTORE();
1306 	return (error);
1307 }
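/*
 * Example: a kernel consumer sending a buffer it already holds (a sketch;
 * "data", "len", "so", and "td" are placeholders for the caller's state):
 *
 *	struct uio auio;
 *	struct iovec aiov;
 *	int error;
 *
 *	aiov.iov_base = data;
 *	aiov.iov_len = len;
 *	auio.uio_iov = &aiov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = len;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_WRITE;
 *	auio.uio_td = td;
 *	error = sosend(so, NULL, &auio, NULL, NULL, 0, td);
 */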
1308 
1309 /*
1310  * The part of soreceive() that implements reading non-inline out-of-band
1311  * data from a socket.  For more complete comments, see soreceive(), from
1312  * which this code originated.
1313  *
1314  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1315  * unable to return an mbuf chain to the caller.
1316  */
1317 static int
1318 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1319 {
1320 	struct protosw *pr = so->so_proto;
1321 	struct mbuf *m;
1322 	int error;
1323 
1324 	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1325 
1326 	m = m_get(M_WAIT, MT_DATA);
1327 	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1328 	if (error)
1329 		goto bad;
1330 	do {
1331 #ifdef ZERO_COPY_SOCKETS
1332 		if (so_zero_copy_receive) {
1333 			int disposable;
1334 
1335 			if ((m->m_flags & M_EXT)
1336 			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1337 				disposable = 1;
1338 			else
1339 				disposable = 0;
1340 
1341 			error = uiomoveco(mtod(m, void *),
1342 					  min(uio->uio_resid, m->m_len),
1343 					  uio, disposable);
1344 		} else
1345 #endif /* ZERO_COPY_SOCKETS */
1346 		error = uiomove(mtod(m, void *),
1347 		    (int) min(uio->uio_resid, m->m_len), uio);
1348 		m = m_free(m);
1349 	} while (uio->uio_resid && error == 0 && m);
1350 bad:
1351 	if (m != NULL)
1352 		m_freem(m);
1353 	return (error);
1354 }
1355 
1356 /*
1357  * Following replacement or removal of the first mbuf on the first mbuf chain
1358  * of a socket buffer, push necessary state changes back into the socket
1359  * buffer so that other consumers see the values consistently.  'nextrecord'
1360  * is the caller's locally stored copy of the original value of
1361  * sb->sb_mb->m_nextpkt, which must be restored when the lead mbuf changes.
1362  * NOTE: 'nextrecord' may be NULL.
1363  */
1364 static __inline void
1365 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1366 {
1367 
1368 	SOCKBUF_LOCK_ASSERT(sb);
1369 	/*
1370 	 * First, update for the new value of nextrecord.  If necessary, make
1371 	 * it the first record.
1372 	 */
1373 	if (sb->sb_mb != NULL)
1374 		sb->sb_mb->m_nextpkt = nextrecord;
1375 	else
1376 		sb->sb_mb = nextrecord;
1377 
1378 	/*
1379 	 * Now update any dependent socket buffer fields to reflect the new
1380 	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1381 	 * addition of a second clause that takes care of the case where
1382 	 * sb_mb has been updated, but remains the last record.
1383 	 */
1384 	if (sb->sb_mb == NULL) {
1385 		sb->sb_mbtail = NULL;
1386 		sb->sb_lastrecord = NULL;
1387 	} else if (sb->sb_mb->m_nextpkt == NULL)
1388 		sb->sb_lastrecord = sb->sb_mb;
1389 }
1390 
1391 
1392 /*
1393  * Implement receive operations on a socket.  We depend on the way that
1394  * records are added to the sockbuf by sbappend.  In particular, each record
1395  * (mbufs linked through m_next) must begin with an address if the protocol
1396  * so specifies, followed by an optional mbuf or mbufs containing ancillary
1397  * data, and then zero or more mbufs of data.  In order to allow parallelism
1398  * between network receive and copying to user space, as well as avoid
1399  * sleeping with a mutex held, we release the socket buffer mutex during the
1400  * user space copy.  Although the sockbuf is locked, new data may still be
1401  * appended, and thus we must maintain consistency of the sockbuf during that
1402  * time.
1403  *
1404  * The caller may receive the data as a single mbuf chain by supplying an
1405  * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1406  * the count in uio_resid.
1407  */
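/*
 * The record layout assumed above looks like this for a PR_ADDR protocol
 * (a sketch; the MT_SONAME mbuf is present only when the protocol
 * specifies addresses, and MT_CONTROL mbufs are optional):
 *
 *	sb_mb -> [MT_SONAME] -m_next-> [MT_CONTROL] -m_next-> [MT_DATA] ...
 *	             |
 *	         m_nextpkt
 *	             |
 *	             v
 *	         [MT_SONAME] -m_next-> ...		(next record)
 */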
1408 int
1409 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1410     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1411 {
1412 	struct mbuf *m, **mp;
1413 	int flags, len, error, offset;
1414 	struct protosw *pr = so->so_proto;
1415 	struct mbuf *nextrecord;
1416 	int moff, type = 0;
1417 	int orig_resid = uio->uio_resid;
1418 
1419 	mp = mp0;
1420 	if (psa != NULL)
1421 		*psa = NULL;
1422 	if (controlp != NULL)
1423 		*controlp = NULL;
1424 	if (flagsp != NULL)
1425 		flags = *flagsp &~ MSG_EOR;
1426 	else
1427 		flags = 0;
1428 	if (flags & MSG_OOB)
1429 		return (soreceive_rcvoob(so, uio, flags));
1430 	if (mp != NULL)
1431 		*mp = NULL;
1432 	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1433 	    && uio->uio_resid)
1434 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1435 
1436 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1437 	if (error)
1438 		return (error);
1439 
1440 restart:
1441 	SOCKBUF_LOCK(&so->so_rcv);
1442 	m = so->so_rcv.sb_mb;
1443 	/*
1444 	 * If we have less data than requested, block awaiting more (subject
1445 	 * to any timeout) if:
1446 	 *   1. the current count is less than the low water mark, or
1447 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1448 	 *	receive operation at once if we block (resid <= hiwat), and
1449 	 *   3. MSG_DONTWAIT is not set.
1450 	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1451 	 * we have to do the receive in sections, and thus risk returning a
1452 	 * short count if a timeout or signal occurs after we start.
1453 	 */
1454 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1455 	    so->so_rcv.sb_cc < uio->uio_resid) &&
1456 	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1457 	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1458 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1459 		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1460 		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1461 		    m, so->so_rcv.sb_cc));
1462 		if (so->so_error) {
1463 			if (m != NULL)
1464 				goto dontblock;
1465 			error = so->so_error;
1466 			if ((flags & MSG_PEEK) == 0)
1467 				so->so_error = 0;
1468 			SOCKBUF_UNLOCK(&so->so_rcv);
1469 			goto release;
1470 		}
1471 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1472 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1473 			if (m == NULL) {
1474 				SOCKBUF_UNLOCK(&so->so_rcv);
1475 				goto release;
1476 			} else
1477 				goto dontblock;
1478 		}
1479 		for (; m != NULL; m = m->m_next)
1480 			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1481 				m = so->so_rcv.sb_mb;
1482 				goto dontblock;
1483 			}
1484 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1485 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1486 			SOCKBUF_UNLOCK(&so->so_rcv);
1487 			error = ENOTCONN;
1488 			goto release;
1489 		}
1490 		if (uio->uio_resid == 0) {
1491 			SOCKBUF_UNLOCK(&so->so_rcv);
1492 			goto release;
1493 		}
1494 		if ((so->so_state & SS_NBIO) ||
1495 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1496 			SOCKBUF_UNLOCK(&so->so_rcv);
1497 			error = EWOULDBLOCK;
1498 			goto release;
1499 		}
1500 		SBLASTRECORDCHK(&so->so_rcv);
1501 		SBLASTMBUFCHK(&so->so_rcv);
1502 		error = sbwait(&so->so_rcv);
1503 		SOCKBUF_UNLOCK(&so->so_rcv);
1504 		if (error)
1505 			goto release;
1506 		goto restart;
1507 	}
1508 dontblock:
1509 	/*
1510 	 * From this point onward, we maintain 'nextrecord' as a cache of the
1511 	 * pointer to the next record in the socket buffer.  We must keep the
1512 	 * various socket buffer pointers and local stack versions of the
1513 	 * pointers in sync, pushing out modifications before dropping the
1514 	 * socket buffer mutex, and re-reading them when picking it up.
1515 	 *
1516 	 * Otherwise, we will race with the network stack appending new data
1517 	 * or records onto the socket buffer by using inconsistent/stale
1518 	 * versions of the field, possibly resulting in socket buffer
1519 	 * corruption.
1520 	 *
1521 	 * By holding the high-level sblock(), we prevent simultaneous
1522 	 * readers from pulling off the front of the socket buffer.
1523 	 */
1524 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1525 	if (uio->uio_td)
1526 		uio->uio_td->td_ru.ru_msgrcv++;
1527 	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1528 	SBLASTRECORDCHK(&so->so_rcv);
1529 	SBLASTMBUFCHK(&so->so_rcv);
1530 	nextrecord = m->m_nextpkt;
1531 	if (pr->pr_flags & PR_ADDR) {
1532 		KASSERT(m->m_type == MT_SONAME,
1533 		    ("m->m_type == %d", m->m_type));
1534 		orig_resid = 0;
1535 		if (psa != NULL)
1536 			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1537 			    M_NOWAIT);
1538 		if (flags & MSG_PEEK) {
1539 			m = m->m_next;
1540 		} else {
1541 			sbfree(&so->so_rcv, m);
1542 			so->so_rcv.sb_mb = m_free(m);
1543 			m = so->so_rcv.sb_mb;
1544 			sockbuf_pushsync(&so->so_rcv, nextrecord);
1545 		}
1546 	}
1547 
1548 	/*
1549 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1550 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1551 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1552 	 * perform externalization (or freeing if controlp == NULL).
1553 	 */
1554 	if (m != NULL && m->m_type == MT_CONTROL) {
1555 		struct mbuf *cm = NULL, *cmn;
1556 		struct mbuf **cme = &cm;
1557 
1558 		do {
1559 			if (flags & MSG_PEEK) {
1560 				if (controlp != NULL) {
1561 					*controlp = m_copy(m, 0, m->m_len);
1562 					controlp = &(*controlp)->m_next;
1563 				}
1564 				m = m->m_next;
1565 			} else {
1566 				sbfree(&so->so_rcv, m);
1567 				so->so_rcv.sb_mb = m->m_next;
1568 				m->m_next = NULL;
1569 				*cme = m;
1570 				cme = &(*cme)->m_next;
1571 				m = so->so_rcv.sb_mb;
1572 			}
1573 		} while (m != NULL && m->m_type == MT_CONTROL);
1574 		if ((flags & MSG_PEEK) == 0)
1575 			sockbuf_pushsync(&so->so_rcv, nextrecord);
1576 		while (cm != NULL) {
1577 			cmn = cm->m_next;
1578 			cm->m_next = NULL;
1579 			if (pr->pr_domain->dom_externalize != NULL) {
1580 				SOCKBUF_UNLOCK(&so->so_rcv);
1581 				error = (*pr->pr_domain->dom_externalize)
1582 				    (cm, controlp);
1583 				SOCKBUF_LOCK(&so->so_rcv);
1584 			} else if (controlp != NULL)
1585 				*controlp = cm;
1586 			else
1587 				m_freem(cm);
1588 			if (controlp != NULL) {
1589 				orig_resid = 0;
1590 				while (*controlp != NULL)
1591 					controlp = &(*controlp)->m_next;
1592 			}
1593 			cm = cmn;
1594 		}
1595 		if (m != NULL)
1596 			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1597 		else
1598 			nextrecord = so->so_rcv.sb_mb;
1599 		orig_resid = 0;
1600 	}
1601 	if (m != NULL) {
1602 		if ((flags & MSG_PEEK) == 0) {
1603 			KASSERT(m->m_nextpkt == nextrecord,
1604 			    ("soreceive: post-control, nextrecord !sync"));
1605 			if (nextrecord == NULL) {
1606 				KASSERT(so->so_rcv.sb_mb == m,
1607 				    ("soreceive: post-control, sb_mb!=m"));
1608 				KASSERT(so->so_rcv.sb_lastrecord == m,
1609 				    ("soreceive: post-control, lastrecord!=m"));
1610 			}
1611 		}
1612 		type = m->m_type;
1613 		if (type == MT_OOBDATA)
1614 			flags |= MSG_OOB;
1615 	} else {
1616 		if ((flags & MSG_PEEK) == 0) {
1617 			KASSERT(so->so_rcv.sb_mb == nextrecord,
1618 			    ("soreceive: sb_mb != nextrecord"));
1619 			if (so->so_rcv.sb_mb == NULL) {
1620 				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1621 				    ("soreceive: sb_lastercord != NULL"));
1622 			}
1623 		}
1624 	}
1625 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1626 	SBLASTRECORDCHK(&so->so_rcv);
1627 	SBLASTMBUFCHK(&so->so_rcv);
1628 
1629 	/*
1630 	 * Now continue to read any data mbufs off of the head of the socket
1631 	 * buffer until the read request is satisfied.  Note that 'type' is
1632 	 * used to store the type of any mbuf reads that have happened so far
1633 	 * such that soreceive() can stop reading if the type changes, which
1634 	 * causes soreceive() to return either regular data or inline
1635 	 * out-of-band data, but not both, in a single socket receive operation.
1636 	 */
1637 	moff = 0;
1638 	offset = 0;
1639 	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1640 		/*
1641 		 * If the type of mbuf has changed since the last mbuf
1642 		 * examined ('type'), end the receive operation.
1643 		 */
1644 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1645 		if (m->m_type == MT_OOBDATA) {
1646 			if (type != MT_OOBDATA)
1647 				break;
1648 		} else if (type == MT_OOBDATA)
1649 			break;
1650 		else
1651 		    KASSERT(m->m_type == MT_DATA,
1652 			("m->m_type == %d", m->m_type));
1653 		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1654 		len = uio->uio_resid;
1655 		if (so->so_oobmark && len > so->so_oobmark - offset)
1656 			len = so->so_oobmark - offset;
1657 		if (len > m->m_len - moff)
1658 			len = m->m_len - moff;
1659 		/*
1660 		 * If mp is set, just pass back the mbufs.  Otherwise copy
1661 		 * them out via the uio, then free.  The sockbuf must be
1662 		 * consistent (sb_mb pointing to the current mbuf, m_nextpkt
1663 		 * to the next record) when we drop the sockbuf lock; we must
1664 		 * note any additions to the sockbuf when we reacquire it.
1665 		 */
1666 		if (mp == NULL) {
1667 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1668 			SBLASTRECORDCHK(&so->so_rcv);
1669 			SBLASTMBUFCHK(&so->so_rcv);
1670 			SOCKBUF_UNLOCK(&so->so_rcv);
1671 #ifdef ZERO_COPY_SOCKETS
1672 			if (so_zero_copy_receive) {
1673 				int disposable;
1674 
1675 				if ((m->m_flags & M_EXT)
1676 				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1677 					disposable = 1;
1678 				else
1679 					disposable = 0;
1680 
1681 				error = uiomoveco(mtod(m, char *) + moff,
1682 						  (int)len, uio,
1683 						  disposable);
1684 			} else
1685 #endif /* ZERO_COPY_SOCKETS */
1686 			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1687 			SOCKBUF_LOCK(&so->so_rcv);
1688 			if (error) {
1689 				/*
1690 				 * The MT_SONAME mbuf has already been removed
1691 				 * from the record, so it is necessary to
1692 				 * remove the data mbufs, if any, to preserve
1693 				 * the invariant in the case of PR_ADDR that
1694 				 * requires MT_SONAME mbufs at the head of
1695 				 * each record.
1696 				 */
1697 				if (m && pr->pr_flags & PR_ATOMIC &&
1698 				    ((flags & MSG_PEEK) == 0))
1699 					(void)sbdroprecord_locked(&so->so_rcv);
1700 				SOCKBUF_UNLOCK(&so->so_rcv);
1701 				goto release;
1702 			}
1703 		} else
1704 			uio->uio_resid -= len;
1705 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1706 		if (len == m->m_len - moff) {
1707 			if (m->m_flags & M_EOR)
1708 				flags |= MSG_EOR;
1709 			if (flags & MSG_PEEK) {
1710 				m = m->m_next;
1711 				moff = 0;
1712 			} else {
1713 				nextrecord = m->m_nextpkt;
1714 				sbfree(&so->so_rcv, m);
1715 				if (mp != NULL) {
1716 					*mp = m;
1717 					mp = &m->m_next;
1718 					so->so_rcv.sb_mb = m = m->m_next;
1719 					*mp = NULL;
1720 				} else {
1721 					so->so_rcv.sb_mb = m_free(m);
1722 					m = so->so_rcv.sb_mb;
1723 				}
1724 				sockbuf_pushsync(&so->so_rcv, nextrecord);
1725 				SBLASTRECORDCHK(&so->so_rcv);
1726 				SBLASTMBUFCHK(&so->so_rcv);
1727 			}
1728 		} else {
1729 			if (flags & MSG_PEEK)
1730 				moff += len;
1731 			else {
1732 				if (mp != NULL) {
1733 					int copy_flag;
1734 
1735 					if (flags & MSG_DONTWAIT)
1736 						copy_flag = M_DONTWAIT;
1737 					else
1738 						copy_flag = M_WAIT;
1739 					if (copy_flag == M_WAIT)
1740 						SOCKBUF_UNLOCK(&so->so_rcv);
1741 					*mp = m_copym(m, 0, len, copy_flag);
1742 					if (copy_flag == M_WAIT)
1743 						SOCKBUF_LOCK(&so->so_rcv);
1744 				if (*mp == NULL) {
1745 					/*
1746 					 * m_copym() couldn't allocate
1747 					 * an mbuf.  Adjust uio_resid
1748 					 * back (it was adjusted down
1749 					 * by len bytes, which we
1750 					 * didn't end up "copying"
1751 					 * over).
1752 					 */
1753 					uio->uio_resid += len;
1754 					break;
1755 				}
1756 				}
1757 				m->m_data += len;
1758 				m->m_len -= len;
1759 				so->so_rcv.sb_cc -= len;
1760 			}
1761 		}
1762 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1763 		if (so->so_oobmark) {
1764 			if ((flags & MSG_PEEK) == 0) {
1765 				so->so_oobmark -= len;
1766 				if (so->so_oobmark == 0) {
1767 					so->so_rcv.sb_state |= SBS_RCVATMARK;
1768 					break;
1769 				}
1770 			} else {
1771 				offset += len;
1772 				if (offset == so->so_oobmark)
1773 					break;
1774 			}
1775 		}
1776 		if (flags & MSG_EOR)
1777 			break;
1778 		/*
1779 		 * If the MSG_WAITALL flag is set (for a non-atomic socket),
1780 		 * we must not quit until "uio->uio_resid == 0" or an error
1781 		 * terminates the receive.  If a signal/timeout occurs,
1782 		 * return with a short count but without error.  Keep the
1783 		 * sockbuf locked against other readers.
1784 		 */
1785 		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1786 		    !sosendallatonce(so) && nextrecord == NULL) {
1787 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1788 			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1789 				break;
1790 			/*
1791 			 * Notify the protocol that some data has been
1792 			 * drained before blocking.
1793 			 */
1794 			if (pr->pr_flags & PR_WANTRCVD) {
1795 				SOCKBUF_UNLOCK(&so->so_rcv);
1796 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1797 				SOCKBUF_LOCK(&so->so_rcv);
1798 			}
1799 			SBLASTRECORDCHK(&so->so_rcv);
1800 			SBLASTMBUFCHK(&so->so_rcv);
1801 			error = sbwait(&so->so_rcv);
1802 			if (error) {
1803 				SOCKBUF_UNLOCK(&so->so_rcv);
1804 				goto release;
1805 			}
1806 			m = so->so_rcv.sb_mb;
1807 			if (m != NULL)
1808 				nextrecord = m->m_nextpkt;
1809 		}
1810 	}
1811 
1812 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1813 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1814 		flags |= MSG_TRUNC;
1815 		if ((flags & MSG_PEEK) == 0)
1816 			(void) sbdroprecord_locked(&so->so_rcv);
1817 	}
1818 	if ((flags & MSG_PEEK) == 0) {
1819 		if (m == NULL) {
1820 			/*
1821 			 * First part is an inline SB_EMPTY_FIXUP().  Second
1822 			 * part makes sure sb_lastrecord is up-to-date if
1823 			 * there is still data in the socket buffer.
1824 			 */
1825 			so->so_rcv.sb_mb = nextrecord;
1826 			if (so->so_rcv.sb_mb == NULL) {
1827 				so->so_rcv.sb_mbtail = NULL;
1828 				so->so_rcv.sb_lastrecord = NULL;
1829 			} else if (nextrecord->m_nextpkt == NULL)
1830 				so->so_rcv.sb_lastrecord = nextrecord;
1831 		}
1832 		SBLASTRECORDCHK(&so->so_rcv);
1833 		SBLASTMBUFCHK(&so->so_rcv);
1834 		/*
1835 		 * If soreceive() is being done from the socket callback, we
1836 		 * need not generate an ACK to the peer to update the window,
1837 		 * since the ACK will be generated on return to TCP.
1838 		 */
1839 		if (!(flags & MSG_SOCALLBCK) &&
1840 		    (pr->pr_flags & PR_WANTRCVD)) {
1841 			SOCKBUF_UNLOCK(&so->so_rcv);
1842 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1843 			SOCKBUF_LOCK(&so->so_rcv);
1844 		}
1845 	}
1846 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1847 	if (orig_resid == uio->uio_resid && orig_resid &&
1848 	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1849 		SOCKBUF_UNLOCK(&so->so_rcv);
1850 		goto restart;
1851 	}
1852 	SOCKBUF_UNLOCK(&so->so_rcv);
1853 
1854 	if (flagsp != NULL)
1855 		*flagsp |= flags;
1856 release:
1857 	sbunlock(&so->so_rcv);
1858 	return (error);
1859 }
1860 
1861 /*
1862  * Optimized version of soreceive() for stream (TCP) sockets.
1863  */
1864 int
1865 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
1866     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1867 {
1868 	int len = 0, error = 0, flags, oresid;
1869 	struct sockbuf *sb;
1870 	struct mbuf *m, *n = NULL;
1871 
1872 	/* We only do stream sockets. */
1873 	if (so->so_type != SOCK_STREAM)
1874 		return (EINVAL);
1875 	if (psa != NULL)
1876 		*psa = NULL;
1877 	if (controlp != NULL)
1878 		return (EINVAL);
1879 	if (flagsp != NULL)
1880 		flags = *flagsp &~ MSG_EOR;
1881 	else
1882 		flags = 0;
1883 	if (flags & MSG_OOB)
1884 		return (soreceive_rcvoob(so, uio, flags));
1885 	if (mp0 != NULL)
1886 		*mp0 = NULL;
1887 
1888 	sb = &so->so_rcv;
1889 
1890 	/* Prevent other readers from entering the socket. */
1891 	error = sblock(sb, SBLOCKWAIT(flags));
1892 	if (error)
1893 		return (error);
1894 	SOCKBUF_LOCK(sb);
1895 
1896 	/* Easy one, no space to copyout anything. */
1897 	if (uio->uio_resid == 0) {
1898 		error = EINVAL;
1899 		goto out;
1900 	}
1901 	oresid = uio->uio_resid;
1902 
1903 	/* We will never ever get anything unless we are connected. */
1904 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1905 		/* When disconnecting there may be still some data left. */
1906 		if (sb->sb_cc > 0)
1907 			goto deliver;
1908 		if (!(so->so_state & SS_ISDISCONNECTED))
1909 			error = ENOTCONN;
1910 		goto out;
1911 	}
1912 
1913 	/* Socket buffer is empty and we shall not block. */
1914 	if (sb->sb_cc == 0 &&
1915 	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1916 		error = EAGAIN;
1917 		goto out;
1918 	}
1919 
1920 restart:
1921 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1922 
1923 	/* Abort if socket has reported problems. */
1924 	if (so->so_error) {
1925 		if (sb->sb_cc > 0)
1926 			goto deliver;
1927 		if (oresid > uio->uio_resid)
1928 			goto out;
1929 		error = so->so_error;
1930 		if (!(flags & MSG_PEEK))
1931 			so->so_error = 0;
1932 		goto out;
1933 	}
1934 
1935 	/* Door is closed.  Deliver what is left, if any. */
1936 	if (sb->sb_state & SBS_CANTRCVMORE) {
1937 		if (sb->sb_cc > 0)
1938 			goto deliver;
1939 		else
1940 			goto out;
1941 	}
1942 
1943 	/* Socket buffer got some data that we shall deliver now. */
1944 	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
1945 	    ((so->so_state & SS_NBIO) ||
1946 	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
1947 	     sb->sb_cc >= sb->sb_lowat ||
1948 	     sb->sb_cc >= uio->uio_resid ||
1949 	     sb->sb_cc >= sb->sb_hiwat)) {
1950 		goto deliver;
1951 	}
1952 
1953 	/* On MSG_WAITALL we must wait until all data or error arrives. */
1954 	if ((flags & MSG_WAITALL) &&
1955 	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
1956 		goto deliver;
1957 
1958 	/*
1959 	 * Wait and block until (more) data comes in.
1960 	 * NB: Drops the sockbuf lock during wait.
1961 	 */
1962 	error = sbwait(sb);
1963 	if (error)
1964 		goto out;
1965 	goto restart;
1966 
1967 deliver:
1968 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1969 	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
1970 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
1971 
1972 	/* Statistics. */
1973 	if (uio->uio_td)
1974 		uio->uio_td->td_ru.ru_msgrcv++;
1975 
1976 	/* Fill uio until full or current end of socket buffer is reached. */
1977 	len = min(uio->uio_resid, sb->sb_cc);
1978 	if (mp0 != NULL) {
1979 		/* Dequeue as many mbufs as possible. */
1980 		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
1981 			for (*mp0 = m = sb->sb_mb;
1982 			     m != NULL && m->m_len <= len;
1983 			     m = m->m_next) {
1984 				len -= m->m_len;
1985 				uio->uio_resid -= m->m_len;
1986 				sbfree(sb, m);
1987 				n = m;
1988 			}
1989 			sb->sb_mb = m;
1990 			if (sb->sb_mb == NULL)
1991 				SB_EMPTY_FIXUP(sb);
1992 			n->m_next = NULL;
1993 		}
1994 		/* Copy the remainder. */
1995 		if (len > 0) {
1996 			KASSERT(sb->sb_mb != NULL,
1997 			    ("%s: len > 0 && sb->sb_mb empty", __func__));
1998 
1999 			m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
2000 			if (m == NULL)
2001 				len = 0;	/* Don't flush data from sockbuf. */
2002 			else
2003 				uio->uio_resid -= m->m_len;
2004 			if (*mp0 != NULL)
2005 				n->m_next = m;
2006 			else
2007 				*mp0 = m;
2008 			if (*mp0 == NULL) {
2009 				error = ENOBUFS;
2010 				goto out;
2011 			}
2012 		}
2013 	} else {
2014 		/* NB: Must unlock socket buffer as uiomove may sleep. */
2015 		SOCKBUF_UNLOCK(sb);
2016 		error = m_mbuftouio(uio, sb->sb_mb, len);
2017 		SOCKBUF_LOCK(sb);
2018 		if (error)
2019 			goto out;
2020 	}
2021 	SBLASTRECORDCHK(sb);
2022 	SBLASTMBUFCHK(sb);
2023 
2024 	/*
2025 	 * Remove the delivered data from the socket buffer unless we
2026 	 * were only peeking.
2027 	 */
2028 	if (!(flags & MSG_PEEK)) {
2029 		if (len > 0)
2030 			sbdrop_locked(sb, len);
2031 
2032 		/* Notify protocol that we drained some data. */
2033 		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2034 		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2035 		     !(flags & MSG_SOCALLBCK))) {
2036 			SOCKBUF_UNLOCK(sb);
2037 			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2038 			SOCKBUF_LOCK(sb);
2039 		}
2040 	}
2041 
2042 	/*
2043 	 * For MSG_WAITALL we may have to loop again and wait for
2044 	 * more data to come in.
2045 	 */
2046 	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2047 		goto restart;
2048 out:
2049 	SOCKBUF_LOCK_ASSERT(sb);
2050 	SBLASTRECORDCHK(sb);
2051 	SBLASTMBUFCHK(sb);
2052 	SOCKBUF_UNLOCK(sb);
2053 	sbunlock(sb);
2054 	return (error);
2055 }
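
/*
 * Illustrative sketch (not part of the original file): a stream protocol
 * can elect this optimized path by pointing its pr_usrreqs at
 * soreceive_stream().  The protocol name 'foo' below is hypothetical.
 *
 *	struct pr_usrreqs foo_usrreqs = {
 *		...
 *		.pru_soreceive =	soreceive_stream,
 *		...
 *	};
 */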
2056 
2057 /*
2058  * Optimized version of soreceive() for simple datagram cases from userspace.
2059  * Unlike in the stream case, we're able to drop a datagram if copyout()
2060  * fails, and because we handle datagrams atomically, we don't need to use a
2061  * sleep lock to prevent I/O interlacing.
2062  */
2063 int
2064 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2065     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2066 {
2067 	struct mbuf *m, *m2;
2068 	int flags, len, error;
2069 	struct protosw *pr = so->so_proto;
2070 	struct mbuf *nextrecord;
2071 
2072 	if (psa != NULL)
2073 		*psa = NULL;
2074 	if (controlp != NULL)
2075 		*controlp = NULL;
2076 	if (flagsp != NULL)
2077 		flags = *flagsp &~ MSG_EOR;
2078 	else
2079 		flags = 0;
2080 
2081 	/*
2082 	 * For any complicated cases, fall back to the full
2083 	 * soreceive_generic().
2084 	 */
2085 	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2086 		return (soreceive_generic(so, psa, uio, mp0, controlp,
2087 		    flagsp));
2088 
2089 	/*
2090 	 * Enforce restrictions on use.
2091 	 */
2092 	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2093 	    ("soreceive_dgram: wantrcvd"));
2094 	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2095 	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2096 	    ("soreceive_dgram: SBS_RCVATMARK"));
2097 	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2098 	    ("soreceive_dgram: PR_CONNREQUIRED"));
2099 
2100 	/*
2101 	 * Loop blocking while waiting for a datagram.
2102 	 */
2103 	SOCKBUF_LOCK(&so->so_rcv);
2104 	while ((m = so->so_rcv.sb_mb) == NULL) {
2105 		KASSERT(so->so_rcv.sb_cc == 0,
2106 		    ("soreceive_dgram: sb_mb NULL but sb_cc %u",
2107 		    so->so_rcv.sb_cc));
2108 		if (so->so_error) {
2109 			error = so->so_error;
2110 			so->so_error = 0;
2111 			SOCKBUF_UNLOCK(&so->so_rcv);
2112 			return (error);
2113 		}
2114 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2115 		    uio->uio_resid == 0) {
2116 			SOCKBUF_UNLOCK(&so->so_rcv);
2117 			return (0);
2118 		}
2119 		if ((so->so_state & SS_NBIO) ||
2120 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2121 			SOCKBUF_UNLOCK(&so->so_rcv);
2122 			return (EWOULDBLOCK);
2123 		}
2124 		SBLASTRECORDCHK(&so->so_rcv);
2125 		SBLASTMBUFCHK(&so->so_rcv);
2126 		error = sbwait(&so->so_rcv);
2127 		if (error) {
2128 			SOCKBUF_UNLOCK(&so->so_rcv);
2129 			return (error);
2130 		}
2131 	}
2132 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2133 
2134 	if (uio->uio_td)
2135 		uio->uio_td->td_ru.ru_msgrcv++;
2136 	SBLASTRECORDCHK(&so->so_rcv);
2137 	SBLASTMBUFCHK(&so->so_rcv);
2138 	nextrecord = m->m_nextpkt;
2139 	if (nextrecord == NULL) {
2140 		KASSERT(so->so_rcv.sb_lastrecord == m,
2141 		    ("soreceive_dgram: lastrecord != m"));
2142 	}
2143 
2144 	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2145 	    ("soreceive_dgram: m_nextpkt != nextrecord"));
2146 
2147 	/*
2148 	 * Pull 'm' and its chain off the front of the packet queue.
2149 	 */
2150 	so->so_rcv.sb_mb = NULL;
2151 	sockbuf_pushsync(&so->so_rcv, nextrecord);
2152 
2153 	/*
2154 	 * Walk 'm's chain and free that many bytes from the socket buffer.
2155 	 */
2156 	for (m2 = m; m2 != NULL; m2 = m2->m_next)
2157 		sbfree(&so->so_rcv, m2);
2158 
2159 	/*
2160 	 * Do a few last checks before we let go of the lock.
2161 	 */
2162 	SBLASTRECORDCHK(&so->so_rcv);
2163 	SBLASTMBUFCHK(&so->so_rcv);
2164 	SOCKBUF_UNLOCK(&so->so_rcv);
2165 
2166 	if (pr->pr_flags & PR_ADDR) {
2167 		KASSERT(m->m_type == MT_SONAME,
2168 		    ("m->m_type == %d", m->m_type));
2169 		if (psa != NULL)
2170 			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
2171 			    M_NOWAIT);
2172 		m = m_free(m);
2173 	}
2174 	if (m == NULL) {
2175 		/* XXXRW: Can this happen? */
2176 		return (0);
2177 	}
2178 
2179 	/*
2180 	 * Packet to copyout() is now in 'm' and it is disconnected from the
2181 	 * queue.
2182 	 *
2183 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
2184 	 * in the first mbuf chain on the socket buffer.  We call into the
2185 	 * protocol to perform externalization (or freeing if controlp ==
2186 	 * NULL).
2187 	 */
2188 	if (m->m_type == MT_CONTROL) {
2189 		struct mbuf *cm = NULL, *cmn;
2190 		struct mbuf **cme = &cm;
2191 
2192 		do {
2193 			m2 = m->m_next;
2194 			m->m_next = NULL;
2195 			*cme = m;
2196 			cme = &(*cme)->m_next;
2197 			m = m2;
2198 		} while (m != NULL && m->m_type == MT_CONTROL);
2199 		while (cm != NULL) {
2200 			cmn = cm->m_next;
2201 			cm->m_next = NULL;
2202 			if (pr->pr_domain->dom_externalize != NULL) {
2203 				error = (*pr->pr_domain->dom_externalize)
2204 				    (cm, controlp);
2205 			} else if (controlp != NULL)
2206 				*controlp = cm;
2207 			else
2208 				m_freem(cm);
2209 			if (controlp != NULL) {
2210 				while (*controlp != NULL)
2211 					controlp = &(*controlp)->m_next;
2212 			}
2213 			cm = cmn;
2214 		}
2215 	}
2216 	KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2217 
2218 	while (m != NULL && uio->uio_resid > 0) {
2219 		len = uio->uio_resid;
2220 		if (len > m->m_len)
2221 			len = m->m_len;
2222 		error = uiomove(mtod(m, char *), (int)len, uio);
2223 		if (error) {
2224 			m_freem(m);
2225 			return (error);
2226 		}
2227 		m = m_free(m);
2228 	}
2229 	if (m != NULL)
2230 		flags |= MSG_TRUNC;
2231 	m_freem(m);
2232 	if (flagsp != NULL)
2233 		*flagsp |= flags;
2234 	return (0);
2235 }
2236 
2237 int
2238 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2239     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2240 {
2241 
2242 	return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2243 	    controlp, flagsp));
2244 }
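
/*
 * Illustrative sketch (not part of the original file): a kernel consumer
 * receiving up to 'buflen' bytes from 'so' into a kernel buffer 'buf';
 * 'buf' and 'buflen' are assumptions of the example.
 *
 *	struct uio auio;
 *	struct iovec aiov;
 *	int error, flags = 0;
 *
 *	aiov.iov_base = buf;
 *	aiov.iov_len = buflen;
 *	auio.uio_iov = &aiov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = buflen;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_READ;
 *	auio.uio_td = curthread;
 *	error = soreceive(so, NULL, &auio, NULL, NULL, &flags);
 */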
2245 
2246 int
2247 soshutdown(struct socket *so, int how)
2248 {
2249 	struct protosw *pr = so->so_proto;
2250 	int error;
2251 
2252 	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2253 		return (EINVAL);
2254 	if (pr->pr_usrreqs->pru_flush != NULL) {
2255 		(*pr->pr_usrreqs->pru_flush)(so, how);
2256 	}
2257 	if (how != SHUT_WR)
2258 		sorflush(so);
2259 	if (how != SHUT_RD) {
2260 		CURVNET_SET(so->so_vnet);
2261 		error = (*pr->pr_usrreqs->pru_shutdown)(so);
2262 		CURVNET_RESTORE();
2263 		return (error);
2264 	}
2265 	return (0);
2266 }
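
/*
 * Illustrative sketch (not part of the original file): a kernel consumer
 * that has written a complete request half-closes its send side, leaving
 * the receive side open for the peer's response.
 *
 *	error = soshutdown(so, SHUT_WR);
 */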
2267 
2268 void
2269 sorflush(struct socket *so)
2270 {
2271 	struct sockbuf *sb = &so->so_rcv;
2272 	struct protosw *pr = so->so_proto;
2273 	struct sockbuf asb;
2274 
2275 	/*
2276 	 * In order to avoid calling dom_dispose with the socket buffer mutex
2277 	 * held, and in order to generally avoid holding the lock for a long
2278 	 * time, we make a copy of the socket buffer and clear the original
2279 	 * (except locks, state).  The new socket buffer copy won't have
2280 	 * initialized locks so we can only call routines that won't use or
2281 	 * assert those locks.
2282 	 *
2283 	 * Dislodge threads currently blocked in receive and wait to acquire
2284 	 * a lock against other simultaneous readers before clearing the
2285 	 * socket buffer.  Don't let our acquire be interrupted by a signal
2286 	 * despite any existing socket disposition on interruptible waiting.
2287 	 */
2288 	CURVNET_SET(so->so_vnet);
2289 	socantrcvmore(so);
2290 	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2291 
2292 	/*
2293 	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2294 	 * and mutex data unchanged.
2295 	 */
2296 	SOCKBUF_LOCK(sb);
2297 	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2298 	bcopy(&sb->sb_startzero, &asb.sb_startzero,
2299 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2300 	bzero(&sb->sb_startzero,
2301 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2302 	SOCKBUF_UNLOCK(sb);
2303 	sbunlock(sb);
2304 
2305 	/*
2306 	 * Dispose of special rights and flush the socket buffer.  Don't call
2307 	 * any unsafe routines (that rely on locks being initialized) on asb.
2308 	 */
2309 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2310 		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
2311 	sbrelease_internal(&asb, so);
2312 	CURVNET_RESTORE();
2313 }
2314 
2315 /*
2316  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2317  * additional variant to handle the case where the option value needs to be
2318  * some kind of integer, but not a specific size.  In addition to their use
2319  * here, these functions are also called by the protocol-level pr_ctloutput()
2320  * routines.
2321  */
2322 int
2323 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2324 {
2325 	size_t	valsize;
2326 
2327 	/*
2328 	 * If the user gives us more than we wanted, we ignore it, but if we
2329 	 * don't get the minimum length the caller wants, we return EINVAL.
2330 	 * On success, sopt->sopt_valsize is set to however much we actually
2331 	 * retrieved.
2332 	 */
2333 	if ((valsize = sopt->sopt_valsize) < minlen)
2334 		return EINVAL;
2335 	if (valsize > len)
2336 		sopt->sopt_valsize = valsize = len;
2337 
2338 	if (sopt->sopt_td != NULL)
2339 		return (copyin(sopt->sopt_val, buf, valsize));
2340 
2341 	bcopy(sopt->sopt_val, buf, valsize);
2342 	return (0);
2343 }
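
/*
 * Illustrative sketch (not part of the original file): typical use of
 * sooptcopyin() in a protocol's pr_ctloutput() SOPT_SET path for an
 * int-sized option; validating 'optval' is left to the protocol.
 *
 *	int error, optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
 *	if (error)
 *		return (error);
 */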
2344 
2345 /*
2346  * Kernel version of setsockopt(2).
2347  *
2348  * XXX: optlen is size_t, not socklen_t
2349  */
2350 int
2351 so_setsockopt(struct socket *so, int level, int optname, void *optval,
2352     size_t optlen)
2353 {
2354 	struct sockopt sopt;
2355 
2356 	sopt.sopt_level = level;
2357 	sopt.sopt_name = optname;
2358 	sopt.sopt_dir = SOPT_SET;
2359 	sopt.sopt_val = optval;
2360 	sopt.sopt_valsize = optlen;
2361 	sopt.sopt_td = NULL;
2362 	return (sosetopt(so, &sopt));
2363 }
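
/*
 * Illustrative sketch (not part of the original file): an in-kernel
 * consumer enabling keep-alives on a socket it owns.
 *
 *	int on = 1;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof on);
 */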
2364 
2365 int
2366 sosetopt(struct socket *so, struct sockopt *sopt)
2367 {
2368 	int	error, optval;
2369 	struct	linger l;
2370 	struct	timeval tv;
2371 	u_long  val;
2372 #ifdef MAC
2373 	struct mac extmac;
2374 #endif
2375 
2376 	error = 0;
2377 	if (sopt->sopt_level != SOL_SOCKET) {
2378 		if (so->so_proto && so->so_proto->pr_ctloutput)
2379 			return ((*so->so_proto->pr_ctloutput)
2380 				  (so, sopt));
2381 		error = ENOPROTOOPT;
2382 	} else {
2383 		switch (sopt->sopt_name) {
2384 #ifdef INET
2385 		case SO_ACCEPTFILTER:
2386 			error = do_setopt_accept_filter(so, sopt);
2387 			if (error)
2388 				goto bad;
2389 			break;
2390 #endif
2391 		case SO_LINGER:
2392 			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2393 			if (error)
2394 				goto bad;
2395 
2396 			SOCK_LOCK(so);
2397 			so->so_linger = l.l_linger;
2398 			if (l.l_onoff)
2399 				so->so_options |= SO_LINGER;
2400 			else
2401 				so->so_options &= ~SO_LINGER;
2402 			SOCK_UNLOCK(so);
2403 			break;
2404 
2405 		case SO_DEBUG:
2406 		case SO_KEEPALIVE:
2407 		case SO_DONTROUTE:
2408 		case SO_USELOOPBACK:
2409 		case SO_BROADCAST:
2410 		case SO_REUSEADDR:
2411 		case SO_REUSEPORT:
2412 		case SO_OOBINLINE:
2413 		case SO_TIMESTAMP:
2414 		case SO_BINTIME:
2415 		case SO_NOSIGPIPE:
2416 		case SO_NO_DDP:
2417 		case SO_NO_OFFLOAD:
2418 			error = sooptcopyin(sopt, &optval, sizeof optval,
2419 					    sizeof optval);
2420 			if (error)
2421 				goto bad;
2422 			SOCK_LOCK(so);
2423 			if (optval)
2424 				so->so_options |= sopt->sopt_name;
2425 			else
2426 				so->so_options &= ~sopt->sopt_name;
2427 			SOCK_UNLOCK(so);
2428 			break;
2429 
2430 		case SO_SETFIB:
2431 			error = sooptcopyin(sopt, &optval, sizeof optval,
2432 					    sizeof optval);
			if (error)
				goto bad;
2433 			if (optval < 1 || optval > rt_numfibs) {
2434 				error = EINVAL;
2435 				goto bad;
2436 			}
2437 			if ((so->so_proto->pr_domain->dom_family == PF_INET) ||
2438 			    (so->so_proto->pr_domain->dom_family == PF_ROUTE)) {
2439 				so->so_fibnum = optval;
2440 				/* Note: ignore error */
2441 				if (so->so_proto && so->so_proto->pr_ctloutput)
2442 					(*so->so_proto->pr_ctloutput)(so, sopt);
2443 			} else {
2444 				so->so_fibnum = 0;
2445 			}
2446 			break;
2447 		case SO_SNDBUF:
2448 		case SO_RCVBUF:
2449 		case SO_SNDLOWAT:
2450 		case SO_RCVLOWAT:
2451 			error = sooptcopyin(sopt, &optval, sizeof optval,
2452 					    sizeof optval);
2453 			if (error)
2454 				goto bad;
2455 
2456 			/*
2457 			 * Values < 1 make no sense for any of these options,
2458 			 * so disallow them.
2459 			 */
2460 			if (optval < 1) {
2461 				error = EINVAL;
2462 				goto bad;
2463 			}
2464 
2465 			switch (sopt->sopt_name) {
2466 			case SO_SNDBUF:
2467 			case SO_RCVBUF:
2468 				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2469 				    &so->so_snd : &so->so_rcv, (u_long)optval,
2470 				    so, curthread) == 0) {
2471 					error = ENOBUFS;
2472 					goto bad;
2473 				}
2474 				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2475 				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2476 				break;
2477 
2478 			/*
2479 			 * Make sure the low-water is never greater than the
2480 			 * high-water.
2481 			 */
2482 			case SO_SNDLOWAT:
2483 				SOCKBUF_LOCK(&so->so_snd);
2484 				so->so_snd.sb_lowat =
2485 				    (optval > so->so_snd.sb_hiwat) ?
2486 				    so->so_snd.sb_hiwat : optval;
2487 				SOCKBUF_UNLOCK(&so->so_snd);
2488 				break;
2489 			case SO_RCVLOWAT:
2490 				SOCKBUF_LOCK(&so->so_rcv);
2491 				so->so_rcv.sb_lowat =
2492 				    (optval > so->so_rcv.sb_hiwat) ?
2493 				    so->so_rcv.sb_hiwat : optval;
2494 				SOCKBUF_UNLOCK(&so->so_rcv);
2495 				break;
2496 			}
2497 			break;
2498 
2499 		case SO_SNDTIMEO:
2500 		case SO_RCVTIMEO:
2501 #ifdef COMPAT_IA32
2502 			if (SV_CURPROC_FLAG(SV_ILP32)) {
2503 				struct timeval32 tv32;
2504 
2505 				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2506 				    sizeof tv32);
2507 				CP(tv32, tv, tv_sec);
2508 				CP(tv32, tv, tv_usec);
2509 			} else
2510 #endif
2511 				error = sooptcopyin(sopt, &tv, sizeof tv,
2512 				    sizeof tv);
2513 			if (error)
2514 				goto bad;
2515 
2516 			/* assert(hz > 0); */
2517 			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2518 			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2519 				error = EDOM;
2520 				goto bad;
2521 			}
2522 			/* assert(tick > 0); */
2523 			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2524 			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2525 			if (val > INT_MAX) {
2526 				error = EDOM;
2527 				goto bad;
2528 			}
2529 			if (val == 0 && tv.tv_usec != 0)
2530 				val = 1;
2531 
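			/*
			 * Worked example (illustrative): with hz = 1000 and
			 * tick = 1000, a timeout of tv = { 2, 500000 }
			 * (2.5 seconds) converts to val = 2 * 1000 +
			 * 500000 / 1000 = 2500 ticks.
			 */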
2532 			switch (sopt->sopt_name) {
2533 			case SO_SNDTIMEO:
2534 				so->so_snd.sb_timeo = val;
2535 				break;
2536 			case SO_RCVTIMEO:
2537 				so->so_rcv.sb_timeo = val;
2538 				break;
2539 			}
2540 			break;
2541 
2542 		case SO_LABEL:
2543 #ifdef MAC
2544 			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2545 			    sizeof extmac);
2546 			if (error)
2547 				goto bad;
2548 			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2549 			    so, &extmac);
2550 #else
2551 			error = EOPNOTSUPP;
2552 #endif
2553 			break;
2554 
2555 		default:
2556 			error = ENOPROTOOPT;
2557 			break;
2558 		}
2559 		if (error == 0 && so->so_proto != NULL &&
2560 		    so->so_proto->pr_ctloutput != NULL) {
2561 			(void) ((*so->so_proto->pr_ctloutput)
2562 				  (so, sopt));
2563 		}
2564 	}
2565 bad:
2566 	return (error);
2567 }
2568 
2569 /*
2570  * Helper routine for getsockopt.
2571  */
2572 int
2573 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2574 {
2575 	int	error;
2576 	size_t	valsize;
2577 
2578 	error = 0;
2579 
2580 	/*
2581 	 * Documented get behavior is that we always return a value, possibly
2582 	 * truncated to fit in the user's buffer.  Traditional behavior is
2583 	 * that we always tell the user precisely how much we copied, rather
2584 	 * than something useful like the total amount we had available for
2585 	 * her.  Note that this interface is not idempotent; the entire
2586 	 * answer must be generated ahead of time.
2587 	 */
2588 	valsize = min(len, sopt->sopt_valsize);
2589 	sopt->sopt_valsize = valsize;
2590 	if (sopt->sopt_val != NULL) {
2591 		if (sopt->sopt_td != NULL)
2592 			error = copyout(buf, sopt->sopt_val, valsize);
2593 		else
2594 			bcopy(buf, sopt->sopt_val, valsize);
2595 	}
2596 	return (error);
2597 }
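
/*
 * Illustrative sketch (not part of the original file): the matching
 * SOPT_GET pattern in a protocol's pr_ctloutput(); how 'optval' is
 * computed is protocol-specific (the value below is hypothetical).
 *
 *	int optval;
 *
 *	optval = 1;
 *	error = sooptcopyout(sopt, &optval, sizeof optval);
 */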
2598 
2599 int
2600 sogetopt(struct socket *so, struct sockopt *sopt)
2601 {
2602 	int	error, optval;
2603 	struct	linger l;
2604 	struct	timeval tv;
2605 #ifdef MAC
2606 	struct mac extmac;
2607 #endif
2608 
2609 	error = 0;
2610 	if (sopt->sopt_level != SOL_SOCKET) {
2611 		if (so->so_proto && so->so_proto->pr_ctloutput) {
2612 			return ((*so->so_proto->pr_ctloutput)
2613 				  (so, sopt));
2614 		} else
2615 			return (ENOPROTOOPT);
2616 	} else {
2617 		switch (sopt->sopt_name) {
2618 #ifdef INET
2619 		case SO_ACCEPTFILTER:
2620 			error = do_getopt_accept_filter(so, sopt);
2621 			break;
2622 #endif
2623 		case SO_LINGER:
2624 			SOCK_LOCK(so);
2625 			l.l_onoff = so->so_options & SO_LINGER;
2626 			l.l_linger = so->so_linger;
2627 			SOCK_UNLOCK(so);
2628 			error = sooptcopyout(sopt, &l, sizeof l);
2629 			break;
2630 
2631 		case SO_USELOOPBACK:
2632 		case SO_DONTROUTE:
2633 		case SO_DEBUG:
2634 		case SO_KEEPALIVE:
2635 		case SO_REUSEADDR:
2636 		case SO_REUSEPORT:
2637 		case SO_BROADCAST:
2638 		case SO_OOBINLINE:
2639 		case SO_ACCEPTCONN:
2640 		case SO_TIMESTAMP:
2641 		case SO_BINTIME:
2642 		case SO_NOSIGPIPE:
2643 			optval = so->so_options & sopt->sopt_name;
2644 integer:
2645 			error = sooptcopyout(sopt, &optval, sizeof optval);
2646 			break;
2647 
2648 		case SO_TYPE:
2649 			optval = so->so_type;
2650 			goto integer;
2651 
2652 		case SO_ERROR:
2653 			SOCK_LOCK(so);
2654 			optval = so->so_error;
2655 			so->so_error = 0;
2656 			SOCK_UNLOCK(so);
2657 			goto integer;
2658 
2659 		case SO_SNDBUF:
2660 			optval = so->so_snd.sb_hiwat;
2661 			goto integer;
2662 
2663 		case SO_RCVBUF:
2664 			optval = so->so_rcv.sb_hiwat;
2665 			goto integer;
2666 
2667 		case SO_SNDLOWAT:
2668 			optval = so->so_snd.sb_lowat;
2669 			goto integer;
2670 
2671 		case SO_RCVLOWAT:
2672 			optval = so->so_rcv.sb_lowat;
2673 			goto integer;
2674 
2675 		case SO_SNDTIMEO:
2676 		case SO_RCVTIMEO:
2677 			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2678 				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2679 
2680 			tv.tv_sec = optval / hz;
2681 			tv.tv_usec = (optval % hz) * tick;
2682 #ifdef COMPAT_IA32
2683 			if (SV_CURPROC_FLAG(SV_ILP32)) {
2684 				struct timeval32 tv32;
2685 
2686 				CP(tv, tv32, tv_sec);
2687 				CP(tv, tv32, tv_usec);
2688 				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2689 			} else
2690 #endif
2691 				error = sooptcopyout(sopt, &tv, sizeof tv);
2692 			break;
2693 
2694 		case SO_LABEL:
2695 #ifdef MAC
2696 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2697 			    sizeof(extmac));
2698 			if (error)
2699 				return (error);
2700 			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2701 			    so, &extmac);
2702 			if (error)
2703 				return (error);
2704 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2705 #else
2706 			error = EOPNOTSUPP;
2707 #endif
2708 			break;
2709 
2710 		case SO_PEERLABEL:
2711 #ifdef MAC
2712 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2713 			    sizeof(extmac));
2714 			if (error)
2715 				return (error);
2716 			error = mac_getsockopt_peerlabel(
2717 			    sopt->sopt_td->td_ucred, so, &extmac);
2718 			if (error)
2719 				return (error);
2720 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2721 #else
2722 			error = EOPNOTSUPP;
2723 #endif
2724 			break;
2725 
2726 		case SO_LISTENQLIMIT:
2727 			optval = so->so_qlimit;
2728 			goto integer;
2729 
2730 		case SO_LISTENQLEN:
2731 			optval = so->so_qlen;
2732 			goto integer;
2733 
2734 		case SO_LISTENINCQLEN:
2735 			optval = so->so_incqlen;
2736 			goto integer;
2737 
2738 		default:
2739 			error = ENOPROTOOPT;
2740 			break;
2741 		}
2742 		return (error);
2743 	}
2744 }
2745 
2746 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2747 int
2748 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2749 {
2750 	struct mbuf *m, *m_prev;
2751 	int sopt_size = sopt->sopt_valsize;
2752 
2753 	MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2754 	if (m == NULL)
2755 		return ENOBUFS;
2756 	if (sopt_size > MLEN) {
2757 		MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
2758 		if ((m->m_flags & M_EXT) == 0) {
2759 			m_free(m);
2760 			return ENOBUFS;
2761 		}
2762 		m->m_len = min(MCLBYTES, sopt_size);
2763 	} else {
2764 		m->m_len = min(MLEN, sopt_size);
2765 	}
2766 	sopt_size -= m->m_len;
2767 	*mp = m;
2768 	m_prev = m;
2769 
2770 	while (sopt_size) {
2771 		MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
2772 		if (m == NULL) {
2773 			m_freem(*mp);
2774 			return ENOBUFS;
2775 		}
2776 		if (sopt_size > MLEN) {
2777 			MCLGET(m, sopt->sopt_td != NULL ? M_WAIT :
2778 			    M_DONTWAIT);
2779 			if ((m->m_flags & M_EXT) == 0) {
2780 				m_freem(m);
2781 				m_freem(*mp);
2782 				return ENOBUFS;
2783 			}
2784 			m->m_len = min(MCLBYTES, sopt_size);
2785 		} else {
2786 			m->m_len = min(MLEN, sopt_size);
2787 		}
2788 		sopt_size -= m->m_len;
2789 		m_prev->m_next = m;
2790 		m_prev = m;
2791 	}
2792 	return (0);
2793 }
2794 
2795 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2796 int
2797 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2798 {
2799 	struct mbuf *m0 = m;
2800 
2801 	if (sopt->sopt_val == NULL)
2802 		return (0);
2803 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2804 		if (sopt->sopt_td != NULL) {
2805 			int error;
2806 
2807 			error = copyin(sopt->sopt_val, mtod(m, char *),
2808 				       m->m_len);
2809 			if (error != 0) {
2810 				m_freem(m0);
2811 				return (error);
2812 			}
2813 		} else
2814 			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2815 		sopt->sopt_valsize -= m->m_len;
2816 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2817 		m = m->m_next;
2818 	}
2819 	if (m != NULL) /* the chain from soopt_getm() should be large enough */
2820 		panic("soopt_mcopyin");
2821 	return (0);
2822 }
2823 
2824 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2825 int
2826 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2827 {
2828 	struct mbuf *m0 = m;
2829 	size_t valsize = 0;
2830 
2831 	if (sopt->sopt_val == NULL)
2832 		return (0);
2833 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2834 		if (sopt->sopt_td != NULL) {
2835 			int error;
2836 
2837 			error = copyout(mtod(m, char *), sopt->sopt_val,
2838 				       m->m_len);
2839 			if (error != 0) {
2840 				m_freem(m0);
2841 				return (error);
2842 			}
2843 		} else
2844 			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2845 		sopt->sopt_valsize -= m->m_len;
2846 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2847 		valsize += m->m_len;
2848 		m = m->m_next;
2849 	}
2850 	if (m != NULL) {
2851 		/* the caller should have supplied a large enough buffer */
2852 		m_freem(m0);
2853 		return (EINVAL);
2854 	}
2855 	sopt->sopt_valsize = valsize;
2856 	return (0);
2857 }
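
/*
 * Illustrative sketch (not part of the original file): preparing option
 * data as an mbuf chain for one of the legacy mbuf-based routines.  Note
 * that soopt_mcopyin() advances sopt->sopt_val and consumes
 * sopt->sopt_valsize as it fills the chain.
 *
 *	struct mbuf *m;
 *	int error;
 *
 *	error = soopt_getm(sopt, &m);
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);
 */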
2858 
2859 /*
2860  * sohasoutofband(): protocol notifies socket layer of the arrival of new
2861  * out-of-band data, which will then notify socket consumers.
2862  */
2863 void
2864 sohasoutofband(struct socket *so)
2865 {
2866 
2867 	if (so->so_sigio != NULL)
2868 		pgsigio(&so->so_sigio, SIGURG, 0);
2869 	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2870 }
2871 
2872 int
2873 sopoll(struct socket *so, int events, struct ucred *active_cred,
2874     struct thread *td)
2875 {
2876 
2877 	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2878 	    td));
2879 }
2880 
2881 int
2882 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2883     struct thread *td)
2884 {
2885 	int revents = 0;
2886 
2887 	SOCKBUF_LOCK(&so->so_snd);
2888 	SOCKBUF_LOCK(&so->so_rcv);
2889 	if (events & (POLLIN | POLLRDNORM))
2890 		if (soreadabledata(so))
2891 			revents |= events & (POLLIN | POLLRDNORM);
2892 
2893 	if (events & (POLLOUT | POLLWRNORM))
2894 		if (sowriteable(so))
2895 			revents |= events & (POLLOUT | POLLWRNORM);
2896 
2897 	if (events & (POLLPRI | POLLRDBAND))
2898 		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2899 			revents |= events & (POLLPRI | POLLRDBAND);
2900 
2901 	if ((events & POLLINIGNEOF) == 0) {
2902 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2903 			revents |= events & (POLLIN | POLLRDNORM);
2904 			if (so->so_snd.sb_state & SBS_CANTSENDMORE)
2905 				revents |= POLLHUP;
2906 		}
2907 	}
2908 
2909 	if (revents == 0) {
2910 		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2911 			selrecord(td, &so->so_rcv.sb_sel);
2912 			so->so_rcv.sb_flags |= SB_SEL;
2913 		}
2914 
2915 		if (events & (POLLOUT | POLLWRNORM)) {
2916 			selrecord(td, &so->so_snd.sb_sel);
2917 			so->so_snd.sb_flags |= SB_SEL;
2918 		}
2919 	}
2920 
2921 	SOCKBUF_UNLOCK(&so->so_rcv);
2922 	SOCKBUF_UNLOCK(&so->so_snd);
2923 	return (revents);
2924 }
2925 
2926 int
2927 soo_kqfilter(struct file *fp, struct knote *kn)
2928 {
2929 	struct socket *so = kn->kn_fp->f_data;
2930 	struct sockbuf *sb;
2931 
2932 	switch (kn->kn_filter) {
2933 	case EVFILT_READ:
2934 		if (so->so_options & SO_ACCEPTCONN)
2935 			kn->kn_fop = &solisten_filtops;
2936 		else
2937 			kn->kn_fop = &soread_filtops;
2938 		sb = &so->so_rcv;
2939 		break;
2940 	case EVFILT_WRITE:
2941 		kn->kn_fop = &sowrite_filtops;
2942 		sb = &so->so_snd;
2943 		break;
2944 	default:
2945 		return (EINVAL);
2946 	}
2947 
2948 	SOCKBUF_LOCK(sb);
2949 	knlist_add(&sb->sb_sel.si_note, kn, 1);
2950 	sb->sb_flags |= SB_KNOTE;
2951 	SOCKBUF_UNLOCK(sb);
2952 	return (0);
2953 }
2954 
2955 /*
2956  * Some routines that return EOPNOTSUPP for entry points that are not
2957  * supported by a protocol.  Fill in as needed.
2958  */
2959 int
2960 pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
2961 {
2962 
2963 	return EOPNOTSUPP;
2964 }
2965 
2966 int
2967 pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
2968 {
2969 
2970 	return EOPNOTSUPP;
2971 }
2972 
2973 int
2974 pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2975 {
2976 
2977 	return EOPNOTSUPP;
2978 }
2979 
2980 int
2981 pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2982 {
2983 
2984 	return EOPNOTSUPP;
2985 }
2986 
2987 int
2988 pru_connect2_notsupp(struct socket *so1, struct socket *so2)
2989 {
2990 
2991 	return EOPNOTSUPP;
2992 }
2993 
2994 int
2995 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
2996     struct ifnet *ifp, struct thread *td)
2997 {
2998 
2999 	return EOPNOTSUPP;
3000 }
3001 
3002 int
3003 pru_disconnect_notsupp(struct socket *so)
3004 {
3005 
3006 	return EOPNOTSUPP;
3007 }
3008 
3009 int
3010 pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
3011 {
3012 
3013 	return EOPNOTSUPP;
3014 }
3015 
3016 int
3017 pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
3018 {
3019 
3020 	return EOPNOTSUPP;
3021 }
3022 
3023 int
3024 pru_rcvd_notsupp(struct socket *so, int flags)
3025 {
3026 
3027 	return EOPNOTSUPP;
3028 }
3029 
3030 int
3031 pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
3032 {
3033 
3034 	return EOPNOTSUPP;
3035 }
3036 
3037 int
3038 pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
3039     struct sockaddr *addr, struct mbuf *control, struct thread *td)
3040 {
3041 
3042 	return EOPNOTSUPP;
3043 }
3044 
3045 /*
3046  * This isn't really a ``null'' operation, but it's the default one and
3047  * doesn't do anything destructive.
3048  */
3049 int
3050 pru_sense_null(struct socket *so, struct stat *sb)
3051 {
3052 
3053 	sb->st_blksize = so->so_snd.sb_hiwat;
3054 	return 0;
3055 }
3056 
3057 int
3058 pru_shutdown_notsupp(struct socket *so)
3059 {
3060 
3061 	return EOPNOTSUPP;
3062 }
3063 
3064 int
3065 pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
3066 {
3067 
3068 	return EOPNOTSUPP;
3069 }
3070 
3071 int
3072 pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
3073     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
3074 {
3075 
3076 	return EOPNOTSUPP;
3077 }
3078 
3079 int
3080 pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
3081     struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3082 {
3083 
3084 	return EOPNOTSUPP;
3085 }
3086 
3087 int
3088 pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
3089     struct thread *td)
3090 {
3091 
3092 	return EOPNOTSUPP;
3093 }
3094 
3095 static void
3096 filt_sordetach(struct knote *kn)
3097 {
3098 	struct socket *so = kn->kn_fp->f_data;
3099 
3100 	SOCKBUF_LOCK(&so->so_rcv);
3101 	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
3102 	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
3103 		so->so_rcv.sb_flags &= ~SB_KNOTE;
3104 	SOCKBUF_UNLOCK(&so->so_rcv);
3105 }
3106 
3107 /*ARGSUSED*/
3108 static int
3109 filt_soread(struct knote *kn, long hint)
3110 {
3111 	struct socket *so;
3112 
3113 	so = kn->kn_fp->f_data;
3114 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3115 
3116 	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3117 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3118 		kn->kn_flags |= EV_EOF;
3119 		kn->kn_fflags = so->so_error;
3120 		return (1);
3121 	} else if (so->so_error)	/* temporary udp error */
3122 		return (1);
3123 	else if (kn->kn_sfflags & NOTE_LOWAT)
3124 		return (kn->kn_data >= kn->kn_sdata);
3125 	else
3126 		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
3127 }
3128 
3129 static void
3130 filt_sowdetach(struct knote *kn)
3131 {
3132 	struct socket *so = kn->kn_fp->f_data;
3133 
3134 	SOCKBUF_LOCK(&so->so_snd);
3135 	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
3136 	if (knlist_empty(&so->so_snd.sb_sel.si_note))
3137 		so->so_snd.sb_flags &= ~SB_KNOTE;
3138 	SOCKBUF_UNLOCK(&so->so_snd);
3139 }
3140 
3141 /*ARGSUSED*/
3142 static int
3143 filt_sowrite(struct knote *kn, long hint)
3144 {
3145 	struct socket *so;
3146 
3147 	so = kn->kn_fp->f_data;
3148 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
3149 	kn->kn_data = sbspace(&so->so_snd);
3150 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3151 		kn->kn_flags |= EV_EOF;
3152 		kn->kn_fflags = so->so_error;
3153 		return (1);
3154 	} else if (so->so_error)	/* temporary udp error */
3155 		return (1);
3156 	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3157 	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
3158 		return (0);
3159 	else if (kn->kn_sfflags & NOTE_LOWAT)
3160 		return (kn->kn_data >= kn->kn_sdata);
3161 	else
3162 		return (kn->kn_data >= so->so_snd.sb_lowat);
3163 }
3164 
3165 /*ARGSUSED*/
3166 static int
3167 filt_solisten(struct knote *kn, long hint)
3168 {
3169 	struct socket *so = kn->kn_fp->f_data;
3170 
3171 	kn->kn_data = so->so_qlen;
3172 	return (!TAILQ_EMPTY(&so->so_comp));
3173 }
3174 
3175 int
3176 socheckuid(struct socket *so, uid_t uid)
3177 {
3178 
3179 	if (so == NULL)
3180 		return (EPERM);
3181 	if (so->so_cred->cr_uid != uid)
3182 		return (EPERM);
3183 	return (0);
3184 }
3185 
3186 static int
3187 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
3188 {
3189 	int error;
3190 	int val;
3191 
3192 	val = somaxconn;
3193 	error = sysctl_handle_int(oidp, &val, 0, req);
3194 	if (error || !req->newptr )
3195 		return (error);
3196 
3197 	if (val < 1 || val > USHRT_MAX)
3198 		return (EINVAL);
3199 
3200 	somaxconn = val;
3201 	return (0);
3202 }
3203 
3204 /*
3205  * These functions are used by protocols to notify the socket layer (and its
3206  * consumers) of state changes in the sockets driven by protocol-side events.
3207  */
3208 
3209 /*
3210  * Procedures to manipulate state flags of socket and do appropriate wakeups.
3211  *
3212  * Normal sequence from the active (originating) side is that
3213  * soisconnecting() is called during processing of connect() call, resulting
3214  * in an eventual call to soisconnected() if/when the connection is
3215  * established.  When the connection is torn down soisdisconnecting() is
3216  * called during processing of disconnect() call, and soisdisconnected() is
3217  * called when the connection to the peer is totally severed.  The semantics
3218  * of these routines are such that connectionless protocols can call
3219  * soisconnected() and soisdisconnected() only, bypassing the in-progress
3220  * calls when setting up a ``connection'' takes no time.
3221  *
3222  * From the passive side, a socket is created with two queues of sockets:
3223  * so_incomp for connections in progress and so_comp for connections already
3224  * made and awaiting user acceptance.  As a protocol is preparing incoming
3225  * connections, it creates a socket structure queued on so_incomp by calling
3226  * sonewconn().  When the connection is established, soisconnected() is
3227  * called, and transfers the socket structure to so_comp, making it available
3228  * to accept().
3229  *
3230  * If a socket is closed with sockets on either so_incomp or so_comp, these
3231  * sockets are dropped.
3232  *
3233  * If higher-level protocols are implemented in the kernel, the wakeups done
3234  * here will sometimes cause software-interrupt process scheduling.
3235  */
3236 void
3237 soisconnecting(struct socket *so)
3238 {
3239 
3240 	SOCK_LOCK(so);
3241 	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3242 	so->so_state |= SS_ISCONNECTING;
3243 	SOCK_UNLOCK(so);
3244 }
3245 
3246 void
3247 soisconnected(struct socket *so)
3248 {
3249 	struct socket *head;
3250 	int ret;
3251 
3252 restart:
3253 	ACCEPT_LOCK();
3254 	SOCK_LOCK(so);
3255 	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3256 	so->so_state |= SS_ISCONNECTED;
3257 	head = so->so_head;
3258 	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3259 		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3260 			SOCK_UNLOCK(so);
3261 			TAILQ_REMOVE(&head->so_incomp, so, so_list);
3262 			head->so_incqlen--;
3263 			so->so_qstate &= ~SQ_INCOMP;
3264 			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3265 			head->so_qlen++;
3266 			so->so_qstate |= SQ_COMP;
3267 			ACCEPT_UNLOCK();
3268 			sorwakeup(head);
3269 			wakeup_one(&head->so_timeo);
3270 		} else {
3271 			ACCEPT_UNLOCK();
3272 			soupcall_set(so, SO_RCV,
3273 			    head->so_accf->so_accept_filter->accf_callback,
3274 			    head->so_accf->so_accept_filter_arg);
3275 			so->so_options &= ~SO_ACCEPTFILTER;
3276 			ret = head->so_accf->so_accept_filter->accf_callback(so,
3277 			    head->so_accf->so_accept_filter_arg, M_DONTWAIT);
3278 			if (ret == SU_ISCONNECTED)
3279 				soupcall_clear(so, SO_RCV);
3280 			SOCK_UNLOCK(so);
3281 			if (ret == SU_ISCONNECTED)
3282 				goto restart;
3283 		}
3284 		return;
3285 	}
3286 	SOCK_UNLOCK(so);
3287 	ACCEPT_UNLOCK();
3288 	wakeup(&so->so_timeo);
3289 	sorwakeup(so);
3290 	sowwakeup(so);
3291 }
3292 
3293 void
3294 soisdisconnecting(struct socket *so)
3295 {
3296 
3297 	/*
3298 	 * Note: This code assumes that SOCK_LOCK(so) and
3299 	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3300 	 */
3301 	SOCKBUF_LOCK(&so->so_rcv);
3302 	so->so_state &= ~SS_ISCONNECTING;
3303 	so->so_state |= SS_ISDISCONNECTING;
3304 	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3305 	sorwakeup_locked(so);
3306 	SOCKBUF_LOCK(&so->so_snd);
3307 	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3308 	sowwakeup_locked(so);
3309 	wakeup(&so->so_timeo);
3310 }
3311 
3312 void
3313 soisdisconnected(struct socket *so)
3314 {
3315 
3316 	/*
3317 	 * Note: This code assumes that SOCK_LOCK(so) and
3318 	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3319 	 */
3320 	SOCKBUF_LOCK(&so->so_rcv);
3321 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3322 	so->so_state |= SS_ISDISCONNECTED;
3323 	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3324 	sorwakeup_locked(so);
3325 	SOCKBUF_LOCK(&so->so_snd);
3326 	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3327 	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3328 	sowwakeup_locked(so);
3329 	wakeup(&so->so_timeo);
3330 }
3331 
3332 /*
3333  * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3334  */
3335 struct sockaddr *
3336 sodupsockaddr(const struct sockaddr *sa, int mflags)
3337 {
3338 	struct sockaddr *sa2;
3339 
3340 	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3341 	if (sa2)
3342 		bcopy(sa, sa2, sa->sa_len);
3343 	return sa2;
3344 }
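
/*
 * Illustrative sketch (not part of the original file): duplicating a
 * sockaddr without sleeping, tolerating allocation failure.
 *
 *	sa2 = sodupsockaddr(sa, M_NOWAIT);
 *	if (sa2 == NULL)
 *		return (ENOMEM);
 */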
3345 
3346 /*
3347  * Register per-socket buffer upcalls.
3348  */
3349 void
3350 soupcall_set(struct socket *so, int which,
3351     int (*func)(struct socket *, void *, int), void *arg)
3352 {
3353 	struct sockbuf *sb;
3354 
3355 	switch (which) {
3356 	case SO_RCV:
3357 		sb = &so->so_rcv;
3358 		break;
3359 	case SO_SND:
3360 		sb = &so->so_snd;
3361 		break;
3362 	default:
3363 		panic("soupcall_set: bad which");
3364 	}
3365 	SOCKBUF_LOCK_ASSERT(sb);
3366 #if 0
3367 	/* XXX: accf_http actually wants to do this on purpose. */
3368 	KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
3369 #endif
3370 	sb->sb_upcall = func;
3371 	sb->sb_upcallarg = arg;
3372 	sb->sb_flags |= SB_UPCALL;
3373 }
3374 
3375 void
3376 soupcall_clear(struct socket *so, int which)
3377 {
3378 	struct sockbuf *sb;
3379 
3380 	switch (which) {
3381 	case SO_RCV:
3382 		sb = &so->so_rcv;
3383 		break;
3384 	case SO_SND:
3385 		sb = &so->so_snd;
3386 		break;
3387 	default:
3388 		panic("soupcall_clear: bad which");
3389 	}
3390 	SOCKBUF_LOCK_ASSERT(sb);
3391 	KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
3392 	sb->sb_upcall = NULL;
3393 	sb->sb_upcallarg = NULL;
3394 	sb->sb_flags &= ~SB_UPCALL;
3395 }
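
/*
 * Illustrative sketch (not part of the original file): a consumer
 * registering a receive upcall; 'my_upcall' and 'arg' are hypothetical.
 * The sockbuf lock must be held across soupcall_set(), which the
 * SOCKBUF_LOCK_ASSERT() above enforces.
 *
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	soupcall_set(so, SO_RCV, my_upcall, arg);
 *	SOCKBUF_UNLOCK(&so->so_rcv);
 */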
3396 
3397 /*
3398  * Create an external-format (``xsocket'') structure using the information in
3399  * the kernel-format socket structure pointed to by so.  This is done to
3400  * reduce the spew of irrelevant information over this interface, to isolate
3401  * user code from changes in the kernel structure, and potentially to provide
3402  * information-hiding if we decide that some of this information should be
3403  * hidden from users.
3404  */
3405 void
3406 sotoxsocket(struct socket *so, struct xsocket *xso)
3407 {
3408 
3409 	xso->xso_len = sizeof *xso;
3410 	xso->xso_so = so;
3411 	xso->so_type = so->so_type;
3412 	xso->so_options = so->so_options;
3413 	xso->so_linger = so->so_linger;
3414 	xso->so_state = so->so_state;
3415 	xso->so_pcb = so->so_pcb;
3416 	xso->xso_protocol = so->so_proto->pr_protocol;
3417 	xso->xso_family = so->so_proto->pr_domain->dom_family;
3418 	xso->so_qlen = so->so_qlen;
3419 	xso->so_incqlen = so->so_incqlen;
3420 	xso->so_qlimit = so->so_qlimit;
3421 	xso->so_timeo = so->so_timeo;
3422 	xso->so_error = so->so_error;
3423 	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3424 	xso->so_oobmark = so->so_oobmark;
3425 	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3426 	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3427 	xso->so_uid = so->so_cred->cr_uid;
3428 }
3429 
3431 /*
3432  * Socket accessor functions to provide external consumers with a safe
3433  * interface to socket state.
3434  */
3436 
3437 void
3438 so_listeners_apply_all(struct socket *so,
    void (*func)(struct socket *, void *), void *arg)
3439 {
3440 
3441 	TAILQ_FOREACH(so, &so->so_comp, so_list)
3442 		func(so, arg);
3443 }
3444 
3445 struct sockbuf *
3446 so_sockbuf_rcv(struct socket *so)
3447 {
3448 
3449 	return (&so->so_rcv);
3450 }
3451 
3452 struct sockbuf *
3453 so_sockbuf_snd(struct socket *so)
3454 {
3455 
3456 	return (&so->so_snd);
3457 }
3458 
3459 int
3460 so_state_get(const struct socket *so)
3461 {
3462 
3463 	return (so->so_state);
3464 }
3465 
3466 void
3467 so_state_set(struct socket *so, int val)
3468 {
3469 
3470 	so->so_state = val;
3471 }
3472 
3473 int
3474 so_options_get(const struct socket *so)
3475 {
3476 
3477 	return (so->so_options);
3478 }
3479 
3480 void
3481 so_options_set(struct socket *so, int val)
3482 {
3483 
3484 	so->so_options = val;
3485 }
3486 
3487 int
3488 so_error_get(const struct socket *so)
3489 {
3490 
3491 	return (so->so_error);
3492 }
3493 
3494 void
3495 so_error_set(struct socket *so, int val)
3496 {
3497 
3498 	so->so_error = val;
3499 }
3500 
3501 int
3502 so_linger_get(const struct socket *so)
3503 {
3504 
3505 	return (so->so_linger);
3506 }
3507 
3508 void
3509 so_linger_set(struct socket *so, int val)
3510 {
3511 
3512 	so->so_linger = val;
3513 }
3514 
3515 struct protosw *
3516 so_protosw_get(const struct socket *so)
3517 {
3518 
3519 	return (so->so_proto);
3520 }
3521 
3522 void
3523 so_protosw_set(struct socket *so, struct protosw *val)
3524 {
3525 
3526 	so->so_proto = val;
3527 }
3528 
3529 void
3530 so_sorwakeup(struct socket *so)
3531 {
3532 
3533 	sorwakeup(so);
3534 }
3535 
3536 void
3537 so_sowwakeup(struct socket *so)
3538 {
3539 
3540 	sowwakeup(so);
3541 }
3542 
3543 void
3544 so_sorwakeup_locked(struct socket *so)
3545 {
3546 
3547 	sorwakeup_locked(so);
3548 }
3549 
3550 void
3551 so_sowwakeup_locked(struct socket *so)
3552 {
3553 
3554 	sowwakeup_locked(so);
3555 }
3556 
3557 void
3558 so_lock(struct socket *so)
3559 {
3560 	SOCK_LOCK(so);
3561 }
3562 
3563 void
3564 so_unlock(struct socket *so)
3565 {
3566 	SOCK_UNLOCK(so);
3567 }
3568