xref: /freebsd/sys/kern/uipc_socket.c (revision acd3428b7d3e94cef0e1881c868cb4b131d4ff41)
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 2004 The FreeBSD Foundation
5  * Copyright (c) 2004-2006 Robert N. M. Watson
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 4. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
32  */
33 
34 /*
35  * Comments on the socket life cycle:
36  *
37  * soalloc() sets up socket layer state for a socket, called only by
38  * socreate() and sonewconn().  Socket layer private.
39  *
40  * sodealloc() tears down socket layer state for a socket, called only by
41  * sofree(), socreate(), and sonewconn().  Socket layer private.
42  *
43  * pru_attach() associates protocol layer state with an allocated socket;
44  * called only once, may fail, aborting socket allocation.  This is called
45  * from socreate() and sonewconn().  Socket layer private.
46  *
47  * pru_detach() disassociates protocol layer state from an attached socket,
48  * and will be called exactly once for sockets in which pru_attach() has
49  * been successfully called.  If pru_attach() returned an error,
50  * pru_detach() will not be called.  Socket layer private.
51  *
52  * pru_abort() and pru_close() notify the protocol layer that the last
53  * consumer of a socket is starting to tear down the socket, and that the
54  * protocol should terminate the connection.  Historically, pru_abort() also
55  * detached protocol state from the socket state, but this is no longer the
56  * case.
57  *
58  * socreate() creates a socket and attaches protocol state.  This is a public
59  * interface that may be used by socket layer consumers to create new
60  * sockets.
61  *
62  * sonewconn() creates a socket and attaches protocol state.  This is a
63  * public interface that may be used by protocols to create new sockets when
64  * a new connection is received and will be available for accept() on a
65  * listen socket.
66  *
67  * soclose() destroys a socket after possibly waiting for it to disconnect.
68  * This is a public interface that socket consumers should use to close and
69  * release a socket when done with it.
70  *
71  * soabort() destroys a socket without waiting for it to disconnect (used
72  * only for incoming connections that are already partially or fully
73  * connected).  This is used internally by the socket layer when clearing
74  * listen socket queues (due to overflow or close on the listen socket), but
75  * is also a public interface protocols may use to abort connections in
76  * their incomplete listen queues should they no longer be required.  Sockets
77  * placed in completed connection listen queues should not be aborted for
78  * reasons described in the comment above the soclose() implementation.  This
79  * is not a general purpose close routine, and except in the specific
80  * circumstances described here, should not be used.
81  *
82  * sofree() will free a socket and its protocol state if all references on
83  * the socket have been released, and is the routine used to attempt to
84  * free a socket when a reference is removed.  This is a socket layer
85  * private interface.
86  *
87  * NOTE: In addition to socreate() and soclose(), which provide a single
88  * socket reference to the consumer to be managed as required, there are two
89  * calls to explicitly manage socket references, soref() and sorele().
90  * Currently, these are generally required only when transitioning a socket
91  * from a listen queue to a file descriptor, in order to prevent garbage
92  * collection of the socket at an untimely moment.  For a number of reasons,
93  * these interfaces are not preferred, and should be avoided.
94  */
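
/*
 * Example (illustrative only; not part of the original file): a typical
 * in-kernel consumer exercises the public half of the life cycle described
 * above -- socreate() to allocate and attach, soclose() to drop its single
 * reference.  A minimal sketch, assuming a valid thread pointer 'td':
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, 0, td->td_ucred, td);
 *	if (error)
 *		return (error);
 *	... sobind()/soconnect()/sosend()/soreceive() as required ...
 *	error = soclose(so);
 */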
95 
96 #include <sys/cdefs.h>
97 __FBSDID("$FreeBSD$");
98 
99 #include "opt_inet.h"
100 #include "opt_mac.h"
101 #include "opt_zero.h"
102 #include "opt_compat.h"
103 
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/fcntl.h>
107 #include <sys/limits.h>
108 #include <sys/lock.h>
109 #include <sys/mac.h>
110 #include <sys/malloc.h>
111 #include <sys/mbuf.h>
112 #include <sys/mutex.h>
113 #include <sys/domain.h>
114 #include <sys/file.h>			/* for struct knote */
115 #include <sys/kernel.h>
116 #include <sys/event.h>
117 #include <sys/eventhandler.h>
118 #include <sys/poll.h>
119 #include <sys/proc.h>
120 #include <sys/protosw.h>
121 #include <sys/socket.h>
122 #include <sys/socketvar.h>
123 #include <sys/resourcevar.h>
124 #include <sys/signalvar.h>
125 #include <sys/sysctl.h>
126 #include <sys/uio.h>
127 #include <sys/jail.h>
128 
129 #include <security/mac/mac_framework.h>
130 
131 #include <vm/uma.h>
132 
133 #ifdef COMPAT_IA32
134 #include <sys/mount.h>
135 #include <compat/freebsd32/freebsd32.h>
136 
137 extern struct sysentvec ia32_freebsd_sysvec;
138 #endif
139 
140 static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
141 		    int flags);
142 
143 static void	filt_sordetach(struct knote *kn);
144 static int	filt_soread(struct knote *kn, long hint);
145 static void	filt_sowdetach(struct knote *kn);
146 static int	filt_sowrite(struct knote *kn, long hint);
147 static int	filt_solisten(struct knote *kn, long hint);
148 
149 static struct filterops solisten_filtops =
150 	{ 1, NULL, filt_sordetach, filt_solisten };
151 static struct filterops soread_filtops =
152 	{ 1, NULL, filt_sordetach, filt_soread };
153 static struct filterops sowrite_filtops =
154 	{ 1, NULL, filt_sowdetach, filt_sowrite };
155 
156 uma_zone_t socket_zone;
157 so_gen_t	so_gencnt;	/* generation count for sockets */
158 
159 int	maxsockets;
160 
161 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
162 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
163 
164 static int somaxconn = SOMAXCONN;
165 static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS);
166 /* XXX: we don't have SYSCTL_USHORT */
167 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
168     0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection "
169     "queue size");
170 static int numopensockets;
171 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
172     &numopensockets, 0, "Number of open sockets");
173 #ifdef ZERO_COPY_SOCKETS
174 /* These aren't static because they're used in other files. */
175 int so_zero_copy_send = 1;
176 int so_zero_copy_receive = 1;
177 SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
178     "Zero copy controls");
179 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
180     &so_zero_copy_receive, 0, "Enable zero copy receive");
181 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
182     &so_zero_copy_send, 0, "Enable zero copy send");
183 #endif /* ZERO_COPY_SOCKETS */
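
/*
 * Illustrative note (not in the original): when ZERO_COPY_SOCKETS is
 * compiled in, the controls above surface as kern.ipc.zero_copy.send and
 * kern.ipc.zero_copy.receive, e.g. "sysctl kern.ipc.zero_copy.send=0"
 * disables zero-copy transmit at run time.
 */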
184 
185 /*
186  * accept_mtx locks down per-socket fields relating to accept queues.  See
187  * socketvar.h for an annotation of the protected fields of struct socket.
188  */
189 struct mtx accept_mtx;
190 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
191 
192 /*
193  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
194  * so_gencnt field.
195  */
196 static struct mtx so_global_mtx;
197 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
198 
199 /*
200  * General IPC sysctl name space, used by sockets and a variety of other IPC
201  * types.
202  */
203 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
204 
205 /*
206  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
207  * of the change so that they can update their dependent limits as required.
208  */
209 static int
210 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
211 {
212 	int error, newmaxsockets;
213 
214 	newmaxsockets = maxsockets;
215 	error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req);
216 	if (error == 0 && req->newptr) {
217 		if (newmaxsockets > maxsockets) {
218 			maxsockets = newmaxsockets;
219 			if (maxsockets > ((maxfiles / 4) * 3)) {
220 				maxfiles = (maxsockets * 5) / 4;
221 				maxfilesperproc = (maxfiles * 9) / 10;
222 			}
223 			EVENTHANDLER_INVOKE(maxsockets_change);
224 		} else
225 			error = EINVAL;
226 	}
227 	return (error);
228 }
229 
230 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
231     &maxsockets, 0, sysctl_maxsockets, "IU",
232     "Maximum number of sockets available");
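
/*
 * Protocols that derive zone or table limits from maxsockets can subscribe
 * to the maxsockets_change event invoked by the handler above.  A minimal
 * sketch (registration would normally happen in the protocol's init
 * routine; the handler name and zone are hypothetical, not from this file):
 *
 *	static void
 *	example_zone_change(void *tag)
 *	{
 *
 *		uma_zone_set_max(example_zone, maxsockets);
 *	}
 *
 *	EVENTHANDLER_REGISTER(maxsockets_change, example_zone_change, NULL,
 *	    EVENTHANDLER_PRI_ANY);
 */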
233 
234 /*
235  * Initialise maxsockets.
236  */
237 static void init_maxsockets(void *ignored)
238 {
239 	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
240 	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
241 }
242 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
243 
244 /*
245  * Socket operation routines.  These routines are called by the routines in
246  * sys_socket.c or from a system process, and implement the semantics of
247  * socket operations by switching out to the protocol specific routines.
248  */
249 
250 /*
251  * Get a socket structure from our zone, and initialize it.  Note that it
252  * would probably be better to allocate socket and PCB at the same time, but
253  * I'm not convinced that all the protocols can be easily modified to do
254  * this.
255  *
256  * soalloc() returns a socket with a ref count of 0.
257  */
258 static struct socket *
259 soalloc(int mflags)
260 {
261 	struct socket *so;
262 
263 	so = uma_zalloc(socket_zone, mflags | M_ZERO);
264 	if (so == NULL)
265 		return (NULL);
266 #ifdef MAC
267 	if (mac_init_socket(so, mflags) != 0) {
268 		uma_zfree(socket_zone, so);
269 		return (NULL);
270 	}
271 #endif
272 	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
273 	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
274 	TAILQ_INIT(&so->so_aiojobq);
275 	mtx_lock(&so_global_mtx);
276 	so->so_gencnt = ++so_gencnt;
277 	++numopensockets;
278 	mtx_unlock(&so_global_mtx);
279 	return (so);
280 }
281 
282 /*
283  * Free the storage associated with a socket at the socket layer, tear down
284  * locks, labels, etc.  All protocol state is assumed already to have been
285  * torn down (and possibly never set up) by the caller.
286  */
287 static void
288 sodealloc(struct socket *so)
289 {
290 
291 	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
292 	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
293 
294 	mtx_lock(&so_global_mtx);
295 	so->so_gencnt = ++so_gencnt;
296 	--numopensockets;	/* Could be below, but faster here. */
297 	mtx_unlock(&so_global_mtx);
298 	if (so->so_rcv.sb_hiwat)
299 		(void)chgsbsize(so->so_cred->cr_uidinfo,
300 		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
301 	if (so->so_snd.sb_hiwat)
302 		(void)chgsbsize(so->so_cred->cr_uidinfo,
303 		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
304 #ifdef INET
305 	/* remove accept filter if one is present. */
306 	if (so->so_accf != NULL)
307 		do_setopt_accept_filter(so, NULL);
308 #endif
309 #ifdef MAC
310 	mac_destroy_socket(so);
311 #endif
312 	crfree(so->so_cred);
313 	SOCKBUF_LOCK_DESTROY(&so->so_snd);
314 	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
315 	uma_zfree(socket_zone, so);
316 }
317 
318 /*
319  * socreate returns a socket with a ref count of 1.  The socket should be
320  * closed with soclose().
321  */
322 int
323 socreate(dom, aso, type, proto, cred, td)
324 	int dom;
325 	struct socket **aso;
326 	int type;
327 	int proto;
328 	struct ucred *cred;
329 	struct thread *td;
330 {
331 	struct protosw *prp;
332 	struct socket *so;
333 	int error;
334 
335 	if (proto)
336 		prp = pffindproto(dom, proto, type);
337 	else
338 		prp = pffindtype(dom, type);
339 
340 	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
341 	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
342 		return (EPROTONOSUPPORT);
343 
344 	if (jailed(cred) && jail_socket_unixiproute_only &&
345 	    prp->pr_domain->dom_family != PF_LOCAL &&
346 	    prp->pr_domain->dom_family != PF_INET &&
347 	    prp->pr_domain->dom_family != PF_ROUTE) {
348 		return (EPROTONOSUPPORT);
349 	}
350 
351 	if (prp->pr_type != type)
352 		return (EPROTOTYPE);
353 	so = soalloc(M_WAITOK);
354 	if (so == NULL)
355 		return (ENOBUFS);
356 
357 	TAILQ_INIT(&so->so_incomp);
358 	TAILQ_INIT(&so->so_comp);
359 	so->so_type = type;
360 	so->so_cred = crhold(cred);
361 	so->so_proto = prp;
362 #ifdef MAC
363 	mac_create_socket(cred, so);
364 #endif
365 	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
366 	    NULL, NULL, NULL);
367 	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
368 	    NULL, NULL, NULL);
369 	so->so_count = 1;
370 	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
371 	if (error) {
372 		KASSERT(so->so_count == 1, ("socreate: so_count %d",
373 		    so->so_count));
374 		so->so_count = 0;
375 		sodealloc(so);
376 		return (error);
377 	}
378 	*aso = so;
379 	return (0);
380 }
381 
382 #ifdef REGRESSION
383 static int regression_sonewconn_earlytest = 1;
384 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
385     &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
386 #endif
387 
388 /*
389  * When an attempt at a new connection is noted on a socket which accepts
390  * connections, sonewconn is called.  If the connection is possible (subject
391  * to space constraints, etc.) then we allocate a new structure, properly
392  * linked into the data structure of the original socket, and return this.
393  * The connstatus argument may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
394  *
395  * Note: the ref count on the socket is 0 on return.
396  */
397 struct socket *
398 sonewconn(head, connstatus)
399 	register struct socket *head;
400 	int connstatus;
401 {
402 	register struct socket *so;
403 	int over;
404 
405 	ACCEPT_LOCK();
406 	over = (head->so_qlen > 3 * head->so_qlimit / 2);
407 	ACCEPT_UNLOCK();
408 #ifdef REGRESSION
409 	if (regression_sonewconn_earlytest && over)
410 #else
411 	if (over)
412 #endif
413 		return (NULL);
414 	so = soalloc(M_NOWAIT);
415 	if (so == NULL)
416 		return (NULL);
417 	if ((head->so_options & SO_ACCEPTFILTER) != 0)
418 		connstatus = 0;
419 	so->so_head = head;
420 	so->so_type = head->so_type;
421 	so->so_options = head->so_options &~ SO_ACCEPTCONN;
422 	so->so_linger = head->so_linger;
423 	so->so_state = head->so_state | SS_NOFDREF;
424 	so->so_proto = head->so_proto;
425 	so->so_cred = crhold(head->so_cred);
426 #ifdef MAC
427 	SOCK_LOCK(head);
428 	mac_create_socket_from_socket(head, so);
429 	SOCK_UNLOCK(head);
430 #endif
431 	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
432 	    NULL, NULL, NULL);
433 	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
434 	    NULL, NULL, NULL);
435 	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
436 	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
437 		sodealloc(so);
438 		return (NULL);
439 	}
440 	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
441 	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
442 	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
443 	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
444 	so->so_state |= connstatus;
445 	ACCEPT_LOCK();
446 	if (connstatus) {
447 		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
448 		so->so_qstate |= SQ_COMP;
449 		head->so_qlen++;
450 	} else {
451 		/*
452 		 * Keep removing sockets from the head until there's room for
453 		 * us to insert on the tail.  In pre-locking revisions, this
454 		 * was a simple if(), but as we could be racing with other
455 		 * threads and soabort() requires dropping locks, we must
456 		 * loop waiting for the condition to be true.
457 		 */
458 		while (head->so_incqlen > head->so_qlimit) {
459 			struct socket *sp;
460 			sp = TAILQ_FIRST(&head->so_incomp);
461 			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
462 			head->so_incqlen--;
463 			sp->so_qstate &= ~SQ_INCOMP;
464 			sp->so_head = NULL;
465 			ACCEPT_UNLOCK();
466 			soabort(sp);
467 			ACCEPT_LOCK();
468 		}
469 		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
470 		so->so_qstate |= SQ_INCOMP;
471 		head->so_incqlen++;
472 	}
473 	ACCEPT_UNLOCK();
474 	if (connstatus) {
475 		sorwakeup(head);
476 		wakeup_one(&head->so_timeo);
477 	}
478 	return (so);
479 }
480 
481 int
482 sobind(so, nam, td)
483 	struct socket *so;
484 	struct sockaddr *nam;
485 	struct thread *td;
486 {
487 
488 	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
489 }
490 
491 /*
492  * solisten() transitions a socket from a non-listening state to a listening
493  * state, but can also be used to update the listen queue depth on an
494  * existing listen socket.  The protocol will call back into the sockets
495  * layer using solisten_proto_check() and solisten_proto() to check and set
496  * socket-layer listen state.  Call backs are used so that the protocol can
497  * acquire both protocol and socket layer locks in whatever order is required
498  * by the protocol.
499  *
500  * Protocol implementors are advised to hold the socket lock across the
501  * socket-layer test and set to avoid races at the socket layer.
502  */
503 int
504 solisten(so, backlog, td)
505 	struct socket *so;
506 	int backlog;
507 	struct thread *td;
508 {
509 
510 	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
511 }
512 
513 int
514 solisten_proto_check(so)
515 	struct socket *so;
516 {
517 
518 	SOCK_LOCK_ASSERT(so);
519 
520 	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
521 	    SS_ISDISCONNECTING))
522 		return (EINVAL);
523 	return (0);
524 }
525 
526 void
527 solisten_proto(so, backlog)
528 	struct socket *so;
529 	int backlog;
530 {
531 
532 	SOCK_LOCK_ASSERT(so);
533 
534 	if (backlog < 0 || backlog > somaxconn)
535 		backlog = somaxconn;
536 	so->so_qlimit = backlog;
537 	so->so_options |= SO_ACCEPTCONN;
538 }
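
/*
 * A protocol's pru_listen method typically brackets these two callbacks
 * with its own locking, holding the socket lock across the check and the
 * set as advised above.  A minimal sketch (protocol-specific locking and
 * preparation omitted; not taken verbatim from any protocol):
 *
 *	SOCK_LOCK(so);
 *	error = solisten_proto_check(so);
 *	if (error == 0) {
 *		... protocol-specific setup, e.g. binding an ephemeral port ...
 *		solisten_proto(so, backlog);
 *	}
 *	SOCK_UNLOCK(so);
 *	return (error);
 */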
539 
540 /*
541  * Attempt to free a socket.  This should really be sotryfree().
542  *
543  * sofree() will succeed if:
544  *
545  * - There are no outstanding file descriptor references or related consumers
546  *   (so_count == 0).
547  *
548  * - The socket has been closed by user space, if ever open (SS_NOFDREF).
549  *
550  * - The protocol does not have an outstanding strong reference on the socket
551  *   (SS_PROTOREF).
552  *
553  * - The socket is not in a completed connection queue, where a process may
554  *   already have been told it is present.  Removing it there could leave
555  *   that process blocked in accept() despite select() saying it was ready.
556  *
557  * Otherwise, it will quietly abort so that a future call to sofree(), when
558  * conditions are right, can succeed.
559  */
560 void
561 sofree(so)
562 	struct socket *so;
563 {
564 	struct protosw *pr = so->so_proto;
565 	struct socket *head;
566 
567 	ACCEPT_LOCK_ASSERT();
568 	SOCK_LOCK_ASSERT(so);
569 
570 	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
571 	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
572 		SOCK_UNLOCK(so);
573 		ACCEPT_UNLOCK();
574 		return;
575 	}
576 
577 	head = so->so_head;
578 	if (head != NULL) {
579 		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
580 		    (so->so_qstate & SQ_INCOMP) != 0,
581 		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
582 		    "SQ_INCOMP"));
583 		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
584 		    (so->so_qstate & SQ_INCOMP) == 0,
585 		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
586 		TAILQ_REMOVE(&head->so_incomp, so, so_list);
587 		head->so_incqlen--;
588 		so->so_qstate &= ~SQ_INCOMP;
589 		so->so_head = NULL;
590 	}
591 	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
592 	    (so->so_qstate & SQ_INCOMP) == 0,
593 	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
594 	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
595 	SOCK_UNLOCK(so);
596 	ACCEPT_UNLOCK();
597 
598 	/*
599 	 * From this point on, we assume that no other references to this
600 	 * socket exist anywhere else in the stack.  Therefore, no locks need
601 	 * to be acquired or held.
602 	 *
603 	 * We used to do a lot of socket buffer and socket locking here, as
604 	 * well as invoke sorflush() and perform wakeups.  The direct call to
605 	 * dom_dispose() and sbrelease_internal() are an inlining of what was
606 	 * necessary from sorflush().
607 	 *
608 	 * Notice that the socket buffer and kqueue state are torn down
609 	 * before calling pru_detach.  This means that protocols should not
610 	 * assume they can perform socket wakeups, etc., in their detach
611 	 * code.
612 	 */
613 	KASSERT((so->so_snd.sb_flags & SB_LOCK) == 0, ("sofree: snd sblock"));
614 	KASSERT((so->so_rcv.sb_flags & SB_LOCK) == 0, ("sofree: rcv sblock"));
615 	sbdestroy(&so->so_snd, so);
616 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
617 		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
618 	sbdestroy(&so->so_rcv, so);
619 	if (pr->pr_usrreqs->pru_detach != NULL)
620 		(*pr->pr_usrreqs->pru_detach)(so);
621 	knlist_destroy(&so->so_rcv.sb_sel.si_note);
622 	knlist_destroy(&so->so_snd.sb_sel.si_note);
623 	sodealloc(so);
624 }
625 
626 /*
627  * Close a socket on last file table reference removal.  Initiate disconnect
628  * if connected.  Free socket when disconnect complete.
629  *
630  * This function will sorele() the socket.  Note that soclose() may be called
631  * prior to the ref count reaching zero.  The actual socket structure will
632  * not be freed until the ref count reaches zero.
633  */
634 int
635 soclose(so)
636 	struct socket *so;
637 {
638 	int error = 0;
639 
640 	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
641 
642 	funsetown(&so->so_sigio);
643 	if (so->so_options & SO_ACCEPTCONN) {
644 		struct socket *sp;
645 		ACCEPT_LOCK();
646 		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
647 			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
648 			so->so_incqlen--;
649 			sp->so_qstate &= ~SQ_INCOMP;
650 			sp->so_head = NULL;
651 			ACCEPT_UNLOCK();
652 			soabort(sp);
653 			ACCEPT_LOCK();
654 		}
655 		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
656 			TAILQ_REMOVE(&so->so_comp, sp, so_list);
657 			so->so_qlen--;
658 			sp->so_qstate &= ~SQ_COMP;
659 			sp->so_head = NULL;
660 			ACCEPT_UNLOCK();
661 			soabort(sp);
662 			ACCEPT_LOCK();
663 		}
664 		ACCEPT_UNLOCK();
665 	}
666 	if (so->so_state & SS_ISCONNECTED) {
667 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
668 			error = sodisconnect(so);
669 			if (error)
670 				goto drop;
671 		}
672 		if (so->so_options & SO_LINGER) {
673 			if ((so->so_state & SS_ISDISCONNECTING) &&
674 			    (so->so_state & SS_NBIO))
675 				goto drop;
676 			while (so->so_state & SS_ISCONNECTED) {
677 				error = tsleep(&so->so_timeo,
678 				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
679 				if (error)
680 					break;
681 			}
682 		}
683 	}
684 
685 drop:
686 	if (so->so_proto->pr_usrreqs->pru_close != NULL)
687 		(*so->so_proto->pr_usrreqs->pru_close)(so);
688 	ACCEPT_LOCK();
689 	SOCK_LOCK(so);
690 	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
691 	so->so_state |= SS_NOFDREF;
692 	sorele(so);
693 	return (error);
694 }
695 
696 /*
697  * soabort() is used to abruptly tear down a connection, such as when a
698  * resource limit is reached (listen queue depth exceeded), or if a listen
699  * socket is closed while there are sockets waiting to be accepted.
700  *
701  * This interface is tricky, because it is called on an unreferenced socket,
702  * and must be called only by a thread that has actually removed the socket
703  * from the listen queue it was on, or races with other threads are risked.
704  *
705  * This interface will call into the protocol code, so must not be called
706  * with any socket locks held.  Protocols do call it while holding their own
707  * recursible protocol mutexes, but this is something that should be subject
708  * to review in the future.
709  */
710 void
711 soabort(so)
712 	struct socket *so;
713 {
714 
715 	/*
716 	 * In as much as is possible, assert that no references to this
717 	 * socket are held.  This is not quite the same as asserting that the
718 	 * current thread is responsible for arranging for no references, but
719 	 * is as close as we can get for now.
720 	 */
721 	KASSERT(so->so_count == 0, ("soabort: so_count"));
722 	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
723 	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
724 	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
725 	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
726 
727 	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
728 		(*so->so_proto->pr_usrreqs->pru_abort)(so);
729 	ACCEPT_LOCK();
730 	SOCK_LOCK(so);
731 	sofree(so);
732 }
733 
734 int
735 soaccept(so, nam)
736 	struct socket *so;
737 	struct sockaddr **nam;
738 {
739 	int error;
740 
741 	SOCK_LOCK(so);
742 	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
743 	so->so_state &= ~SS_NOFDREF;
744 	SOCK_UNLOCK(so);
745 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
746 	return (error);
747 }
748 
749 int
750 soconnect(so, nam, td)
751 	struct socket *so;
752 	struct sockaddr *nam;
753 	struct thread *td;
754 {
755 	int error;
756 
757 	if (so->so_options & SO_ACCEPTCONN)
758 		return (EOPNOTSUPP);
759 	/*
760 	 * If protocol is connection-based, can only connect once.
761 	 * Otherwise, if connected, try to disconnect first.  This allows
762 	 * user to disconnect by connecting to, e.g., a null address.
763 	 */
764 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
765 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
766 	    (error = sodisconnect(so)))) {
767 		error = EISCONN;
768 	} else {
769 		/*
770 		 * Prevent accumulated error from previous connection from
771 		 * biting us.
772 		 */
773 		so->so_error = 0;
774 		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
775 	}
776 
777 	return (error);
778 }
779 
780 int
781 soconnect2(so1, so2)
782 	struct socket *so1;
783 	struct socket *so2;
784 {
785 
786 	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
787 }
788 
789 int
790 sodisconnect(so)
791 	struct socket *so;
792 {
793 	int error;
794 
795 	if ((so->so_state & SS_ISCONNECTED) == 0)
796 		return (ENOTCONN);
797 	if (so->so_state & SS_ISDISCONNECTING)
798 		return (EALREADY);
799 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
800 	return (error);
801 }
802 
803 #ifdef ZERO_COPY_SOCKETS
804 struct so_zerocopy_stats{
805 	int size_ok;
806 	int align_ok;
807 	int found_ifp;
808 };
809 struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
810 #include <netinet/in.h>
811 #include <net/route.h>
812 #include <netinet/in_pcb.h>
813 #include <vm/vm.h>
814 #include <vm/vm_page.h>
815 #include <vm/vm_object.h>
816 
817 /*
818  * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
819  * sosend_dgram() and sosend_generic() use m_uiotombuf().
820  *
821  * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
822  * all of the data referenced by the uio.  If desired, it uses zero-copy.
823  * *space will be updated to reflect data copied in.
824  *
825  * NB: If atomic I/O is requested, the caller must already have checked that
826  * space can hold resid bytes.
827  *
828  * NB: In the event of an error, the caller may need to free the partial
829  * chain pointed to by *mpp.  The contents of both *uio and *space may be
830  * modified even in the case of an error.
831  */
832 static int
833 sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
834     int flags)
835 {
836 	struct mbuf *m, **mp, *top;
837 	long len, resid;
838 	int error;
839 #ifdef ZERO_COPY_SOCKETS
840 	int cow_send;
841 #endif
842 
843 	*retmp = top = NULL;
844 	mp = &top;
845 	len = 0;
846 	resid = uio->uio_resid;
847 	error = 0;
848 	do {
849 #ifdef ZERO_COPY_SOCKETS
850 		cow_send = 0;
851 #endif /* ZERO_COPY_SOCKETS */
852 		if (resid >= MINCLSIZE) {
853 #ifdef ZERO_COPY_SOCKETS
854 			if (top == NULL) {
855 				MGETHDR(m, M_TRYWAIT, MT_DATA);
856 				if (m == NULL) {
857 					error = ENOBUFS;
858 					goto out;
859 				}
860 				m->m_pkthdr.len = 0;
861 				m->m_pkthdr.rcvif = NULL;
862 			} else {
863 				MGET(m, M_TRYWAIT, MT_DATA);
864 				if (m == NULL) {
865 					error = ENOBUFS;
866 					goto out;
867 				}
868 			}
869 			if (so_zero_copy_send &&
870 			    resid>=PAGE_SIZE &&
871 			    *space>=PAGE_SIZE &&
872 			    uio->uio_iov->iov_len>=PAGE_SIZE) {
873 				so_zerocp_stats.size_ok++;
874 				so_zerocp_stats.align_ok++;
875 				cow_send = socow_setup(m, uio);
876 				len = cow_send;
877 			}
878 			if (!cow_send) {
879 				MCLGET(m, M_TRYWAIT);
880 				if ((m->m_flags & M_EXT) == 0) {
881 					m_free(m);
882 					m = NULL;
883 				} else {
884 					len = min(min(MCLBYTES, resid),
885 					    *space);
886 				}
887 			}
888 #else /* ZERO_COPY_SOCKETS */
889 			if (top == NULL) {
890 				m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
891 				m->m_pkthdr.len = 0;
892 				m->m_pkthdr.rcvif = NULL;
893 			} else
894 				m = m_getcl(M_TRYWAIT, MT_DATA, 0);
895 			len = min(min(MCLBYTES, resid), *space);
896 #endif /* ZERO_COPY_SOCKETS */
897 		} else {
898 			if (top == NULL) {
899 				m = m_gethdr(M_TRYWAIT, MT_DATA);
900 				m->m_pkthdr.len = 0;
901 				m->m_pkthdr.rcvif = NULL;
902 
903 				len = min(min(MHLEN, resid), *space);
904 				/*
905 				 * For datagram protocols, leave room
906 				 * for protocol headers in first mbuf.
907 				 */
908 				if (atomic && m && len < MHLEN)
909 					MH_ALIGN(m, len);
910 			} else {
911 				m = m_get(M_TRYWAIT, MT_DATA);
912 				len = min(min(MLEN, resid), *space);
913 			}
914 		}
915 		if (m == NULL) {
916 			error = ENOBUFS;
917 			goto out;
918 		}
919 
920 		*space -= len;
921 #ifdef ZERO_COPY_SOCKETS
922 		if (cow_send)
923 			error = 0;
924 		else
925 #endif /* ZERO_COPY_SOCKETS */
926 		error = uiomove(mtod(m, void *), (int)len, uio);
927 		resid = uio->uio_resid;
928 		m->m_len = len;
929 		*mp = m;
930 		top->m_pkthdr.len += len;
931 		if (error)
932 			goto out;
933 		mp = &m->m_next;
934 		if (resid <= 0) {
935 			if (flags & MSG_EOR)
936 				top->m_flags |= M_EOR;
937 			break;
938 		}
939 	} while (*space > 0 && atomic);
940 out:
941 	*retmp = top;
942 	return (error);
943 }
944 #endif /*ZERO_COPY_SOCKETS*/
945 
946 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
947 
948 int
949 sosend_dgram(so, addr, uio, top, control, flags, td)
950 	struct socket *so;
951 	struct sockaddr *addr;
952 	struct uio *uio;
953 	struct mbuf *top;
954 	struct mbuf *control;
955 	int flags;
956 	struct thread *td;
957 {
958 	long space, resid;
959 	int clen = 0, error, dontroute;
960 #ifdef ZERO_COPY_SOCKETS
961 	int atomic = sosendallatonce(so) || top;
962 #endif
963 
964 	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
965 	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
966 	    ("sosend_dgram: !PR_ATOMIC"));
967 
968 	if (uio != NULL)
969 		resid = uio->uio_resid;
970 	else
971 		resid = top->m_pkthdr.len;
972 	/*
973 	 * In theory resid should be unsigned.  However, space must be
974 	 * signed, as it might be less than 0 if we over-committed, and we
975 	 * must use a signed comparison of space and resid.  On the other
976 	 * hand, a negative resid causes us to loop sending 0-length
977 	 * segments to the protocol.
978 	 *
979 	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
980 	 * type sockets since that's an error.
981 	 */
982 	if (resid < 0) {
983 		error = EINVAL;
984 		goto out;
985 	}
986 
987 	dontroute =
988 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
989 	if (td != NULL)
990 		td->td_proc->p_stats->p_ru.ru_msgsnd++;
991 	if (control != NULL)
992 		clen = control->m_len;
993 
994 	SOCKBUF_LOCK(&so->so_snd);
995 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
996 		SOCKBUF_UNLOCK(&so->so_snd);
997 		error = EPIPE;
998 		goto out;
999 	}
1000 	if (so->so_error) {
1001 		error = so->so_error;
1002 		so->so_error = 0;
1003 		SOCKBUF_UNLOCK(&so->so_snd);
1004 		goto out;
1005 	}
1006 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1007 		/*
1008 		 * `sendto' and `sendmsg' are allowed on a connection-based
1009 		 * socket if it supports implied connect.  Return ENOTCONN if
1010 		 * not connected and no address is supplied.
1011 		 */
1012 		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1013 		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1014 			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1015 			    !(resid == 0 && clen != 0)) {
1016 				SOCKBUF_UNLOCK(&so->so_snd);
1017 				error = ENOTCONN;
1018 				goto out;
1019 			}
1020 		} else if (addr == NULL) {
1021 			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1022 				error = ENOTCONN;
1023 			else
1024 				error = EDESTADDRREQ;
1025 			SOCKBUF_UNLOCK(&so->so_snd);
1026 			goto out;
1027 		}
1028 	}
1029 
1030 	/*
1031 	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signedness here may be
1032 	 * a problem and need fixing.
1033 	 */
1034 	space = sbspace(&so->so_snd);
1035 	if (flags & MSG_OOB)
1036 		space += 1024;
1037 	space -= clen;
1038 	SOCKBUF_UNLOCK(&so->so_snd);
1039 	if (resid > space) {
1040 		error = EMSGSIZE;
1041 		goto out;
1042 	}
1043 	if (uio == NULL) {
1044 		resid = 0;
1045 		if (flags & MSG_EOR)
1046 			top->m_flags |= M_EOR;
1047 	} else {
1048 #ifdef ZERO_COPY_SOCKETS
1049 		error = sosend_copyin(uio, &top, atomic, &space, flags);
1050 		if (error)
1051 			goto out;
1052 #else
1053 		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1054 		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1055 		if (top == NULL) {
1056 			error = EFAULT;	/* only possible error */
1057 			goto out;
1058 		}
1059 		space -= resid - uio->uio_resid;
1060 #endif
1061 		resid = uio->uio_resid;
1062 	}
1063 	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1064 	/*
1065 	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1066 	 * than with.
1067 	 */
1068 	if (dontroute) {
1069 		SOCK_LOCK(so);
1070 		so->so_options |= SO_DONTROUTE;
1071 		SOCK_UNLOCK(so);
1072 	}
1073 	/*
1074 	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1075 	 * of date.  We could have received a reset packet in an interrupt or
1076 	 * maybe we slept while doing page faults in uiomove() etc.  We could
1077 	 * probably recheck again inside the locking protection here, but
1078 	 * there are probably other places that this also happens.  We must
1079 	 * rethink this.
1080 	 */
1081 	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1082 	    (flags & MSG_OOB) ? PRUS_OOB :
1083 	/*
1084 	 * If the user set MSG_EOF, the protocol understands this flag and
1085 	 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
1086 	 */
1087 	    ((flags & MSG_EOF) &&
1088 	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1089 	     (resid <= 0)) ?
1090 		PRUS_EOF :
1091 		/* If there is more to send set PRUS_MORETOCOME */
1092 		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1093 		top, addr, control, td);
1094 	if (dontroute) {
1095 		SOCK_LOCK(so);
1096 		so->so_options &= ~SO_DONTROUTE;
1097 		SOCK_UNLOCK(so);
1098 	}
1099 	clen = 0;
1100 	control = NULL;
1101 	top = NULL;
1102 out:
1103 	if (top != NULL)
1104 		m_freem(top);
1105 	if (control != NULL)
1106 		m_freem(control);
1107 	return (error);
1108 }
1109 
1110 /*
1111  * Send on a socket.  If send must go all at once and message is larger than
1112  * send buffering, then hard error.  Lock against other senders.  If must go
1113  * all at once and not enough room now, then inform user that this would
1114  * block and do nothing.  Otherwise, if nonblocking, send as much as
1115  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1116  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1117  * in mbuf chain must be small enough to send all at once.
1118  *
1119  * Returns nonzero on error, timeout or signal; callers must check for short
1120  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1121  * on return.
1122  */
1123 #define	snderr(errno)	{ error = (errno); goto release; }
1124 int
1125 sosend_generic(so, addr, uio, top, control, flags, td)
1126 	struct socket *so;
1127 	struct sockaddr *addr;
1128 	struct uio *uio;
1129 	struct mbuf *top;
1130 	struct mbuf *control;
1131 	int flags;
1132 	struct thread *td;
1133 {
1134 	long space, resid;
1135 	int clen = 0, error, dontroute;
1136 	int atomic = sosendallatonce(so) || top;
1137 
1138 	if (uio != NULL)
1139 		resid = uio->uio_resid;
1140 	else
1141 		resid = top->m_pkthdr.len;
1142 	/*
1143 	 * In theory resid should be unsigned.  However, space must be
1144 	 * signed, as it might be less than 0 if we over-committed, and we
1145 	 * must use a signed comparison of space and resid.  On the other
1146 	 * hand, a negative resid causes us to loop sending 0-length
1147 	 * segments to the protocol.
1148 	 *
1149 	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1150 	 * type sockets since that's an error.
1151 	 */
1152 	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1153 		error = EINVAL;
1154 		goto out;
1155 	}
1156 
1157 	dontroute =
1158 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1159 	    (so->so_proto->pr_flags & PR_ATOMIC);
1160 	if (td != NULL)
1161 		td->td_proc->p_stats->p_ru.ru_msgsnd++;
1162 	if (control != NULL)
1163 		clen = control->m_len;
1164 
1165 	SOCKBUF_LOCK(&so->so_snd);
1166 restart:
1167 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1168 	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1169 	if (error)
1170 		goto out_locked;
1171 	do {
1172 		SOCKBUF_LOCK_ASSERT(&so->so_snd);
1173 		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
1174 			snderr(EPIPE);
1175 		if (so->so_error) {
1176 			error = so->so_error;
1177 			so->so_error = 0;
1178 			goto release;
1179 		}
1180 		if ((so->so_state & SS_ISCONNECTED) == 0) {
1181 			/*
1182 			 * `sendto' and `sendmsg' are allowed on a connection-
1183 			 * based socket if it supports implied connect.
1184 			 * Return ENOTCONN if not connected and no address is
1185 			 * supplied.
1186 			 */
1187 			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1188 			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1189 				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1190 				    !(resid == 0 && clen != 0))
1191 					snderr(ENOTCONN);
1192 			} else if (addr == NULL)
1193 			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
1194 				   ENOTCONN : EDESTADDRREQ);
1195 		}
1196 		space = sbspace(&so->so_snd);
1197 		if (flags & MSG_OOB)
1198 			space += 1024;
1199 		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1200 		    clen > so->so_snd.sb_hiwat)
1201 			snderr(EMSGSIZE);
1202 		if (space < resid + clen &&
1203 		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1204 			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
1205 				snderr(EWOULDBLOCK);
1206 			sbunlock(&so->so_snd);
1207 			error = sbwait(&so->so_snd);
1208 			if (error)
1209 				goto out_locked;
1210 			goto restart;
1211 		}
1212 		SOCKBUF_UNLOCK(&so->so_snd);
1213 		space -= clen;
1214 		do {
1215 			if (uio == NULL) {
1216 				resid = 0;
1217 				if (flags & MSG_EOR)
1218 					top->m_flags |= M_EOR;
1219 			} else {
1220 #ifdef ZERO_COPY_SOCKETS
1221 				error = sosend_copyin(uio, &top, atomic,
1222 				    &space, flags);
1223 				if (error != 0) {
1224 					SOCKBUF_LOCK(&so->so_snd);
1225 					goto release;
1226 				}
1227 #else
1228 				top = m_uiotombuf(uio, M_WAITOK, space,
1229 				    (atomic ? max_hdr : 0),
1230 				    (atomic ? M_PKTHDR : 0) |
1231 				    ((flags & MSG_EOR) ? M_EOR : 0));
1232 				if (top == NULL) {
1233 					SOCKBUF_LOCK(&so->so_snd);
1234 					error = EFAULT; /* only possible error */
1235 					goto release;
1236 				}
1237 				space -= resid - uio->uio_resid;
1238 #endif
1239 				resid = uio->uio_resid;
1240 			}
1241 			if (dontroute) {
1242 				SOCK_LOCK(so);
1243 				so->so_options |= SO_DONTROUTE;
1244 				SOCK_UNLOCK(so);
1245 			}
1246 			/*
1247 			 * XXX all the SBS_CANTSENDMORE checks previously
1248 			 * done could be out of date.  We could have received
1249 			 * a reset packet in an interrupt or maybe we slept
1250 			 * while doing page faults in uiomove() etc.  We
1251 			 * could probably recheck again inside the locking
1252 			 * protection here, but there are probably other
1253 			 * places that this also happens.  We must rethink
1254 			 * this.
1255 			 */
1256 			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1257 			    (flags & MSG_OOB) ? PRUS_OOB :
1258 			/*
1259 			 * If the user set MSG_EOF, the protocol understands
1260 			 * this flag and nothing left to send then use
1261 			 * PRU_SEND_EOF instead of PRU_SEND.
1262 			 */
1263 			    ((flags & MSG_EOF) &&
1264 			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1265 			     (resid <= 0)) ?
1266 				PRUS_EOF :
1267 			/* If there is more to send set PRUS_MORETOCOME. */
1268 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1269 			    top, addr, control, td);
1270 			if (dontroute) {
1271 				SOCK_LOCK(so);
1272 				so->so_options &= ~SO_DONTROUTE;
1273 				SOCK_UNLOCK(so);
1274 			}
1275 			clen = 0;
1276 			control = NULL;
1277 			top = NULL;
1278 			if (error) {
1279 				SOCKBUF_LOCK(&so->so_snd);
1280 				goto release;
1281 			}
1282 		} while (resid && space > 0);
1283 		SOCKBUF_LOCK(&so->so_snd);
1284 	} while (resid);
1285 
1286 release:
1287 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1288 	sbunlock(&so->so_snd);
1289 out_locked:
1290 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1291 	SOCKBUF_UNLOCK(&so->so_snd);
1292 out:
1293 	if (top != NULL)
1294 		m_freem(top);
1295 	if (control != NULL)
1296 		m_freem(control);
1297 	return (error);
1298 }
1299 #undef snderr
1300 
1301 int
1302 sosend(so, addr, uio, top, control, flags, td)
1303 	struct socket *so;
1304 	struct sockaddr *addr;
1305 	struct uio *uio;
1306 	struct mbuf *top;
1307 	struct mbuf *control;
1308 	int flags;
1309 	struct thread *td;
1310 {
1311 
1312 	/* XXXRW: Temporary debugging. */
1313 	KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend,
1314 	    ("sosend: protocol calls sosend"));
1315 
1316 	return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1317 	    control, flags, td));
1318 }
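
/*
 * Kernel consumers normally reach the protocol through this wrapper rather
 * than calling pru_sosend directly.  A minimal sketch of sending a buffer
 * that already resides in kernel memory ('buf', 'len', 'so', and 'td' are
 * assumed to be set up by the caller):
 *
 *	struct uio auio;
 *	struct iovec aiov;
 *
 *	aiov.iov_base = buf;
 *	aiov.iov_len = len;
 *	auio.uio_iov = &aiov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = len;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_WRITE;
 *	auio.uio_td = td;
 *	error = sosend(so, NULL, &auio, NULL, NULL, 0, td);
 */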
1319 
1320 /*
1321  * The part of soreceive() that implements reading non-inline out-of-band
1322  * data from a socket.  For more complete comments, see soreceive(), from
1323  * which this code originated.
1324  *
1325  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1326  * unable to return an mbuf chain to the caller.
1327  */
1328 static int
1329 soreceive_rcvoob(so, uio, flags)
1330 	struct socket *so;
1331 	struct uio *uio;
1332 	int flags;
1333 {
1334 	struct protosw *pr = so->so_proto;
1335 	struct mbuf *m;
1336 	int error;
1337 
1338 	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1339 
1340 	m = m_get(M_TRYWAIT, MT_DATA);
1341 	if (m == NULL)
1342 		return (ENOBUFS);
1343 	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1344 	if (error)
1345 		goto bad;
1346 	do {
1347 #ifdef ZERO_COPY_SOCKETS
1348 		if (so_zero_copy_receive) {
1349 			int disposable;
1350 
1351 			if ((m->m_flags & M_EXT)
1352 			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1353 				disposable = 1;
1354 			else
1355 				disposable = 0;
1356 
1357 			error = uiomoveco(mtod(m, void *),
1358 					  min(uio->uio_resid, m->m_len),
1359 					  uio, disposable);
1360 		} else
1361 #endif /* ZERO_COPY_SOCKETS */
1362 		error = uiomove(mtod(m, void *),
1363 		    (int) min(uio->uio_resid, m->m_len), uio);
1364 		m = m_free(m);
1365 	} while (uio->uio_resid && error == 0 && m);
1366 bad:
1367 	if (m != NULL)
1368 		m_freem(m);
1369 	return (error);
1370 }
1371 
1372 /*
1373  * Following replacement or removal of the first mbuf on the first mbuf chain
1374  * of a socket buffer, push necessary state changes back into the socket
1375  * buffer so that other consumers see the values consistently.  'nextrecord'
1376  * is the callers locally stored value of the original value of
1377  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1378  * NOTE: 'nextrecord' may be NULL.
1379  */
1380 static __inline void
1381 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1382 {
1383 
1384 	SOCKBUF_LOCK_ASSERT(sb);
1385 	/*
1386 	 * First, update for the new value of nextrecord.  If necessary, make
1387 	 * it the first record.
1388 	 */
1389 	if (sb->sb_mb != NULL)
1390 		sb->sb_mb->m_nextpkt = nextrecord;
1391 	else
1392 		sb->sb_mb = nextrecord;
1393 
1394 	/*
1395 	 * Now update any dependent socket buffer fields to reflect the new
1396 	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1397 	 * addition of a second clause that takes care of the case where
1398 	 * sb_mb has been updated, but remains the last record.
1399 	 */
1400 	if (sb->sb_mb == NULL) {
1401 		sb->sb_mbtail = NULL;
1402 		sb->sb_lastrecord = NULL;
1403 	} else if (sb->sb_mb->m_nextpkt == NULL)
1404 		sb->sb_lastrecord = sb->sb_mb;
1405 }
1406 
1407 
1408 /*
1409  * Implement receive operations on a socket.  We depend on the way that
1410  * records are added to the sockbuf by sbappend.  In particular, each record
1411  * (mbufs linked through m_next) must begin with an address if the protocol
1412  * so specifies, followed by an optional mbuf or mbufs containing ancillary
1413  * data, and then zero or more mbufs of data.  In order to allow parallelism
1414  * between network receive and copying to user space, as well as avoid
1415  * sleeping with a mutex held, we release the socket buffer mutex during the
1416  * user space copy.  Although the sockbuf is locked, new data may still be
1417  * appended, and thus we must maintain consistency of the sockbuf during that
1418  * time.
1419  *
1420  * The caller may receive the data as a single mbuf chain by supplying an
1421  * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1422  * the count in uio_resid.
1423  */
1424 int
1425 soreceive_generic(so, psa, uio, mp0, controlp, flagsp)
1426 	struct socket *so;
1427 	struct sockaddr **psa;
1428 	struct uio *uio;
1429 	struct mbuf **mp0;
1430 	struct mbuf **controlp;
1431 	int *flagsp;
1432 {
1433 	struct mbuf *m, **mp;
1434 	int flags, len, error, offset;
1435 	struct protosw *pr = so->so_proto;
1436 	struct mbuf *nextrecord;
1437 	int moff, type = 0;
1438 	int mbuf_removed = 0;
1439 	int orig_resid = uio->uio_resid;
1440 
1441 	mp = mp0;
1442 	if (psa != NULL)
1443 		*psa = NULL;
1444 	if (controlp != NULL)
1445 		*controlp = NULL;
1446 	if (flagsp != NULL)
1447 		flags = *flagsp &~ MSG_EOR;
1448 	else
1449 		flags = 0;
1450 	if (flags & MSG_OOB)
1451 		return (soreceive_rcvoob(so, uio, flags));
1452 	if (mp != NULL)
1453 		*mp = NULL;
1454 	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1455 	    && uio->uio_resid)
1456 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1457 
1458 	SOCKBUF_LOCK(&so->so_rcv);
1459 restart:
1460 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1461 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1462 	if (error)
1463 		goto out;
1464 
1465 	m = so->so_rcv.sb_mb;
1466 	/*
1467 	 * If we have less data than requested, block awaiting more (subject
1468 	 * to any timeout) if:
1469 	 *   1. the current count is less than the low water mark, or
1470 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1471 	 *	receive operation at once if we block (resid <= hiwat).
1472 	 *   3. MSG_DONTWAIT is not set
1473 	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1474 	 * we have to do the receive in sections, and thus risk returning a
1475 	 * short count if a timeout or signal occurs after we start.
1476 	 */
1477 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1478 	    so->so_rcv.sb_cc < uio->uio_resid) &&
1479 	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1480 	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1481 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1482 		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1483 		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1484 		    m, so->so_rcv.sb_cc));
1485 		if (so->so_error) {
1486 			if (m != NULL)
1487 				goto dontblock;
1488 			error = so->so_error;
1489 			if ((flags & MSG_PEEK) == 0)
1490 				so->so_error = 0;
1491 			goto release;
1492 		}
1493 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1494 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1495 			if (m)
1496 				goto dontblock;
1497 			else
1498 				goto release;
1499 		}
1500 		for (; m != NULL; m = m->m_next)
1501 			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1502 				m = so->so_rcv.sb_mb;
1503 				goto dontblock;
1504 			}
1505 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1506 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1507 			error = ENOTCONN;
1508 			goto release;
1509 		}
1510 		if (uio->uio_resid == 0)
1511 			goto release;
1512 		if ((so->so_state & SS_NBIO) ||
1513 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1514 			error = EWOULDBLOCK;
1515 			goto release;
1516 		}
1517 		SBLASTRECORDCHK(&so->so_rcv);
1518 		SBLASTMBUFCHK(&so->so_rcv);
1519 		sbunlock(&so->so_rcv);
1520 		error = sbwait(&so->so_rcv);
1521 		if (error)
1522 			goto out;
1523 		goto restart;
1524 	}
1525 dontblock:
1526 	/*
1527 	 * From this point onward, we maintain 'nextrecord' as a cache of the
1528 	 * pointer to the next record in the socket buffer.  We must keep the
1529 	 * various socket buffer pointers and local stack versions of the
1530 	 * pointers in sync, pushing out modifications before dropping the
1531 	 * socket buffer mutex, and re-reading them when picking it up.
1532 	 *
1533 	 * Otherwise, we will race with the network stack appending new data
1534 	 * or records onto the socket buffer by using inconsistent/stale
1535 	 * versions of the field, possibly resulting in socket buffer
1536 	 * corruption.
1537 	 *
1538 	 * By holding the high-level sblock(), we prevent simultaneous
1539 	 * readers from pulling off the front of the socket buffer.
1540 	 */
1541 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1542 	if (uio->uio_td)
1543 		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1544 	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1545 	SBLASTRECORDCHK(&so->so_rcv);
1546 	SBLASTMBUFCHK(&so->so_rcv);
1547 	nextrecord = m->m_nextpkt;
1548 	if (pr->pr_flags & PR_ADDR) {
1549 		KASSERT(m->m_type == MT_SONAME,
1550 		    ("m->m_type == %d", m->m_type));
1551 		orig_resid = 0;
1552 		if (psa != NULL)
1553 			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1554 			    M_NOWAIT);
1555 		if (flags & MSG_PEEK) {
1556 			m = m->m_next;
1557 		} else {
1558 			sbfree(&so->so_rcv, m);
1559 			mbuf_removed = 1;
1560 			so->so_rcv.sb_mb = m_free(m);
1561 			m = so->so_rcv.sb_mb;
1562 			sockbuf_pushsync(&so->so_rcv, nextrecord);
1563 		}
1564 	}
1565 
1566 	/*
1567 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1568 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1569 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1570 	 * perform externalization (or freeing if controlp == NULL).
1571 	 */
1572 	if (m != NULL && m->m_type == MT_CONTROL) {
1573 		struct mbuf *cm = NULL, *cmn;
1574 		struct mbuf **cme = &cm;
1575 
1576 		do {
1577 			if (flags & MSG_PEEK) {
1578 				if (controlp != NULL) {
1579 					*controlp = m_copy(m, 0, m->m_len);
1580 					controlp = &(*controlp)->m_next;
1581 				}
1582 				m = m->m_next;
1583 			} else {
1584 				sbfree(&so->so_rcv, m);
1585 				mbuf_removed = 1;
1586 				so->so_rcv.sb_mb = m->m_next;
1587 				m->m_next = NULL;
1588 				*cme = m;
1589 				cme = &(*cme)->m_next;
1590 				m = so->so_rcv.sb_mb;
1591 			}
1592 		} while (m != NULL && m->m_type == MT_CONTROL);
1593 		if ((flags & MSG_PEEK) == 0)
1594 			sockbuf_pushsync(&so->so_rcv, nextrecord);
1595 		while (cm != NULL) {
1596 			cmn = cm->m_next;
1597 			cm->m_next = NULL;
1598 			if (pr->pr_domain->dom_externalize != NULL) {
1599 				SOCKBUF_UNLOCK(&so->so_rcv);
1600 				error = (*pr->pr_domain->dom_externalize)
1601 				    (cm, controlp);
1602 				SOCKBUF_LOCK(&so->so_rcv);
1603 			} else if (controlp != NULL)
1604 				*controlp = cm;
1605 			else
1606 				m_freem(cm);
1607 			if (controlp != NULL) {
1608 				orig_resid = 0;
1609 				while (*controlp != NULL)
1610 					controlp = &(*controlp)->m_next;
1611 			}
1612 			cm = cmn;
1613 		}
1614 		if (m != NULL)
1615 			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1616 		else
1617 			nextrecord = so->so_rcv.sb_mb;
1618 		orig_resid = 0;
1619 	}
1620 	if (m != NULL) {
1621 		if ((flags & MSG_PEEK) == 0) {
1622 			KASSERT(m->m_nextpkt == nextrecord,
1623 			    ("soreceive: post-control, nextrecord !sync"));
1624 			if (nextrecord == NULL) {
1625 				KASSERT(so->so_rcv.sb_mb == m,
1626 				    ("soreceive: post-control, sb_mb!=m"));
1627 				KASSERT(so->so_rcv.sb_lastrecord == m,
1628 				    ("soreceive: post-control, lastrecord!=m"));
1629 			}
1630 		}
1631 		type = m->m_type;
1632 		if (type == MT_OOBDATA)
1633 			flags |= MSG_OOB;
1634 	} else {
1635 		if ((flags & MSG_PEEK) == 0) {
1636 			KASSERT(so->so_rcv.sb_mb == nextrecord,
1637 			    ("soreceive: sb_mb != nextrecord"));
1638 			if (so->so_rcv.sb_mb == NULL) {
1639 				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1640 				    ("soreceive: sb_lastrecord != NULL"));
1641 			}
1642 		}
1643 	}
1644 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1645 	SBLASTRECORDCHK(&so->so_rcv);
1646 	SBLASTMBUFCHK(&so->so_rcv);
1647 
1648 	/*
1649 	 * Now continue to read any data mbufs off of the head of the socket
1650 	 * buffer until the read request is satisfied.  Note that 'type' is
1651 	 * used to store the type of any mbuf reads that have happened so far
1652 	 * such that soreceive() can stop reading if the type changes, which
1653 	 * causes soreceive() to return only one of regular data and inline
1654 	 * out-of-band data in a single socket receive operation.
1655 	 */
1656 	moff = 0;
1657 	offset = 0;
1658 	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1659 		/*
1660 		 * If the type of mbuf has changed since the last mbuf
1661 		 * examined ('type'), end the receive operation.
1662 		 */
1663 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1664 		if (m->m_type == MT_OOBDATA) {
1665 			if (type != MT_OOBDATA)
1666 				break;
1667 		} else if (type == MT_OOBDATA)
1668 			break;
1669 		else
1670 		    KASSERT(m->m_type == MT_DATA,
1671 			("m->m_type == %d", m->m_type));
1672 		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1673 		len = uio->uio_resid;
1674 		if (so->so_oobmark && len > so->so_oobmark - offset)
1675 			len = so->so_oobmark - offset;
1676 		if (len > m->m_len - moff)
1677 			len = m->m_len - moff;
1678 		/*
1679 		 * If mp is set, just pass back the mbufs.  Otherwise copy
1680 		 * them out via the uio, then free.  Sockbuf must be
1681 		 * consistent here (points to current mbuf, it points to next
1682 		 * record) when we drop priority; we must note any additions
1683 		 * to the sockbuf when we block interrupts again.
1684 		 */
1685 		if (mp == NULL) {
1686 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1687 			SBLASTRECORDCHK(&so->so_rcv);
1688 			SBLASTMBUFCHK(&so->so_rcv);
1689 			SOCKBUF_UNLOCK(&so->so_rcv);
1690 #ifdef ZERO_COPY_SOCKETS
1691 			if (so_zero_copy_receive) {
1692 				int disposable;
1693 
1694 				if ((m->m_flags & M_EXT)
1695 				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1696 					disposable = 1;
1697 				else
1698 					disposable = 0;
1699 
1700 				error = uiomoveco(mtod(m, char *) + moff,
1701 						  (int)len, uio,
1702 						  disposable);
1703 			} else
1704 #endif /* ZERO_COPY_SOCKETS */
1705 			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1706 			SOCKBUF_LOCK(&so->so_rcv);
1707 			if (error) {
1708 				/*
1709 				 * If any part of the record has been removed
1710 				 * (such as the MT_SONAME mbuf, which will
1711 				 * happen when PR_ADDR, and thus also
1712 				 * PR_ATOMIC, is set), then drop the entire
1713 				 * record to maintain the atomicity of the
1714 				 * receive operation.
1715 				 */
1716 				if (m && mbuf_removed &&
1717 				    (pr->pr_flags & PR_ATOMIC))
1718 					(void)sbdroprecord_locked(&so->so_rcv);
1719 				goto release;
1720 			}
1721 		} else
1722 			uio->uio_resid -= len;
1723 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1724 		if (len == m->m_len - moff) {
1725 			if (m->m_flags & M_EOR)
1726 				flags |= MSG_EOR;
1727 			if (flags & MSG_PEEK) {
1728 				m = m->m_next;
1729 				moff = 0;
1730 			} else {
1731 				nextrecord = m->m_nextpkt;
1732 				sbfree(&so->so_rcv, m);
1733 				if (mp != NULL) {
1734 					*mp = m;
1735 					mp = &m->m_next;
1736 					so->so_rcv.sb_mb = m = m->m_next;
1737 					*mp = NULL;
1738 				} else {
1739 					so->so_rcv.sb_mb = m_free(m);
1740 					m = so->so_rcv.sb_mb;
1741 				}
1742 				sockbuf_pushsync(&so->so_rcv, nextrecord);
1743 				SBLASTRECORDCHK(&so->so_rcv);
1744 				SBLASTMBUFCHK(&so->so_rcv);
1745 			}
1746 		} else {
1747 			if (flags & MSG_PEEK)
1748 				moff += len;
1749 			else {
1750 				if (mp != NULL) {
1751 					int copy_flag;
1752 
1753 					if (flags & MSG_DONTWAIT)
1754 						copy_flag = M_DONTWAIT;
1755 					else
1756 						copy_flag = M_TRYWAIT;
1757 					if (copy_flag == M_TRYWAIT)
1758 						SOCKBUF_UNLOCK(&so->so_rcv);
1759 					*mp = m_copym(m, 0, len, copy_flag);
1760 					if (copy_flag == M_TRYWAIT)
1761 						SOCKBUF_LOCK(&so->so_rcv);
1762 					if (*mp == NULL) {
1763 						/*
1764 						 * m_copym() couldn't
1765 						 * allocate an mbuf.  Adjust
1766 						 * uio_resid back (it was
1767 						 * adjusted down by len
1768 						 * bytes, which we didn't end
1769 						 * up "copying" over).
1770 						 */
1771 						uio->uio_resid += len;
1772 						break;
1773 					}
1774 				}
1775 				m->m_data += len;
1776 				m->m_len -= len;
1777 				so->so_rcv.sb_cc -= len;
1778 			}
1779 		}
1780 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1781 		if (so->so_oobmark) {
1782 			if ((flags & MSG_PEEK) == 0) {
1783 				so->so_oobmark -= len;
1784 				if (so->so_oobmark == 0) {
1785 					so->so_rcv.sb_state |= SBS_RCVATMARK;
1786 					break;
1787 				}
1788 			} else {
1789 				offset += len;
1790 				if (offset == so->so_oobmark)
1791 					break;
1792 			}
1793 		}
1794 		if (flags & MSG_EOR)
1795 			break;
1796 		/*
1797 		 * If the MSG_WAITALL flag is set (for a non-atomic socket),
1798 		 * we must not quit until "uio->uio_resid == 0" or the receive
1799 		 * terminates with an error.  If a signal/timeout occurs, return
1800 		 * with a short count but without error.  Keep the sockbuf locked
1801 		 * against other readers.
1802 		 */
1803 		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1804 		    !sosendallatonce(so) && nextrecord == NULL) {
1805 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1806 			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1807 				break;
1808 			/*
1809 			 * Notify the protocol that some data has been
1810 			 * drained before blocking.
1811 			 */
1812 			if (pr->pr_flags & PR_WANTRCVD) {
1813 				SOCKBUF_UNLOCK(&so->so_rcv);
1814 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1815 				SOCKBUF_LOCK(&so->so_rcv);
1816 			}
1817 			SBLASTRECORDCHK(&so->so_rcv);
1818 			SBLASTMBUFCHK(&so->so_rcv);
1819 			error = sbwait(&so->so_rcv);
1820 			if (error)
1821 				goto release;
1822 			m = so->so_rcv.sb_mb;
1823 			if (m != NULL)
1824 				nextrecord = m->m_nextpkt;
1825 		}
1826 	}
1827 
1828 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1829 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1830 		flags |= MSG_TRUNC;
1831 		if ((flags & MSG_PEEK) == 0)
1832 			(void) sbdroprecord_locked(&so->so_rcv);
1833 	}
1834 	if ((flags & MSG_PEEK) == 0) {
1835 		if (m == NULL) {
1836 			/*
1837 			 * First part is an inline SB_EMPTY_FIXUP().  Second
1838 			 * part makes sure sb_lastrecord is up-to-date if
1839 			 * there is still data in the socket buffer.
1840 			 */
1841 			so->so_rcv.sb_mb = nextrecord;
1842 			if (so->so_rcv.sb_mb == NULL) {
1843 				so->so_rcv.sb_mbtail = NULL;
1844 				so->so_rcv.sb_lastrecord = NULL;
1845 			} else if (nextrecord->m_nextpkt == NULL)
1846 				so->so_rcv.sb_lastrecord = nextrecord;
1847 		}
1848 		SBLASTRECORDCHK(&so->so_rcv);
1849 		SBLASTMBUFCHK(&so->so_rcv);
1850 		/*
1851 		 * If soreceive() is being done from the socket callback,
1852 		 * then we don't need to generate an ACK to the peer to update
1853 		 * the window, since the ACK will be generated on return to TCP.
1854 		 */
1855 		if (!(flags & MSG_SOCALLBCK) &&
1856 		    (pr->pr_flags & PR_WANTRCVD)) {
1857 			SOCKBUF_UNLOCK(&so->so_rcv);
1858 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1859 			SOCKBUF_LOCK(&so->so_rcv);
1860 		}
1861 	}
1862 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1863 	if (orig_resid == uio->uio_resid && orig_resid &&
1864 	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1865 		sbunlock(&so->so_rcv);
1866 		goto restart;
1867 	}
1868 
1869 	if (flagsp != NULL)
1870 		*flagsp |= flags;
1871 release:
1872 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1873 	sbunlock(&so->so_rcv);
1874 out:
1875 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1876 	SOCKBUF_UNLOCK(&so->so_rcv);
1877 	return (error);
1878 }
1879 
1880 int
1881 soreceive(so, psa, uio, mp0, controlp, flagsp)
1882 	struct socket *so;
1883 	struct sockaddr **psa;
1884 	struct uio *uio;
1885 	struct mbuf **mp0;
1886 	struct mbuf **controlp;
1887 	int *flagsp;
1888 {
1889 
1890 	/* XXXRW: Temporary debugging. */
1891 	KASSERT(so->so_proto->pr_usrreqs->pru_soreceive != soreceive,
1892 	    ("soreceive: protocol calls soreceive"));
1893 
1894 	return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
1895 	    controlp, flagsp));
1896 }
1897 
1898 int
1899 soshutdown(so, how)
1900 	struct socket *so;
1901 	int how;
1902 {
1903 	struct protosw *pr = so->so_proto;
1904 
1905 	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1906 		return (EINVAL);
1907 
1908 	if (how != SHUT_WR)
1909 		sorflush(so);
1910 	if (how != SHUT_RD)
1911 		return ((*pr->pr_usrreqs->pru_shutdown)(so));
1912 	return (0);
1913 }
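
/*
 * Illustrative only: a minimal sketch (not part of this file, wrapped in
 * #if 0) of how an in-kernel consumer might use soshutdown() to close just
 * the send side of a connected socket, so the peer sees EOF while replies
 * can still be read.  The function name is hypothetical.
 */
#if 0
static int
example_halfclose_send(struct socket *so)
{

	/*
	 * SHUT_WR skips sorflush() and goes straight to the protocol's
	 * pru_shutdown(); SHUT_RD or SHUT_RDWR would flush the receive
	 * buffer first, as in soshutdown() above.
	 */
	return (soshutdown(so, SHUT_WR));
}
#endif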
1914 
1915 void
1916 sorflush(so)
1917 	struct socket *so;
1918 {
1919 	struct sockbuf *sb = &so->so_rcv;
1920 	struct protosw *pr = so->so_proto;
1921 	struct sockbuf asb;
1922 
1923 	/*
1924 	 * XXXRW: This is quite ugly.  Previously, this code made a copy of
1925 	 * the socket buffer, then zero'd the original to clear the buffer
1926 	 * fields.  However, with mutexes in the socket buffer, this causes
1927 	 * problems.  We only clear the zeroable bits of the original;
1928 	 * however, we have to initialize and destroy the mutex in the copy
1929 	 * so that dom_dispose() and sbrelease() can lock it as needed.
1930 	 */
1931 	SOCKBUF_LOCK(sb);
1932 	sb->sb_flags |= SB_NOINTR;
1933 	(void) sblock(sb, M_WAITOK);
1934 	/*
1935 	 * socantrcvmore_locked() drops the socket buffer mutex so that it
1936 	 * can safely perform wakeups.  Re-acquire the mutex before
1937 	 * continuing.
1938 	 */
1939 	socantrcvmore_locked(so);
1940 	SOCKBUF_LOCK(sb);
1941 	sbunlock(sb);
1942 	/*
1943 	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
1944 	 * and mutex data unchanged.
1945 	 */
1946 	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1947 	bcopy(&sb->sb_startzero, &asb.sb_startzero,
1948 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1949 	bzero(&sb->sb_startzero,
1950 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1951 	SOCKBUF_UNLOCK(sb);
1952 
1953 	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1954 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1955 		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
1956 	sbrelease(&asb, so);
1957 	SOCKBUF_LOCK_DESTROY(&asb);
1958 }
1959 
1960 /*
1961  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
1962  * additional variant to handle the case where the option value needs to be
1963  * some kind of integer, but not a specific size.  In addition to their use
1964  * here, these functions are also called by the protocol-level pr_ctloutput()
1965  * routines.
1966  */
1967 int
1968 sooptcopyin(sopt, buf, len, minlen)
1969 	struct	sockopt *sopt;
1970 	void	*buf;
1971 	size_t	len;
1972 	size_t	minlen;
1973 {
1974 	size_t	valsize;
1975 
1976 	/*
1977 	 * If the user gives us more than we wanted, we ignore it, but if we
1978 	 * don't get the minimum length the caller wants, we return EINVAL.
1979 	 * On success, sopt->sopt_valsize is set to however much we actually
1980 	 * retrieved.
1981 	 */
1982 	if ((valsize = sopt->sopt_valsize) < minlen)
1983 		return EINVAL;
1984 	if (valsize > len)
1985 		sopt->sopt_valsize = valsize = len;
1986 
1987 	if (sopt->sopt_td != NULL)
1988 		return (copyin(sopt->sopt_val, buf, valsize));
1989 
1990 	bcopy(sopt->sopt_val, buf, valsize);
1991 	return (0);
1992 }
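
/*
 * Illustrative only: a hedged sketch (under #if 0, not compiled) of how a
 * protocol-level pr_ctloutput() SET handler might use sooptcopyin() to pull
 * an integer option value out of the request, as mentioned in the comment
 * above.  EXAMPLE_OPTION and the "apply" step are hypothetical.
 */
#if 0
static int
example_ctloutput_set(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	switch (sopt->sopt_name) {
	case EXAMPLE_OPTION:
		/* Require at least sizeof(int); longer input is truncated. */
		error = sooptcopyin(sopt, &optval, sizeof(optval),
		    sizeof(optval));
		if (error)
			return (error);
		/* ... apply optval to per-protocol state here ... */
		return (0);
	default:
		return (ENOPROTOOPT);
	}
}
#endif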
1993 
1994 /*
1995  * Kernel version of setsockopt(2).
1996  *
1997  * XXX: optlen is size_t, not socklen_t
1998  */
1999 int
2000 so_setsockopt(struct socket *so, int level, int optname, void *optval,
2001     size_t optlen)
2002 {
2003 	struct sockopt sopt;
2004 
2005 	sopt.sopt_level = level;
2006 	sopt.sopt_name = optname;
2007 	sopt.sopt_dir = SOPT_SET;
2008 	sopt.sopt_val = optval;
2009 	sopt.sopt_valsize = optlen;
2010 	sopt.sopt_td = NULL;
2011 	return (sosetopt(so, &sopt));
2012 }
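
/*
 * Illustrative only: a minimal sketch (under #if 0) of an in-kernel caller
 * using so_setsockopt().  Because the option value lives in kernel memory
 * and sopt_td is NULL, sooptcopyin() will use bcopy() rather than copyin().
 * The function name is hypothetical.
 */
#if 0
static int
example_enable_keepalive(struct socket *so)
{
	int one = 1;

	return (so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one,
	    sizeof(one)));
}
#endif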
2013 
2014 int
2015 sosetopt(so, sopt)
2016 	struct socket *so;
2017 	struct sockopt *sopt;
2018 {
2019 	int	error, optval;
2020 	struct	linger l;
2021 	struct	timeval tv;
2022 	u_long  val;
2023 #ifdef MAC
2024 	struct mac extmac;
2025 #endif
2026 
2027 	error = 0;
2028 	if (sopt->sopt_level != SOL_SOCKET) {
2029 		if (so->so_proto && so->so_proto->pr_ctloutput)
2030 			return ((*so->so_proto->pr_ctloutput)
2031 				  (so, sopt));
2032 		error = ENOPROTOOPT;
2033 	} else {
2034 		switch (sopt->sopt_name) {
2035 #ifdef INET
2036 		case SO_ACCEPTFILTER:
2037 			error = do_setopt_accept_filter(so, sopt);
2038 			if (error)
2039 				goto bad;
2040 			break;
2041 #endif
2042 		case SO_LINGER:
2043 			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2044 			if (error)
2045 				goto bad;
2046 
2047 			SOCK_LOCK(so);
2048 			so->so_linger = l.l_linger;
2049 			if (l.l_onoff)
2050 				so->so_options |= SO_LINGER;
2051 			else
2052 				so->so_options &= ~SO_LINGER;
2053 			SOCK_UNLOCK(so);
2054 			break;
2055 
2056 		case SO_DEBUG:
2057 		case SO_KEEPALIVE:
2058 		case SO_DONTROUTE:
2059 		case SO_USELOOPBACK:
2060 		case SO_BROADCAST:
2061 		case SO_REUSEADDR:
2062 		case SO_REUSEPORT:
2063 		case SO_OOBINLINE:
2064 		case SO_TIMESTAMP:
2065 		case SO_BINTIME:
2066 		case SO_NOSIGPIPE:
2067 			error = sooptcopyin(sopt, &optval, sizeof optval,
2068 					    sizeof optval);
2069 			if (error)
2070 				goto bad;
2071 			SOCK_LOCK(so);
2072 			if (optval)
2073 				so->so_options |= sopt->sopt_name;
2074 			else
2075 				so->so_options &= ~sopt->sopt_name;
2076 			SOCK_UNLOCK(so);
2077 			break;
2078 
2079 		case SO_SNDBUF:
2080 		case SO_RCVBUF:
2081 		case SO_SNDLOWAT:
2082 		case SO_RCVLOWAT:
2083 			error = sooptcopyin(sopt, &optval, sizeof optval,
2084 					    sizeof optval);
2085 			if (error)
2086 				goto bad;
2087 
2088 			/*
2089 			 * Values < 1 make no sense for any of these options,
2090 			 * so disallow them.
2091 			 */
2092 			if (optval < 1) {
2093 				error = EINVAL;
2094 				goto bad;
2095 			}
2096 
2097 			switch (sopt->sopt_name) {
2098 			case SO_SNDBUF:
2099 			case SO_RCVBUF:
2100 				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2101 				    &so->so_snd : &so->so_rcv, (u_long)optval,
2102 				    so, curthread) == 0) {
2103 					error = ENOBUFS;
2104 					goto bad;
2105 				}
2106 				break;
2107 
2108 			/*
2109 			 * Make sure the low-water is never greater than the
2110 			 * high-water.
2111 			 */
2112 			case SO_SNDLOWAT:
2113 				SOCKBUF_LOCK(&so->so_snd);
2114 				so->so_snd.sb_lowat =
2115 				    (optval > so->so_snd.sb_hiwat) ?
2116 				    so->so_snd.sb_hiwat : optval;
2117 				SOCKBUF_UNLOCK(&so->so_snd);
2118 				break;
2119 			case SO_RCVLOWAT:
2120 				SOCKBUF_LOCK(&so->so_rcv);
2121 				so->so_rcv.sb_lowat =
2122 				    (optval > so->so_rcv.sb_hiwat) ?
2123 				    so->so_rcv.sb_hiwat : optval;
2124 				SOCKBUF_UNLOCK(&so->so_rcv);
2125 				break;
2126 			}
2127 			break;
2128 
2129 		case SO_SNDTIMEO:
2130 		case SO_RCVTIMEO:
2131 #ifdef COMPAT_IA32
2132 			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2133 				struct timeval32 tv32;
2134 
2135 				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2136 				    sizeof tv32);
2137 				CP(tv32, tv, tv_sec);
2138 				CP(tv32, tv, tv_usec);
2139 			} else
2140 #endif
2141 				error = sooptcopyin(sopt, &tv, sizeof tv,
2142 				    sizeof tv);
2143 			if (error)
2144 				goto bad;
2145 
2146 			/* assert(hz > 0); */
2147 			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2148 			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2149 				error = EDOM;
2150 				goto bad;
2151 			}
2152 			/* assert(tick > 0); */
2153 			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2154 			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2155 			if (val > INT_MAX) {
2156 				error = EDOM;
2157 				goto bad;
2158 			}
2159 			if (val == 0 && tv.tv_usec != 0)
2160 				val = 1;
2161 
2162 			switch (sopt->sopt_name) {
2163 			case SO_SNDTIMEO:
2164 				so->so_snd.sb_timeo = val;
2165 				break;
2166 			case SO_RCVTIMEO:
2167 				so->so_rcv.sb_timeo = val;
2168 				break;
2169 			}
2170 			break;
2171 
2172 		case SO_LABEL:
2173 #ifdef MAC
2174 			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2175 			    sizeof extmac);
2176 			if (error)
2177 				goto bad;
2178 			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2179 			    so, &extmac);
2180 #else
2181 			error = EOPNOTSUPP;
2182 #endif
2183 			break;
2184 
2185 		default:
2186 			error = ENOPROTOOPT;
2187 			break;
2188 		}
2189 		if (error == 0 && so->so_proto != NULL &&
2190 		    so->so_proto->pr_ctloutput != NULL) {
2191 			(void) ((*so->so_proto->pr_ctloutput)
2192 				  (so, sopt));
2193 		}
2194 	}
2195 bad:
2196 	return (error);
2197 }
2198 
2199 /*
2200  * Helper routine for getsockopt.
2201  */
2202 int
2203 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2204 {
2205 	int	error;
2206 	size_t	valsize;
2207 
2208 	error = 0;
2209 
2210 	/*
2211 	 * Documented get behavior is that we always return a value, possibly
2212 	 * truncated to fit in the user's buffer.  Traditional behavior is
2213 	 * that we always tell the user precisely how much we copied, rather
2214 	 * than something useful like the total amount we had available for
2215 	 * her.  Note that this interface is not idempotent; the entire
2216 	 * answer must be generated ahead of time.
2217 	 */
2218 	valsize = min(len, sopt->sopt_valsize);
2219 	sopt->sopt_valsize = valsize;
2220 	if (sopt->sopt_val != NULL) {
2221 		if (sopt->sopt_td != NULL)
2222 			error = copyout(buf, sopt->sopt_val, valsize);
2223 		else
2224 			bcopy(buf, sopt->sopt_val, valsize);
2225 	}
2226 	return (error);
2227 }
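
/*
 * Illustrative only: the GET-side counterpart of the sketch after
 * sooptcopyin() above (under #if 0, names hypothetical).  A pr_ctloutput()
 * handler reports an integer value with sooptcopyout(), which truncates to
 * the caller's buffer and records the copied size in sopt_valsize.
 */
#if 0
static int
example_ctloutput_get(struct socket *so, struct sockopt *sopt)
{
	int optval;

	switch (sopt->sopt_name) {
	case EXAMPLE_OPTION:
		optval = example_get_flag(so);	/* hypothetical accessor */
		return (sooptcopyout(sopt, &optval, sizeof(optval)));
	default:
		return (ENOPROTOOPT);
	}
}
#endif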
2228 
2229 int
2230 sogetopt(so, sopt)
2231 	struct socket *so;
2232 	struct sockopt *sopt;
2233 {
2234 	int	error, optval;
2235 	struct	linger l;
2236 	struct	timeval tv;
2237 #ifdef MAC
2238 	struct mac extmac;
2239 #endif
2240 
2241 	error = 0;
2242 	if (sopt->sopt_level != SOL_SOCKET) {
2243 		if (so->so_proto && so->so_proto->pr_ctloutput) {
2244 			return ((*so->so_proto->pr_ctloutput)
2245 				  (so, sopt));
2246 		} else
2247 			return (ENOPROTOOPT);
2248 	} else {
2249 		switch (sopt->sopt_name) {
2250 #ifdef INET
2251 		case SO_ACCEPTFILTER:
2252 			error = do_getopt_accept_filter(so, sopt);
2253 			break;
2254 #endif
2255 		case SO_LINGER:
2256 			SOCK_LOCK(so);
2257 			l.l_onoff = so->so_options & SO_LINGER;
2258 			l.l_linger = so->so_linger;
2259 			SOCK_UNLOCK(so);
2260 			error = sooptcopyout(sopt, &l, sizeof l);
2261 			break;
2262 
2263 		case SO_USELOOPBACK:
2264 		case SO_DONTROUTE:
2265 		case SO_DEBUG:
2266 		case SO_KEEPALIVE:
2267 		case SO_REUSEADDR:
2268 		case SO_REUSEPORT:
2269 		case SO_BROADCAST:
2270 		case SO_OOBINLINE:
2271 		case SO_ACCEPTCONN:
2272 		case SO_TIMESTAMP:
2273 		case SO_BINTIME:
2274 		case SO_NOSIGPIPE:
2275 			optval = so->so_options & sopt->sopt_name;
2276 integer:
2277 			error = sooptcopyout(sopt, &optval, sizeof optval);
2278 			break;
2279 
2280 		case SO_TYPE:
2281 			optval = so->so_type;
2282 			goto integer;
2283 
2284 		case SO_ERROR:
2285 			SOCK_LOCK(so);
2286 			optval = so->so_error;
2287 			so->so_error = 0;
2288 			SOCK_UNLOCK(so);
2289 			goto integer;
2290 
2291 		case SO_SNDBUF:
2292 			optval = so->so_snd.sb_hiwat;
2293 			goto integer;
2294 
2295 		case SO_RCVBUF:
2296 			optval = so->so_rcv.sb_hiwat;
2297 			goto integer;
2298 
2299 		case SO_SNDLOWAT:
2300 			optval = so->so_snd.sb_lowat;
2301 			goto integer;
2302 
2303 		case SO_RCVLOWAT:
2304 			optval = so->so_rcv.sb_lowat;
2305 			goto integer;
2306 
2307 		case SO_SNDTIMEO:
2308 		case SO_RCVTIMEO:
2309 			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2310 				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2311 
2312 			tv.tv_sec = optval / hz;
2313 			tv.tv_usec = (optval % hz) * tick;
2314 #ifdef COMPAT_IA32
2315 			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2316 				struct timeval32 tv32;
2317 
2318 				CP(tv, tv32, tv_sec);
2319 				CP(tv, tv32, tv_usec);
2320 				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2321 			} else
2322 #endif
2323 				error = sooptcopyout(sopt, &tv, sizeof tv);
2324 			break;
2325 
2326 		case SO_LABEL:
2327 #ifdef MAC
2328 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2329 			    sizeof(extmac));
2330 			if (error)
2331 				return (error);
2332 			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2333 			    so, &extmac);
2334 			if (error)
2335 				return (error);
2336 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2337 #else
2338 			error = EOPNOTSUPP;
2339 #endif
2340 			break;
2341 
2342 		case SO_PEERLABEL:
2343 #ifdef MAC
2344 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2345 			    sizeof(extmac));
2346 			if (error)
2347 				return (error);
2348 			error = mac_getsockopt_peerlabel(
2349 			    sopt->sopt_td->td_ucred, so, &extmac);
2350 			if (error)
2351 				return (error);
2352 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2353 #else
2354 			error = EOPNOTSUPP;
2355 #endif
2356 			break;
2357 
2358 		case SO_LISTENQLIMIT:
2359 			optval = so->so_qlimit;
2360 			goto integer;
2361 
2362 		case SO_LISTENQLEN:
2363 			optval = so->so_qlen;
2364 			goto integer;
2365 
2366 		case SO_LISTENINCQLEN:
2367 			optval = so->so_incqlen;
2368 			goto integer;
2369 
2370 		default:
2371 			error = ENOPROTOOPT;
2372 			break;
2373 		}
2374 		return (error);
2375 	}
2376 }
2377 
2378 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2379 int
2380 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2381 {
2382 	struct mbuf *m, *m_prev;
2383 	int sopt_size = sopt->sopt_valsize;
2384 
2385 	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2386 	if (m == NULL)
2387 		return ENOBUFS;
2388 	if (sopt_size > MLEN) {
2389 		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2390 		if ((m->m_flags & M_EXT) == 0) {
2391 			m_free(m);
2392 			return ENOBUFS;
2393 		}
2394 		m->m_len = min(MCLBYTES, sopt_size);
2395 	} else {
2396 		m->m_len = min(MLEN, sopt_size);
2397 	}
2398 	sopt_size -= m->m_len;
2399 	*mp = m;
2400 	m_prev = m;
2401 
2402 	while (sopt_size) {
2403 		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2404 		if (m == NULL) {
2405 			m_freem(*mp);
2406 			return ENOBUFS;
2407 		}
2408 		if (sopt_size > MLEN) {
2409 			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2410 			    M_DONTWAIT);
2411 			if ((m->m_flags & M_EXT) == 0) {
2412 				m_freem(m);
2413 				m_freem(*mp);
2414 				return ENOBUFS;
2415 			}
2416 			m->m_len = min(MCLBYTES, sopt_size);
2417 		} else {
2418 			m->m_len = min(MLEN, sopt_size);
2419 		}
2420 		sopt_size -= m->m_len;
2421 		m_prev->m_next = m;
2422 		m_prev = m;
2423 	}
2424 	return (0);
2425 }
2426 
2427 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2428 int
2429 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2430 {
2431 	struct mbuf *m0 = m;
2432 
2433 	if (sopt->sopt_val == NULL)
2434 		return (0);
2435 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2436 		if (sopt->sopt_td != NULL) {
2437 			int error;
2438 
2439 			error = copyin(sopt->sopt_val, mtod(m, char *),
2440 				       m->m_len);
2441 			if (error != 0) {
2442 				m_freem(m0);
2443 				return (error);
2444 			}
2445 		} else
2446 			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2447 		sopt->sopt_valsize -= m->m_len;
2448 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2449 		m = m->m_next;
2450 	}
2451 	if (m != NULL) /* soopt_getm() should have allocated enough space */
2452 		panic("ip6_sooptmcopyin");
2453 	return (0);
2454 }
2455 
2456 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2457 int
2458 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2459 {
2460 	struct mbuf *m0 = m;
2461 	size_t valsize = 0;
2462 
2463 	if (sopt->sopt_val == NULL)
2464 		return (0);
2465 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2466 		if (sopt->sopt_td != NULL) {
2467 			int error;
2468 
2469 			error = copyout(mtod(m, char *), sopt->sopt_val,
2470 				       m->m_len);
2471 			if (error != 0) {
2472 				m_freem(m0);
2473 				return (error);
2474 			}
2475 		} else
2476 			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2477 		sopt->sopt_valsize -= m->m_len;
2478 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2479 		valsize += m->m_len;
2480 		m = m->m_next;
2481 	}
2482 	if (m != NULL) {
2483 		/* the caller should have supplied a large enough sockopt buffer */
2484 		m_freem(m0);
2485 		return (EINVAL);
2486 	}
2487 	sopt->sopt_valsize = valsize;
2488 	return (0);
2489 }
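
/*
 * Illustrative only: a hedged sketch (under #if 0) of the mbuf-based
 * compatibility round trip for a SET request: soopt_getm() sizes an mbuf
 * chain to the option, soopt_mcopyin() fills it from the caller, and the
 * chain is handed to a legacy handler that still expects options as mbufs.
 * legacy_ctloutput_mbuf() is hypothetical.
 */
#if 0
static int
example_legacy_set(struct socket *so, struct sockopt *sopt)
{
	struct mbuf *m;
	int error;

	error = soopt_getm(sopt, &m);
	if (error != 0)
		return (error);
	/* soopt_mcopyin() frees the chain itself if the copyin fails. */
	error = soopt_mcopyin(sopt, m);
	if (error != 0)
		return (error);
	return (legacy_ctloutput_mbuf(so, sopt->sopt_name, m));
}
#endif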
2490 
2491 /*
2492  * sohasoutofband(): protocol notifies socket layer of the arrival of new
2493  * out-of-band data, which will then notify socket consumers.
2494  */
2495 void
2496 sohasoutofband(so)
2497 	struct socket *so;
2498 {
2499 	if (so->so_sigio != NULL)
2500 		pgsigio(&so->so_sigio, SIGURG, 0);
2501 	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2502 }
2503 
2504 int
2505 sopoll(struct socket *so, int events, struct ucred *active_cred,
2506     struct thread *td)
2507 {
2508 
2509 	/* XXXRW: Temporary debugging. */
2510 	KASSERT(so->so_proto->pr_usrreqs->pru_sopoll != sopoll,
2511 	    ("sopoll: protocol calls sopoll"));
2512 
2513 	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2514 	    td));
2515 }
2516 
2517 int
2518 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2519     struct thread *td)
2520 {
2521 	int revents = 0;
2522 
2523 	SOCKBUF_LOCK(&so->so_snd);
2524 	SOCKBUF_LOCK(&so->so_rcv);
2525 	if (events & (POLLIN | POLLRDNORM))
2526 		if (soreadable(so))
2527 			revents |= events & (POLLIN | POLLRDNORM);
2528 
2529 	if (events & POLLINIGNEOF)
2530 		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2531 		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2532 			revents |= POLLINIGNEOF;
2533 
2534 	if (events & (POLLOUT | POLLWRNORM))
2535 		if (sowriteable(so))
2536 			revents |= events & (POLLOUT | POLLWRNORM);
2537 
2538 	if (events & (POLLPRI | POLLRDBAND))
2539 		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2540 			revents |= events & (POLLPRI | POLLRDBAND);
2541 
2542 	if (revents == 0) {
2543 		if (events &
2544 		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2545 		     POLLRDBAND)) {
2546 			selrecord(td, &so->so_rcv.sb_sel);
2547 			so->so_rcv.sb_flags |= SB_SEL;
2548 		}
2549 
2550 		if (events & (POLLOUT | POLLWRNORM)) {
2551 			selrecord(td, &so->so_snd.sb_sel);
2552 			so->so_snd.sb_flags |= SB_SEL;
2553 		}
2554 	}
2555 
2556 	SOCKBUF_UNLOCK(&so->so_rcv);
2557 	SOCKBUF_UNLOCK(&so->so_snd);
2558 	return (revents);
2559 }
2560 
2561 int
2562 soo_kqfilter(struct file *fp, struct knote *kn)
2563 {
2564 	struct socket *so = kn->kn_fp->f_data;
2565 	struct sockbuf *sb;
2566 
2567 	switch (kn->kn_filter) {
2568 	case EVFILT_READ:
2569 		if (so->so_options & SO_ACCEPTCONN)
2570 			kn->kn_fop = &solisten_filtops;
2571 		else
2572 			kn->kn_fop = &soread_filtops;
2573 		sb = &so->so_rcv;
2574 		break;
2575 	case EVFILT_WRITE:
2576 		kn->kn_fop = &sowrite_filtops;
2577 		sb = &so->so_snd;
2578 		break;
2579 	default:
2580 		return (EINVAL);
2581 	}
2582 
2583 	SOCKBUF_LOCK(sb);
2584 	knlist_add(&sb->sb_sel.si_note, kn, 1);
2585 	sb->sb_flags |= SB_KNOTE;
2586 	SOCKBUF_UNLOCK(sb);
2587 	return (0);
2588 }
2589 
2590 static void
2591 filt_sordetach(struct knote *kn)
2592 {
2593 	struct socket *so = kn->kn_fp->f_data;
2594 
2595 	SOCKBUF_LOCK(&so->so_rcv);
2596 	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2597 	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2598 		so->so_rcv.sb_flags &= ~SB_KNOTE;
2599 	SOCKBUF_UNLOCK(&so->so_rcv);
2600 }
2601 
2602 /*ARGSUSED*/
2603 static int
2604 filt_soread(struct knote *kn, long hint)
2605 {
2606 	struct socket *so;
2607 
2608 	so = kn->kn_fp->f_data;
2609 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2610 
2611 	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2612 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2613 		kn->kn_flags |= EV_EOF;
2614 		kn->kn_fflags = so->so_error;
2615 		return (1);
2616 	} else if (so->so_error)	/* temporary udp error */
2617 		return (1);
2618 	else if (kn->kn_sfflags & NOTE_LOWAT)
2619 		return (kn->kn_data >= kn->kn_sdata);
2620 	else
2621 		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2622 }
2623 
2624 static void
2625 filt_sowdetach(struct knote *kn)
2626 {
2627 	struct socket *so = kn->kn_fp->f_data;
2628 
2629 	SOCKBUF_LOCK(&so->so_snd);
2630 	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2631 	if (knlist_empty(&so->so_snd.sb_sel.si_note))
2632 		so->so_snd.sb_flags &= ~SB_KNOTE;
2633 	SOCKBUF_UNLOCK(&so->so_snd);
2634 }
2635 
2636 /*ARGSUSED*/
2637 static int
2638 filt_sowrite(struct knote *kn, long hint)
2639 {
2640 	struct socket *so;
2641 
2642 	so = kn->kn_fp->f_data;
2643 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
2644 	kn->kn_data = sbspace(&so->so_snd);
2645 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2646 		kn->kn_flags |= EV_EOF;
2647 		kn->kn_fflags = so->so_error;
2648 		return (1);
2649 	} else if (so->so_error)	/* temporary udp error */
2650 		return (1);
2651 	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2652 	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
2653 		return (0);
2654 	else if (kn->kn_sfflags & NOTE_LOWAT)
2655 		return (kn->kn_data >= kn->kn_sdata);
2656 	else
2657 		return (kn->kn_data >= so->so_snd.sb_lowat);
2658 }
2659 
2660 /*ARGSUSED*/
2661 static int
2662 filt_solisten(struct knote *kn, long hint)
2663 {
2664 	struct socket *so = kn->kn_fp->f_data;
2665 
2666 	kn->kn_data = so->so_qlen;
2667 	return (! TAILQ_EMPTY(&so->so_comp));
2668 }
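
/*
 * Illustrative only: the userland view of filt_solisten() (a hypothetical
 * standalone program, under #if 0, not kernel code).  Registering
 * EVFILT_READ on a listening socket reports readiness once the completed
 * connection queue is non-empty; the returned kevent data field carries
 * so_qlen, the number of connections ready to be accepted.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
example_wait_for_connection(int kq, int listenfd)
{
	struct kevent change, ev;

	EV_SET(&change, listenfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &change, 1, &ev, 1, NULL) == -1)
		return (-1);
	return ((int)ev.data);		/* so_qlen at wakeup time */
}
#endif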
2669 
2670 int
2671 socheckuid(struct socket *so, uid_t uid)
2672 {
2673 
2674 	if (so == NULL)
2675 		return (EPERM);
2676 	if (so->so_cred->cr_uid != uid)
2677 		return (EPERM);
2678 	return (0);
2679 }
2680 
2681 static int
2682 somaxconn_sysctl(SYSCTL_HANDLER_ARGS)
2683 {
2684 	int error;
2685 	int val;
2686 
2687 	val = somaxconn;
2688 	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
2689 	if (error || !req->newptr)
2690 		return (error);
2691 
2692 	if (val < 1 || val > USHRT_MAX)
2693 		return (EINVAL);
2694 
2695 	somaxconn = val;
2696 	return (0);
2697 }
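
/*
 * Illustrative only: a hedged sketch (under #if 0, userland code) of poking
 * the knob this handler backs, which is exposed as kern.ipc.somaxconn.
 * Values outside [1, USHRT_MAX] are rejected above with EINVAL.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int cur, new = 1024;
	size_t len = sizeof(cur);

	/* Read the old value and install the new one in a single call. */
	if (sysctlbyname("kern.ipc.somaxconn", &cur, &len, &new,
	    sizeof(new)) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("kern.ipc.somaxconn: %d -> %d\n", cur, new);
	return (0);
}
#endif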
2698