xref: /freebsd/sys/kern/uipc_socket.c (revision 91c878a6935c5c2e99866eb267e5bc3028bf6d2f)
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 2004 The FreeBSD Foundation
5  * Copyright (c) 2004-2006 Robert N. M. Watson
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 4. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
32  */
33 
34 /*
35  * Comments on the socket life cycle:
36  *
37  * soalloc() sets up socket layer state for a socket, called only by
38  * socreate() and sonewconn().  Socket layer private.
39  *
40  * sodealloc() tears down socket layer state for a socket; called by sofree()
41  * and, on attach failure, by socreate() and sonewconn().  Socket layer private.
42  *
43  * pru_attach() associates protocol layer state with an allocated socket;
44  * called only once, may fail, aborting socket allocation.  This is called
45  * from socreate() and sonewconn().  Socket layer private.
46  *
47  * pru_detach() disassociates protocol layer state from an attached socket,
48  * and will be called exactly once for sockets for which pru_attach() has
49  * been successfully called.  If pru_attach() returned an error,
50  * pru_detach() will not be called.  Socket layer private.
51  *
52  * pru_abort() and pru_close() notify the protocol layer that the last
53  * consumer of a socket is starting to tear down the socket, and that the
54  * protocol should terminate the connection.  Historically, pru_abort() also
55  * detached protocol state from the socket state, but this is no longer the
56  * case.
57  *
58  * socreate() creates a socket and attaches protocol state.  This is a public
59  * interface that may be used by socket layer consumers to create new
60  * sockets.
61  *
62  * sonewconn() creates a socket and attaches protocol state.  This is a
63  * public interface that may be used by protocols to create new sockets when
64  * a new connection is received and will be available for accept() on a
65  * listen socket.
66  *
67  * soclose() destroys a socket after possibly waiting for it to disconnect.
68  * This is a public interface that socket consumers should use to close and
69  * release a socket when done with it.
70  *
71  * soabort() destroys a socket without waiting for it to disconnect (used
72  * only for incoming connections that are already partially or fully
73  * connected).  This is used internally by the socket layer when clearing
74  * listen socket queues (due to overflow or close on the listen socket), but
75  * is also a public interface protocols may use to abort connections in
76  * their incomplete listen queues should they no longer be required.  Sockets
77  * placed in completed connection listen queues should not be aborted for
78  * reasons described in the comment above the soclose() implementation.  This
79  * is not a general purpose close routine, and except in the specific
80  * circumstances described here, should not be used.
81  *
82  * sofree() will free a socket and its protocol state if all references on
83  * the socket have been released, and is the interface called to attempt to
84  * free a socket when a reference is removed.  This is a socket layer private
85  * interface.
86  *
87  * NOTE: In addition to socreate() and soclose(), which provide a single
88  * socket reference to the consumer to be managed as required, there are two
89  * calls to explicitly manage socket references: soref() and sorele().
90  * Currently, these are generally required only when transitioning a socket
91  * from a listen queue to a file descriptor, in order to prevent garbage
92  * collection of the socket at an untimely moment.  For a number of reasons,
93  * these interfaces are not preferred, and should be avoided.
94  */
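
/*
 * Illustrative sketch of the public life cycle interfaces described
 * above, as a hypothetical kernel consumer might use them ('td' is
 * assumed to be the calling thread; this is not code from this file):
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(PF_INET, &so, SOCK_STREAM, 0, td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	... exchange data with sosend() and soreceive() ...
 *	error = soclose(so);
 */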
95 
96 #include <sys/cdefs.h>
97 __FBSDID("$FreeBSD$");
98 
99 #include "opt_inet.h"
100 #include "opt_mac.h"
101 #include "opt_zero.h"
102 #include "opt_compat.h"
103 
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/fcntl.h>
107 #include <sys/limits.h>
108 #include <sys/lock.h>
109 #include <sys/mac.h>
110 #include <sys/malloc.h>
111 #include <sys/mbuf.h>
112 #include <sys/mutex.h>
113 #include <sys/domain.h>
114 #include <sys/file.h>			/* for struct knote */
115 #include <sys/kernel.h>
116 #include <sys/event.h>
117 #include <sys/eventhandler.h>
118 #include <sys/poll.h>
119 #include <sys/proc.h>
120 #include <sys/protosw.h>
121 #include <sys/socket.h>
122 #include <sys/socketvar.h>
123 #include <sys/resourcevar.h>
124 #include <sys/signalvar.h>
125 #include <sys/sysctl.h>
126 #include <sys/uio.h>
127 #include <sys/jail.h>
128 
129 #include <security/mac/mac_framework.h>
130 
131 #include <vm/uma.h>
132 
133 #ifdef COMPAT_IA32
134 #include <sys/mount.h>
135 #include <compat/freebsd32/freebsd32.h>
136 
137 extern struct sysentvec ia32_freebsd_sysvec;
138 #endif
139 
140 static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
141 		    int flags);
142 
143 static void	filt_sordetach(struct knote *kn);
144 static int	filt_soread(struct knote *kn, long hint);
145 static void	filt_sowdetach(struct knote *kn);
146 static int	filt_sowrite(struct knote *kn, long hint);
147 static int	filt_solisten(struct knote *kn, long hint);
148 
149 static struct filterops solisten_filtops =
150 	{ 1, NULL, filt_sordetach, filt_solisten };
151 static struct filterops soread_filtops =
152 	{ 1, NULL, filt_sordetach, filt_soread };
153 static struct filterops sowrite_filtops =
154 	{ 1, NULL, filt_sowdetach, filt_sowrite };
155 
156 uma_zone_t socket_zone;
157 so_gen_t	so_gencnt;	/* generation count for sockets */
158 
159 int	maxsockets;
160 
161 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
162 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
163 
164 static int somaxconn = SOMAXCONN;
165 static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS);
166 /* XXX: we don't have SYSCTL_USHORT */
167 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
168     0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection "
169     "queue size");
170 static int numopensockets;
171 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
172     &numopensockets, 0, "Number of open sockets");
173 #ifdef ZERO_COPY_SOCKETS
174 /* These aren't static because they're used in other files. */
175 int so_zero_copy_send = 1;
176 int so_zero_copy_receive = 1;
177 SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
178     "Zero copy controls");
179 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
180     &so_zero_copy_receive, 0, "Enable zero copy receive");
181 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
182     &so_zero_copy_send, 0, "Enable zero copy send");
183 #endif /* ZERO_COPY_SOCKETS */
184 
185 /*
186  * accept_mtx locks down per-socket fields relating to accept queues.  See
187  * socketvar.h for an annotation of the protected fields of struct socket.
188  */
189 struct mtx accept_mtx;
190 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
191 
192 /*
193  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
194  * so_gencnt field.
195  */
196 static struct mtx so_global_mtx;
197 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_global", MTX_DEF);
198 
199 /*
200  * General IPC sysctl name space, used by sockets and a variety of other IPC
201  * types.
202  */
203 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
204 
205 /*
206  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
207  * of the change so that they can update their dependent limits as required.
208  */
209 static int
210 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
211 {
212 	int error, newmaxsockets;
213 
214 	newmaxsockets = maxsockets;
215 	error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req);
216 	if (error == 0 && req->newptr) {
217 		if (newmaxsockets > maxsockets) {
218 			maxsockets = newmaxsockets;
219 			if (maxsockets > ((maxfiles / 4) * 3)) {
220 				maxfiles = (maxsockets * 5) / 4;
221 				maxfilesperproc = (maxfiles * 9) / 10;
222 			}
223 			EVENTHANDLER_INVOKE(maxsockets_change);
224 		} else
225 			error = EINVAL;
226 	}
227 	return (error);
228 }
229 
230 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
231     &maxsockets, 0, sysctl_maxsockets, "IU",
232     "Maximum number of sockets avaliable");
233 
234 /*
235  * Initialise maxsockets.
236  */
237 static void
init_maxsockets(void *ignored)
238 {
239 	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
240 	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
241 }
242 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
243 
244 /*
245  * Socket operation routines.  These routines are called by the routines in
246  * sys_socket.c or from a system process, and implement the semantics of
247  * socket operations by switching out to the protocol specific routines.
248  */
249 
250 /*
251  * Get a socket structure from our zone, and initialize it.  Note that it
252  * would probably be better to allocate socket and PCB at the same time, but
253  * I'm not convinced that all the protocols can be easily modified to do
254  * this.
255  *
256  * soalloc() returns a socket with a ref count of 0.
257  */
258 static struct socket *
259 soalloc(int mflags)
260 {
261 	struct socket *so;
262 
263 	so = uma_zalloc(socket_zone, mflags | M_ZERO);
264 	if (so == NULL)
265 		return (NULL);
266 #ifdef MAC
267 	if (mac_init_socket(so, mflags) != 0) {
268 		uma_zfree(socket_zone, so);
269 		return (NULL);
270 	}
271 #endif
272 	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
273 	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
274 	TAILQ_INIT(&so->so_aiojobq);
275 	mtx_lock(&so_global_mtx);
276 	so->so_gencnt = ++so_gencnt;
277 	++numopensockets;
278 	mtx_unlock(&so_global_mtx);
279 	return (so);
280 }
281 
282 /*
283  * Free the storage associated with a socket at the socket layer, tear down
284  * locks, labels, etc.  All protocol state is assumed already to have been
285  * torn down (and possibly never set up) by the caller.
286  */
287 static void
288 sodealloc(struct socket *so)
289 {
290 
291 	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
292 	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
293 
294 	mtx_lock(&so_global_mtx);
295 	so->so_gencnt = ++so_gencnt;
296 	--numopensockets;	/* Could be below, but faster here. */
297 	mtx_unlock(&so_global_mtx);
298 	if (so->so_rcv.sb_hiwat)
299 		(void)chgsbsize(so->so_cred->cr_uidinfo,
300 		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
301 	if (so->so_snd.sb_hiwat)
302 		(void)chgsbsize(so->so_cred->cr_uidinfo,
303 		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
304 #ifdef INET
305 	/* Remove accept filter if one is present. */
306 	if (so->so_accf != NULL)
307 		do_setopt_accept_filter(so, NULL);
308 #endif
309 #ifdef MAC
310 	mac_destroy_socket(so);
311 #endif
312 	crfree(so->so_cred);
313 	SOCKBUF_LOCK_DESTROY(&so->so_snd);
314 	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
315 	uma_zfree(socket_zone, so);
316 }
317 
318 /*
319  * socreate returns a socket with a ref count of 1.  The socket should be
320  * closed with soclose().
321  */
322 int
323 socreate(dom, aso, type, proto, cred, td)
324 	int dom;
325 	struct socket **aso;
326 	int type;
327 	int proto;
328 	struct ucred *cred;
329 	struct thread *td;
330 {
331 	struct protosw *prp;
332 	struct socket *so;
333 	int error;
334 
335 	if (proto)
336 		prp = pffindproto(dom, proto, type);
337 	else
338 		prp = pffindtype(dom, type);
339 
340 	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
341 	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
342 		return (EPROTONOSUPPORT);
343 
344 	if (jailed(cred) && jail_socket_unixiproute_only &&
345 	    prp->pr_domain->dom_family != PF_LOCAL &&
346 	    prp->pr_domain->dom_family != PF_INET &&
347 	    prp->pr_domain->dom_family != PF_ROUTE) {
348 		return (EPROTONOSUPPORT);
349 	}
350 
351 	if (prp->pr_type != type)
352 		return (EPROTOTYPE);
353 	so = soalloc(M_WAITOK);
354 	if (so == NULL)
355 		return (ENOBUFS);
356 
357 	TAILQ_INIT(&so->so_incomp);
358 	TAILQ_INIT(&so->so_comp);
359 	so->so_type = type;
360 	so->so_cred = crhold(cred);
361 	so->so_proto = prp;
362 #ifdef MAC
363 	mac_create_socket(cred, so);
364 #endif
365 	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
366 	    NULL, NULL, NULL);
367 	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
368 	    NULL, NULL, NULL);
369 	so->so_count = 1;
370 	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
371 	if (error) {
372 		KASSERT(so->so_count == 1, ("socreate: so_count %d",
373 		    so->so_count));
374 		so->so_count = 0;
375 		sodealloc(so);
376 		return (error);
377 	}
378 	*aso = so;
379 	return (0);
380 }
381 
382 #ifdef REGRESSION
383 static int regression_sonewconn_earlytest = 1;
384 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
385     &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
386 #endif
387 
388 /*
389  * When an attempt at a new connection is noted on a socket which accepts
390  * connections, sonewconn is called.  If the connection is possible (subject
391  * to space constraints, etc.), then we allocate a new structure, properly
392  * linked into the data structure of the original socket, and return this.
393  * The connstatus argument may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
394  *
395  * Note: the ref count on the socket is 0 on return.
396  */
397 struct socket *
398 sonewconn(head, connstatus)
399 	register struct socket *head;
400 	int connstatus;
401 {
402 	register struct socket *so;
403 	int over;
404 
405 	ACCEPT_LOCK();
406 	over = (head->so_qlen > 3 * head->so_qlimit / 2);
407 	ACCEPT_UNLOCK();
408 #ifdef REGRESSION
409 	if (regression_sonewconn_earlytest && over)
410 #else
411 	if (over)
412 #endif
413 		return (NULL);
414 	so = soalloc(M_NOWAIT);
415 	if (so == NULL)
416 		return (NULL);
417 	if ((head->so_options & SO_ACCEPTFILTER) != 0)
418 		connstatus = 0;
419 	so->so_head = head;
420 	so->so_type = head->so_type;
421 	so->so_options = head->so_options &~ SO_ACCEPTCONN;
422 	so->so_linger = head->so_linger;
423 	so->so_state = head->so_state | SS_NOFDREF;
424 	so->so_proto = head->so_proto;
425 	so->so_cred = crhold(head->so_cred);
426 #ifdef MAC
427 	SOCK_LOCK(head);
428 	mac_create_socket_from_socket(head, so);
429 	SOCK_UNLOCK(head);
430 #endif
431 	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
432 	    NULL, NULL, NULL);
433 	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
434 	    NULL, NULL, NULL);
435 	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
436 	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
437 		sodealloc(so);
438 		return (NULL);
439 	}
440 	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
441 	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
442 	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
443 	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
444 	so->so_state |= connstatus;
445 	ACCEPT_LOCK();
446 	if (connstatus) {
447 		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
448 		so->so_qstate |= SQ_COMP;
449 		head->so_qlen++;
450 	} else {
451 		/*
452 		 * Keep removing sockets from the head until there's room for
453 		 * us to insert on the tail.  In pre-locking revisions, this
454 		 * was a simple if(), but as we could be racing with other
455 		 * threads and soabort() requires dropping locks, we must
456 		 * loop waiting for the condition to be true.
457 		 */
458 		while (head->so_incqlen > head->so_qlimit) {
459 			struct socket *sp;
460 			sp = TAILQ_FIRST(&head->so_incomp);
461 			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
462 			head->so_incqlen--;
463 			sp->so_qstate &= ~SQ_INCOMP;
464 			sp->so_head = NULL;
465 			ACCEPT_UNLOCK();
466 			soabort(sp);
467 			ACCEPT_LOCK();
468 		}
469 		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
470 		so->so_qstate |= SQ_INCOMP;
471 		head->so_incqlen++;
472 	}
473 	ACCEPT_UNLOCK();
474 	if (connstatus) {
475 		sorwakeup(head);
476 		wakeup_one(&head->so_timeo);
477 	}
478 	return (so);
479 }
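
/*
 * Illustrative sketch (hypothetical protocol code): a protocol that has
 * just completed a handshake on listen socket 'head' might deliver the
 * new connection as follows, dropping it if the queues are full:
 *
 *	struct socket *so;
 *
 *	so = sonewconn(head, SS_ISCONNECTED);
 *	if (so == NULL)
 *		... discard the connection ...
 */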
480 
481 int
482 sobind(so, nam, td)
483 	struct socket *so;
484 	struct sockaddr *nam;
485 	struct thread *td;
486 {
487 
488 	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
489 }
490 
491 /*
492  * solisten() transitions a socket from a non-listening state to a listening
493  * state, but can also be used to update the listen queue depth on an
494  * existing listen socket.  The protocol will call back into the sockets
495  * layer using solisten_proto_check() and solisten_proto() to check and set
496  * socket-layer listen state.  Call backs are used so that the protocol can
497  * acquire both protocol and socket layer locks in whatever order is required
498  * by the protocol.
499  *
500  * Protocol implementors are advised to hold the socket lock across the
501  * socket-layer test and set to avoid races at the socket layer.
502  */
503 int
504 solisten(so, backlog, td)
505 	struct socket *so;
506 	int backlog;
507 	struct thread *td;
508 {
509 
510 	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
511 }
512 
513 int
514 solisten_proto_check(so)
515 	struct socket *so;
516 {
517 
518 	SOCK_LOCK_ASSERT(so);
519 
520 	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
521 	    SS_ISDISCONNECTING))
522 		return (EINVAL);
523 	return (0);
524 }
525 
526 void
527 solisten_proto(so, backlog)
528 	struct socket *so;
529 	int backlog;
530 {
531 
532 	SOCK_LOCK_ASSERT(so);
533 
534 	if (backlog < 0 || backlog > somaxconn)
535 		backlog = somaxconn;
536 	so->so_qlimit = backlog;
537 	so->so_options |= SO_ACCEPTCONN;
538 }
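
/*
 * Illustrative sketch (hypothetical pru_listen implementation) of the
 * callback pattern described above solisten(), holding the socket lock
 * across the socket-layer test and set:
 *
 *	SOCK_LOCK(so);
 *	error = solisten_proto_check(so);
 *	if (error == 0) {
 *		... protocol-layer listen setup ...
 *		solisten_proto(so, backlog);
 *	}
 *	SOCK_UNLOCK(so);
 *	return (error);
 */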
539 
540 /*
541  * Attempt to free a socket.  This should really be sotryfree().
542  *
543  * sofree() will succeed if:
544  *
545  * - There are no outstanding file descriptor references or related consumers
546  *   (so_count == 0).
547  *
548  * - The socket has been closed by user space, if ever open (SS_NOFDREF).
549  *
550  * - The protocol does not have an outstanding strong reference on the socket
551  *   (SS_PROTOREF).
552  *
553  * - The socket is not in a completed connection queue, where a process may
554  *   have been notified that it is present.  If it were removed, the user
555  *   process could block in accept() despite select() saying it was ready.
556  *
557  * Otherwise, it will quietly abort so that a future call to sofree(), when
558  * conditions are right, can succeed.
559  */
560 void
561 sofree(so)
562 	struct socket *so;
563 {
564 	struct protosw *pr = so->so_proto;
565 	struct socket *head;
566 
567 	ACCEPT_LOCK_ASSERT();
568 	SOCK_LOCK_ASSERT(so);
569 
570 	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
571 	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
572 		SOCK_UNLOCK(so);
573 		ACCEPT_UNLOCK();
574 		return;
575 	}
576 
577 	head = so->so_head;
578 	if (head != NULL) {
579 		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
580 		    (so->so_qstate & SQ_INCOMP) != 0,
581 		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
582 		    "SQ_INCOMP"));
583 		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
584 		    (so->so_qstate & SQ_INCOMP) == 0,
585 		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
586 		TAILQ_REMOVE(&head->so_incomp, so, so_list);
587 		head->so_incqlen--;
588 		so->so_qstate &= ~SQ_INCOMP;
589 		so->so_head = NULL;
590 	}
591 	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
592 	    (so->so_qstate & SQ_INCOMP) == 0,
593 	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
594 	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
595 	SOCK_UNLOCK(so);
596 	ACCEPT_UNLOCK();
597 
598 	/*
599 	 * From this point on, we assume that no other references to this
600 	 * socket exist anywhere else in the stack.  Therefore, no locks need
601 	 * to be acquired or held.
602 	 *
603 	 * We used to do a lot of socket buffer and socket locking here, as
604 	 * well as invoke sorflush() and perform wakeups.  The direct calls to
605 	 * dom_dispose() and sbdestroy() are an inlining of what was
606 	 * necessary from sorflush().
607 	 *
608 	 * Notice that the socket buffer and kqueue state are torn down
609 	 * before calling pru_detach.  This means that protocols should not
610 	 * assume they can perform socket wakeups, etc., in their detach
611 	 * code.
612 	 */
613 	KASSERT((so->so_snd.sb_flags & SB_LOCK) == 0, ("sofree: snd sblock"));
614 	KASSERT((so->so_rcv.sb_flags & SB_LOCK) == 0, ("sofree: rcv sblock"));
615 	sbdestroy(&so->so_snd, so);
616 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
617 		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
618 	sbdestroy(&so->so_rcv, so);
619 	if (pr->pr_usrreqs->pru_detach != NULL)
620 		(*pr->pr_usrreqs->pru_detach)(so);
621 	knlist_destroy(&so->so_rcv.sb_sel.si_note);
622 	knlist_destroy(&so->so_snd.sb_sel.si_note);
623 	sodealloc(so);
624 }
625 
626 /*
627  * Close a socket on last file table reference removal.  Initiate disconnect
628  * if connected.  Free socket when disconnect complete.
629  *
630  * This function will sorele() the socket.  Note that soclose() may be called
631  * prior to the ref count reaching zero.  The actual socket structure will
632  * not be freed until the ref count reaches zero.
633  */
634 int
635 soclose(so)
636 	struct socket *so;
637 {
638 	int error = 0;
639 
640 	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
641 
642 	funsetown(&so->so_sigio);
643 	if (so->so_options & SO_ACCEPTCONN) {
644 		struct socket *sp;
645 		ACCEPT_LOCK();
646 		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
647 			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
648 			so->so_incqlen--;
649 			sp->so_qstate &= ~SQ_INCOMP;
650 			sp->so_head = NULL;
651 			ACCEPT_UNLOCK();
652 			soabort(sp);
653 			ACCEPT_LOCK();
654 		}
655 		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
656 			TAILQ_REMOVE(&so->so_comp, sp, so_list);
657 			so->so_qlen--;
658 			sp->so_qstate &= ~SQ_COMP;
659 			sp->so_head = NULL;
660 			ACCEPT_UNLOCK();
661 			soabort(sp);
662 			ACCEPT_LOCK();
663 		}
664 		ACCEPT_UNLOCK();
665 	}
666 	if (so->so_state & SS_ISCONNECTED) {
667 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
668 			error = sodisconnect(so);
669 			if (error)
670 				goto drop;
671 		}
672 		if (so->so_options & SO_LINGER) {
673 			if ((so->so_state & SS_ISDISCONNECTING) &&
674 			    (so->so_state & SS_NBIO))
675 				goto drop;
676 			while (so->so_state & SS_ISCONNECTED) {
677 				error = tsleep(&so->so_timeo,
678 				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
679 				if (error)
680 					break;
681 			}
682 		}
683 	}
684 
685 drop:
686 	if (so->so_proto->pr_usrreqs->pru_close != NULL)
687 		(*so->so_proto->pr_usrreqs->pru_close)(so);
688 	ACCEPT_LOCK();
689 	SOCK_LOCK(so);
690 	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
691 	so->so_state |= SS_NOFDREF;
692 	sorele(so);
693 	return (error);
694 }
695 
696 /*
697  * soabort() is used to abruptly tear down a connection, such as when a
698  * resource limit is reached (listen queue depth exceeded), or if a listen
699  * socket is closed while there are sockets waiting to be accepted.
700  *
701  * This interface is tricky, because it is called on an unreferenced socket,
702  * and must be called only by a thread that has actually removed the socket
703  * from the listen queue it was on, or races with other threads are risked.
704  *
705  * This interface will call into the protocol code, so must not be called
706  * with any socket locks held.  Protocols do call it while holding their own
707  * recursible protocol mutexes, but this is something that should be subject
708  * to review in the future.
709  */
710 void
711 soabort(so)
712 	struct socket *so;
713 {
714 
715 	/*
716 	 * In as much as is possible, assert that no references to this
717 	 * socket are held.  This is not quite the same as asserting that the
718 	 * current thread is responsible for arranging for no references, but
719 	 * is as close as we can get for now.
720 	 */
721 	KASSERT(so->so_count == 0, ("soabort: so_count"));
722 	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
723 	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
724 	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
725 	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
726 
727 	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
728 		(*so->so_proto->pr_usrreqs->pru_abort)(so);
729 	ACCEPT_LOCK();
730 	SOCK_LOCK(so);
731 	sofree(so);
732 }
733 
734 int
735 soaccept(so, nam)
736 	struct socket *so;
737 	struct sockaddr **nam;
738 {
739 	int error;
740 
741 	SOCK_LOCK(so);
742 	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
743 	so->so_state &= ~SS_NOFDREF;
744 	SOCK_UNLOCK(so);
745 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
746 	return (error);
747 }
748 
749 int
750 soconnect(so, nam, td)
751 	struct socket *so;
752 	struct sockaddr *nam;
753 	struct thread *td;
754 {
755 	int error;
756 
757 	if (so->so_options & SO_ACCEPTCONN)
758 		return (EOPNOTSUPP);
759 	/*
760 	 * If protocol is connection-based, can only connect once.
761 	 * Otherwise, if connected, try to disconnect first.  This allows
762 	 * user to disconnect by connecting to, e.g., a null address.
763 	 */
764 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
765 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
766 	    (error = sodisconnect(so)))) {
767 		error = EISCONN;
768 	} else {
769 		/*
770 		 * Prevent accumulated error from previous connection from
771 		 * biting us.
772 		 */
773 		so->so_error = 0;
774 		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
775 	}
776 
777 	return (error);
778 }
779 
780 int
781 soconnect2(so1, so2)
782 	struct socket *so1;
783 	struct socket *so2;
784 {
785 
786 	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
787 }
788 
789 int
790 sodisconnect(so)
791 	struct socket *so;
792 {
793 	int error;
794 
795 	if ((so->so_state & SS_ISCONNECTED) == 0)
796 		return (ENOTCONN);
797 	if (so->so_state & SS_ISDISCONNECTING)
798 		return (EALREADY);
799 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
800 	return (error);
801 }
802 
803 #ifdef ZERO_COPY_SOCKETS
804 struct so_zerocopy_stats {
805 	int size_ok;
806 	int align_ok;
807 	int found_ifp;
808 };
809 struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
810 #include <netinet/in.h>
811 #include <net/route.h>
812 #include <netinet/in_pcb.h>
813 #include <vm/vm.h>
814 #include <vm/vm_page.h>
815 #include <vm/vm_object.h>
816 #endif /*ZERO_COPY_SOCKETS*/
817 
818 /*
819  * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
820  * all of the data referenced by the uio.  If desired, it uses zero-copy.
821  * *space will be updated to reflect data copied in.
822  *
823  * NB: If atomic I/O is requested, the caller must already have checked that
824  * space can hold resid bytes.
825  *
826  * NB: In the event of an error, the caller may need to free the partial
827  * chain pointed to by *retmp.  The contents of both *uio and *space may be
828  * modified even in the case of an error.
829  */
830 static int
831 sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
832     int flags)
833 {
834 	struct mbuf *m, **mp, *top;
835 	long len, resid;
836 	int error;
837 #ifdef ZERO_COPY_SOCKETS
838 	int cow_send;
839 #endif
840 
841 	*retmp = top = NULL;
842 	mp = &top;
843 	len = 0;
844 	resid = uio->uio_resid;
845 	error = 0;
846 	do {
847 #ifdef ZERO_COPY_SOCKETS
848 		cow_send = 0;
849 #endif /* ZERO_COPY_SOCKETS */
850 		if (resid >= MINCLSIZE) {
851 #ifdef ZERO_COPY_SOCKETS
852 			if (top == NULL) {
853 				MGETHDR(m, M_TRYWAIT, MT_DATA);
854 				if (m == NULL) {
855 					error = ENOBUFS;
856 					goto out;
857 				}
858 				m->m_pkthdr.len = 0;
859 				m->m_pkthdr.rcvif = NULL;
860 			} else {
861 				MGET(m, M_TRYWAIT, MT_DATA);
862 				if (m == NULL) {
863 					error = ENOBUFS;
864 					goto out;
865 				}
866 			}
867 			if (so_zero_copy_send &&
868 			    resid >= PAGE_SIZE &&
869 			    *space >= PAGE_SIZE &&
870 			    uio->uio_iov->iov_len >= PAGE_SIZE) {
871 				so_zerocp_stats.size_ok++;
872 				so_zerocp_stats.align_ok++;
873 				cow_send = socow_setup(m, uio);
874 				len = cow_send;
875 			}
876 			if (!cow_send) {
877 				MCLGET(m, M_TRYWAIT);
878 				if ((m->m_flags & M_EXT) == 0) {
879 					m_free(m);
880 					m = NULL;
881 				} else {
882 					len = min(min(MCLBYTES, resid),
883 					    *space);
884 				}
885 			}
886 #else /* ZERO_COPY_SOCKETS */
887 			if (top == NULL) {
888 				m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
889 				m->m_pkthdr.len = 0;
890 				m->m_pkthdr.rcvif = NULL;
891 			} else
892 				m = m_getcl(M_TRYWAIT, MT_DATA, 0);
893 			len = min(min(MCLBYTES, resid), *space);
894 #endif /* ZERO_COPY_SOCKETS */
895 		} else {
896 			if (top == NULL) {
897 				m = m_gethdr(M_TRYWAIT, MT_DATA);
898 				m->m_pkthdr.len = 0;
899 				m->m_pkthdr.rcvif = NULL;
900 
901 				len = min(min(MHLEN, resid), *space);
902 				/*
903 				 * For datagram protocols, leave room
904 				 * for protocol headers in first mbuf.
905 				 */
906 				if (atomic && m && len < MHLEN)
907 					MH_ALIGN(m, len);
908 			} else {
909 				m = m_get(M_TRYWAIT, MT_DATA);
910 				len = min(min(MLEN, resid), *space);
911 			}
912 		}
913 		if (m == NULL) {
914 			error = ENOBUFS;
915 			goto out;
916 		}
917 
918 		*space -= len;
919 #ifdef ZERO_COPY_SOCKETS
920 		if (cow_send)
921 			error = 0;
922 		else
923 #endif /* ZERO_COPY_SOCKETS */
924 		error = uiomove(mtod(m, void *), (int)len, uio);
925 		resid = uio->uio_resid;
926 		m->m_len = len;
927 		*mp = m;
928 		top->m_pkthdr.len += len;
929 		if (error)
930 			goto out;
931 		mp = &m->m_next;
932 		if (resid <= 0) {
933 			if (flags & MSG_EOR)
934 				top->m_flags |= M_EOR;
935 			break;
936 		}
937 	} while (*space > 0 && atomic);
938 out:
939 	*retmp = top;
940 	return (error);
941 }
942 
943 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
944 
945 int
946 sosend_dgram(so, addr, uio, top, control, flags, td)
947 	struct socket *so;
948 	struct sockaddr *addr;
949 	struct uio *uio;
950 	struct mbuf *top;
951 	struct mbuf *control;
952 	int flags;
953 	struct thread *td;
954 {
955 	long space, resid;
956 	int clen = 0, error, dontroute;
957 	int atomic = sosendallatonce(so) || top;
958 
959 	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
960 	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
961 	    ("sosend_dgram: !PR_ATOMIC"));
962 
963 	if (uio != NULL)
964 		resid = uio->uio_resid;
965 	else
966 		resid = top->m_pkthdr.len;
967 	/*
968 	 * In theory resid should be unsigned.  However, space must be
969 	 * signed, as it might be less than 0 if we over-committed, and we
970 	 * must use a signed comparison of space and resid.  On the other
971 	 * hand, a negative resid causes us to loop sending 0-length
972 	 * segments to the protocol.
976 	 */
977 	if (resid < 0) {
978 		error = EINVAL;
979 		goto out;
980 	}
981 
982 	dontroute =
983 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
984 	if (td != NULL)
985 		td->td_proc->p_stats->p_ru.ru_msgsnd++;
986 	if (control != NULL)
987 		clen = control->m_len;
988 
989 	SOCKBUF_LOCK(&so->so_snd);
990 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
991 		SOCKBUF_UNLOCK(&so->so_snd);
992 		error = EPIPE;
993 		goto out;
994 	}
995 	if (so->so_error) {
996 		error = so->so_error;
997 		so->so_error = 0;
998 		SOCKBUF_UNLOCK(&so->so_snd);
999 		goto out;
1000 	}
1001 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1002 		/*
1003 		 * `sendto' and `sendmsg' are allowed on a connection-based
1004 		 * socket if it supports implied connect.  Return ENOTCONN if
1005 		 * not connected and no address is supplied.
1006 		 */
1007 		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1008 		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1009 			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1010 			    !(resid == 0 && clen != 0)) {
1011 				SOCKBUF_UNLOCK(&so->so_snd);
1012 				error = ENOTCONN;
1013 				goto out;
1014 			}
1015 		} else if (addr == NULL) {
1016 			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1017 				error = ENOTCONN;
1018 			else
1019 				error = EDESTADDRREQ;
1020 			SOCKBUF_UNLOCK(&so->so_snd);
1021 			goto out;
1022 		}
1023 	}
1024 
1025 	/*
1026 	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1027 	 * problem and need fixing.
1028 	 */
1029 	space = sbspace(&so->so_snd);
1030 	if (flags & MSG_OOB)
1031 		space += 1024;
1032 	space -= clen;
1033 	SOCKBUF_UNLOCK(&so->so_snd);
1034 	if (resid > space) {
1035 		error = EMSGSIZE;
1036 		goto out;
1037 	}
1038 	if (uio == NULL) {
1039 		resid = 0;
1040 		if (flags & MSG_EOR)
1041 			top->m_flags |= M_EOR;
1042 	} else {
1043 		error = sosend_copyin(uio, &top, atomic, &space, flags);
1044 		if (error)
1045 			goto out;
1046 		resid = uio->uio_resid;
1047 	}
1048 	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1049 	/*
1050 	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1051 	 * than with.
1052 	 */
1053 	if (dontroute) {
1054 		SOCK_LOCK(so);
1055 		so->so_options |= SO_DONTROUTE;
1056 		SOCK_UNLOCK(so);
1057 	}
1058 	/*
1059 	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1060 	 * of date.  We could have received a reset packet in an interrupt or
1061 	 * maybe we slept while doing page faults in uiomove() etc.  We could
1062 	 * probably recheck again inside the locking protection here, but
1063 	 * there are probably other places that this also happens.  We must
1064 	 * rethink this.
1065 	 */
1066 	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1067 	    (flags & MSG_OOB) ? PRUS_OOB :
1068 	/*
1069 	 * If the user set MSG_EOF, the protocol understands this flag, and
1070 	 * there is nothing left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1071 	 */
1072 	    ((flags & MSG_EOF) &&
1073 	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1074 	     (resid <= 0)) ?
1075 		PRUS_EOF :
1076 		/* If there is more to send set PRUS_MORETOCOME */
1077 		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1078 		top, addr, control, td);
1079 	if (dontroute) {
1080 		SOCK_LOCK(so);
1081 		so->so_options &= ~SO_DONTROUTE;
1082 		SOCK_UNLOCK(so);
1083 	}
1084 	clen = 0;
1085 	control = NULL;
1086 	top = NULL;
1087 out:
1088 	if (top != NULL)
1089 		m_freem(top);
1090 	if (control != NULL)
1091 		m_freem(control);
1092 	return (error);
1093 }
1094 
1095 /*
1096  * Send on a socket.  If send must go all at once and message is larger than
1097  * send buffering, then hard error.  Lock against other senders.  If must go
1098  * all at once and not enough room now, then inform user that this would
1099  * block and do nothing.  Otherwise, if nonblocking, send as much as
1100  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1101  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1102  * in mbuf chain must be small enough to send all at once.
1103  *
1104  * Returns nonzero on error, timeout or signal; callers must check for short
1105  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1106  * on return.
1107  */
1108 #define	snderr(errno)	{ error = (errno); goto release; }
1109 int
1110 sosend_generic(so, addr, uio, top, control, flags, td)
1111 	struct socket *so;
1112 	struct sockaddr *addr;
1113 	struct uio *uio;
1114 	struct mbuf *top;
1115 	struct mbuf *control;
1116 	int flags;
1117 	struct thread *td;
1118 {
1119 	long space, resid;
1120 	int clen = 0, error, dontroute;
1121 	int atomic = sosendallatonce(so) || top;
1122 
1123 	if (uio != NULL)
1124 		resid = uio->uio_resid;
1125 	else
1126 		resid = top->m_pkthdr.len;
1127 	/*
1128 	 * In theory resid should be unsigned.  However, space must be
1129 	 * signed, as it might be less than 0 if we over-committed, and we
1130 	 * must use a signed comparison of space and resid.  On the other
1131 	 * hand, a negative resid causes us to loop sending 0-length
1132 	 * segments to the protocol.
1133 	 *
1134 	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1135 	 * type sockets since that's an error.
1136 	 */
1137 	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1138 		error = EINVAL;
1139 		goto out;
1140 	}
1141 
1142 	dontroute =
1143 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1144 	    (so->so_proto->pr_flags & PR_ATOMIC);
1145 	if (td != NULL)
1146 		td->td_proc->p_stats->p_ru.ru_msgsnd++;
1147 	if (control != NULL)
1148 		clen = control->m_len;
1149 
1150 	SOCKBUF_LOCK(&so->so_snd);
1151 restart:
1152 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1153 	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1154 	if (error)
1155 		goto out_locked;
1156 	do {
1157 		SOCKBUF_LOCK_ASSERT(&so->so_snd);
1158 		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
1159 			snderr(EPIPE);
1160 		if (so->so_error) {
1161 			error = so->so_error;
1162 			so->so_error = 0;
1163 			goto release;
1164 		}
1165 		if ((so->so_state & SS_ISCONNECTED) == 0) {
1166 			/*
1167 			 * `sendto' and `sendmsg' are allowed on a connection-
1168 			 * based socket if it supports implied connect.
1169 			 * Return ENOTCONN if not connected and no address is
1170 			 * supplied.
1171 			 */
1172 			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1173 			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1174 				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1175 				    !(resid == 0 && clen != 0))
1176 					snderr(ENOTCONN);
1177 			} else if (addr == NULL)
1178 			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
1179 				   ENOTCONN : EDESTADDRREQ);
1180 		}
1181 		space = sbspace(&so->so_snd);
1182 		if (flags & MSG_OOB)
1183 			space += 1024;
1184 		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1185 		    clen > so->so_snd.sb_hiwat)
1186 			snderr(EMSGSIZE);
1187 		if (space < resid + clen &&
1188 		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1189 			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
1190 				snderr(EWOULDBLOCK);
1191 			sbunlock(&so->so_snd);
1192 			error = sbwait(&so->so_snd);
1193 			if (error)
1194 				goto out_locked;
1195 			goto restart;
1196 		}
1197 		SOCKBUF_UNLOCK(&so->so_snd);
1198 		space -= clen;
1199 		do {
1200 			if (uio == NULL) {
1201 				resid = 0;
1202 				if (flags & MSG_EOR)
1203 					top->m_flags |= M_EOR;
1204 			} else {
1205 				error = sosend_copyin(uio, &top, atomic,
1206 				    &space, flags);
1207 				if (error != 0) {
1208 					SOCKBUF_LOCK(&so->so_snd);
1209 					goto release;
1210 				}
1211 				resid = uio->uio_resid;
1212 			}
1213 			if (dontroute) {
1214 				SOCK_LOCK(so);
1215 				so->so_options |= SO_DONTROUTE;
1216 				SOCK_UNLOCK(so);
1217 			}
1218 			/*
1219 			 * XXX all the SBS_CANTSENDMORE checks previously
1220 			 * done could be out of date.  We could have received
1221 			 * a reset packet in an interrupt or maybe we slept
1222 			 * while doing page faults in uiomove() etc.  We
1223 			 * could probably recheck again inside the locking
1224 			 * protection here, but there are probably other
1225 			 * places that this also happens.  We must rethink
1226 			 * this.
1227 			 */
1228 			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1229 			    (flags & MSG_OOB) ? PRUS_OOB :
1230 			/*
1231 			 * If the user set MSG_EOF, the protocol understands
1232 			 * this flag, and there is nothing left to send, then
1233 			 * use PRU_SEND_EOF instead of PRU_SEND.
1234 			 */
1235 			    ((flags & MSG_EOF) &&
1236 			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1237 			     (resid <= 0)) ?
1238 				PRUS_EOF :
1239 			/* If there is more to send set PRUS_MORETOCOME. */
1240 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1241 			    top, addr, control, td);
1242 			if (dontroute) {
1243 				SOCK_LOCK(so);
1244 				so->so_options &= ~SO_DONTROUTE;
1245 				SOCK_UNLOCK(so);
1246 			}
1247 			clen = 0;
1248 			control = NULL;
1249 			top = NULL;
1250 			if (error) {
1251 				SOCKBUF_LOCK(&so->so_snd);
1252 				goto release;
1253 			}
1254 		} while (resid && space > 0);
1255 		SOCKBUF_LOCK(&so->so_snd);
1256 	} while (resid);
1257 
1258 release:
1259 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1260 	sbunlock(&so->so_snd);
1261 out_locked:
1262 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1263 	SOCKBUF_UNLOCK(&so->so_snd);
1264 out:
1265 	if (top != NULL)
1266 		m_freem(top);
1267 	if (control != NULL)
1268 		m_freem(control);
1269 	return (error);
1270 }
1271 #undef snderr
1272 
1273 int
1274 sosend(so, addr, uio, top, control, flags, td)
1275 	struct socket *so;
1276 	struct sockaddr *addr;
1277 	struct uio *uio;
1278 	struct mbuf *top;
1279 	struct mbuf *control;
1280 	int flags;
1281 	struct thread *td;
1282 {
1283 
1284 	/* XXXRW: Temporary debugging. */
1285 	KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend,
1286 	    ("sosend: protocol calls sosend"));
1287 
1288 	return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1289 	    control, flags, td));
1290 }
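
/*
 * Illustrative sketch (hypothetical caller code) of the short-count
 * semantics documented above sosend_generic(): on EINTR/ERESTART, data
 * may already have been sent, which the caller can detect via uio_resid:
 *
 *	resid = uio->uio_resid;
 *	error = sosend(so, NULL, uio, NULL, NULL, 0, td);
 *	if ((error == EINTR || error == ERESTART) &&
 *	    uio->uio_resid < resid)
 *		error = 0;	... report the partial send instead ...
 */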
1291 
1292 /*
1293  * The part of soreceive() that implements reading non-inline out-of-band
1294  * data from a socket.  For more complete comments, see soreceive(), from
1295  * which this code originated.
1296  *
1297  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1298  * unable to return an mbuf chain to the caller.
1299  */
1300 static int
1301 soreceive_rcvoob(so, uio, flags)
1302 	struct socket *so;
1303 	struct uio *uio;
1304 	int flags;
1305 {
1306 	struct protosw *pr = so->so_proto;
1307 	struct mbuf *m;
1308 	int error;
1309 
1310 	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1311 
1312 	m = m_get(M_TRYWAIT, MT_DATA);
1313 	if (m == NULL)
1314 		return (ENOBUFS);
1315 	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1316 	if (error)
1317 		goto bad;
1318 	do {
1319 #ifdef ZERO_COPY_SOCKETS
1320 		if (so_zero_copy_receive) {
1321 			int disposable;
1322 
1323 			if ((m->m_flags & M_EXT)
1324 			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1325 				disposable = 1;
1326 			else
1327 				disposable = 0;
1328 
1329 			error = uiomoveco(mtod(m, void *),
1330 					  min(uio->uio_resid, m->m_len),
1331 					  uio, disposable);
1332 		} else
1333 #endif /* ZERO_COPY_SOCKETS */
1334 		error = uiomove(mtod(m, void *),
1335 		    (int) min(uio->uio_resid, m->m_len), uio);
1336 		m = m_free(m);
1337 	} while (uio->uio_resid && error == 0 && m);
1338 bad:
1339 	if (m != NULL)
1340 		m_freem(m);
1341 	return (error);
1342 }
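
/*
 * Illustrative sketch (hypothetical consumer code): reading pending
 * out-of-band data through the public receive path, which dispatches
 * here when MSG_OOB is set in *flagsp:
 *
 *	flags = MSG_OOB;
 *	error = soreceive(so, NULL, uio, NULL, NULL, &flags);
 */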
1343 
1344 /*
1345  * Following replacement or removal of the first mbuf on the first mbuf chain
1346  * of a socket buffer, push necessary state changes back into the socket
1347  * buffer so that other consumers see the values consistently.  'nextrecord'
1348  * is the caller's locally stored copy of the original value of
1349  * sb->sb_mb->m_nextpkt, which must be restored when the lead mbuf changes.
1350  * NOTE: 'nextrecord' may be NULL.
1351  */
1352 static __inline void
1353 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1354 {
1355 
1356 	SOCKBUF_LOCK_ASSERT(sb);
1357 	/*
1358 	 * First, update for the new value of nextrecord.  If necessary, make
1359 	 * it the first record.
1360 	 */
1361 	if (sb->sb_mb != NULL)
1362 		sb->sb_mb->m_nextpkt = nextrecord;
1363 	else
1364 		sb->sb_mb = nextrecord;
1365 
1366 	/*
1367 	 * Now update any dependent socket buffer fields to reflect the new
1368 	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1369 	 * addition of a second clause that takes care of the case where
1370 	 * sb_mb has been updated, but remains the last record.
1371 	 */
1372 	if (sb->sb_mb == NULL) {
1373 		sb->sb_mbtail = NULL;
1374 		sb->sb_lastrecord = NULL;
1375 	} else if (sb->sb_mb->m_nextpkt == NULL)
1376 		sb->sb_lastrecord = sb->sb_mb;
1377 }
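
/*
 * Illustrative sketch: the pattern soreceive_generic() below uses with
 * sockbuf_pushsync() when freeing the lead mbuf of the first record:
 *
 *	nextrecord = m->m_nextpkt;
 *	sbfree(&so->so_rcv, m);
 *	so->so_rcv.sb_mb = m_free(m);
 *	sockbuf_pushsync(&so->so_rcv, nextrecord);
 */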
1378 
1380 /*
1381  * Implement receive operations on a socket.  We depend on the way that
1382  * records are added to the sockbuf by sbappend.  In particular, each record
1383  * (mbufs linked through m_next) must begin with an address if the protocol
1384  * so specifies, followed by an optional mbuf or mbufs containing ancillary
1385  * data, and then zero or more mbufs of data.  In order to allow parallelism
1386  * between network receive and copying to user space, as well as avoid
1387  * sleeping with a mutex held, we release the socket buffer mutex during the
1388  * user space copy.  Although the sockbuf is locked, new data may still be
1389  * appended, and thus we must maintain consistency of the sockbuf during that
1390  * time.
1391  *
1392  * The caller may receive the data as a single mbuf chain by supplying an
1393  * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1394  * the count in uio_resid.
1395  */
1396 int
1397 soreceive_generic(so, psa, uio, mp0, controlp, flagsp)
1398 	struct socket *so;
1399 	struct sockaddr **psa;
1400 	struct uio *uio;
1401 	struct mbuf **mp0;
1402 	struct mbuf **controlp;
1403 	int *flagsp;
1404 {
1405 	struct mbuf *m, **mp;
1406 	int flags, len, error, offset;
1407 	struct protosw *pr = so->so_proto;
1408 	struct mbuf *nextrecord;
1409 	int moff, type = 0;
1410 	int mbuf_removed = 0;
1411 	int orig_resid = uio->uio_resid;
1412 
1413 	mp = mp0;
1414 	if (psa != NULL)
1415 		*psa = NULL;
1416 	if (controlp != NULL)
1417 		*controlp = NULL;
1418 	if (flagsp != NULL)
1419 		flags = *flagsp &~ MSG_EOR;
1420 	else
1421 		flags = 0;
1422 	if (flags & MSG_OOB)
1423 		return (soreceive_rcvoob(so, uio, flags));
1424 	if (mp != NULL)
1425 		*mp = NULL;
1426 	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1427 	    && uio->uio_resid)
1428 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1429 
1430 	SOCKBUF_LOCK(&so->so_rcv);
1431 restart:
1432 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1433 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1434 	if (error)
1435 		goto out;
1436 
1437 	m = so->so_rcv.sb_mb;
1438 	/*
1439 	 * If we have less data than requested, block awaiting more (subject
1440 	 * to any timeout) if:
1441 	 *   1. the current count is less than the low water mark, or
1442 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1443 	 *	receive operation at once if we block (resid <= hiwat), and
1444 	 *   3. MSG_DONTWAIT is not set.
1445 	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1446 	 * we have to do the receive in sections, and thus risk returning a
1447 	 * short count if a timeout or signal occurs after we start.
1448 	 */
1449 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1450 	    so->so_rcv.sb_cc < uio->uio_resid) &&
1451 	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1452 	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1453 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1454 		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1455 		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1456 		    m, so->so_rcv.sb_cc));
1457 		if (so->so_error) {
1458 			if (m != NULL)
1459 				goto dontblock;
1460 			error = so->so_error;
1461 			if ((flags & MSG_PEEK) == 0)
1462 				so->so_error = 0;
1463 			goto release;
1464 		}
1465 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1466 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1467 			if (m)
1468 				goto dontblock;
1469 			else
1470 				goto release;
1471 		}
1472 		for (; m != NULL; m = m->m_next)
1473 			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1474 				m = so->so_rcv.sb_mb;
1475 				goto dontblock;
1476 			}
1477 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1478 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1479 			error = ENOTCONN;
1480 			goto release;
1481 		}
1482 		if (uio->uio_resid == 0)
1483 			goto release;
1484 		if ((so->so_state & SS_NBIO) ||
1485 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1486 			error = EWOULDBLOCK;
1487 			goto release;
1488 		}
1489 		SBLASTRECORDCHK(&so->so_rcv);
1490 		SBLASTMBUFCHK(&so->so_rcv);
1491 		sbunlock(&so->so_rcv);
1492 		error = sbwait(&so->so_rcv);
1493 		if (error)
1494 			goto out;
1495 		goto restart;
1496 	}
1497 dontblock:
1498 	/*
1499 	 * From this point onward, we maintain 'nextrecord' as a cache of the
1500 	 * pointer to the next record in the socket buffer.  We must keep the
1501 	 * various socket buffer pointers and local stack versions of the
1502 	 * pointers in sync, pushing out modifications before dropping the
1503 	 * socket buffer mutex, and re-reading them when picking it up.
1504 	 *
1505 	 * Otherwise, we will race with the network stack appending new data
1506 	 * or records onto the socket buffer by using inconsistent/stale
1507 	 * versions of the field, possibly resulting in socket buffer
1508 	 * corruption.
1509 	 *
1510 	 * By holding the high-level sblock(), we prevent simultaneous
1511 	 * readers from pulling off the front of the socket buffer.
1512 	 */
1513 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1514 	if (uio->uio_td)
1515 		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1516 	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1517 	SBLASTRECORDCHK(&so->so_rcv);
1518 	SBLASTMBUFCHK(&so->so_rcv);
1519 	nextrecord = m->m_nextpkt;
1520 	if (pr->pr_flags & PR_ADDR) {
1521 		KASSERT(m->m_type == MT_SONAME,
1522 		    ("m->m_type == %d", m->m_type));
1523 		orig_resid = 0;
1524 		if (psa != NULL)
1525 			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1526 			    M_NOWAIT);
1527 		if (flags & MSG_PEEK) {
1528 			m = m->m_next;
1529 		} else {
1530 			sbfree(&so->so_rcv, m);
1531 			mbuf_removed = 1;
1532 			so->so_rcv.sb_mb = m_free(m);
1533 			m = so->so_rcv.sb_mb;
1534 			sockbuf_pushsync(&so->so_rcv, nextrecord);
1535 		}
1536 	}
1537 
1538 	/*
1539 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1540 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1541 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1542 	 * perform externalization (or freeing if controlp == NULL).
1543 	 */
1544 	if (m != NULL && m->m_type == MT_CONTROL) {
1545 		struct mbuf *cm = NULL, *cmn;
1546 		struct mbuf **cme = &cm;
1547 
1548 		do {
1549 			if (flags & MSG_PEEK) {
1550 				if (controlp != NULL) {
1551 					*controlp = m_copy(m, 0, m->m_len);
1552 					controlp = &(*controlp)->m_next;
1553 				}
1554 				m = m->m_next;
1555 			} else {
1556 				sbfree(&so->so_rcv, m);
1557 				mbuf_removed = 1;
1558 				so->so_rcv.sb_mb = m->m_next;
1559 				m->m_next = NULL;
1560 				*cme = m;
1561 				cme = &(*cme)->m_next;
1562 				m = so->so_rcv.sb_mb;
1563 			}
1564 		} while (m != NULL && m->m_type == MT_CONTROL);
1565 		if ((flags & MSG_PEEK) == 0)
1566 			sockbuf_pushsync(&so->so_rcv, nextrecord);
1567 		while (cm != NULL) {
1568 			cmn = cm->m_next;
1569 			cm->m_next = NULL;
1570 			if (pr->pr_domain->dom_externalize != NULL) {
1571 				SOCKBUF_UNLOCK(&so->so_rcv);
1572 				error = (*pr->pr_domain->dom_externalize)
1573 				    (cm, controlp);
1574 				SOCKBUF_LOCK(&so->so_rcv);
1575 			} else if (controlp != NULL)
1576 				*controlp = cm;
1577 			else
1578 				m_freem(cm);
1579 			if (controlp != NULL) {
1580 				orig_resid = 0;
1581 				while (*controlp != NULL)
1582 					controlp = &(*controlp)->m_next;
1583 			}
1584 			cm = cmn;
1585 		}
1586 		if (m != NULL)
1587 			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1588 		else
1589 			nextrecord = so->so_rcv.sb_mb;
1590 		orig_resid = 0;
1591 	}
1592 	if (m != NULL) {
1593 		if ((flags & MSG_PEEK) == 0) {
1594 			KASSERT(m->m_nextpkt == nextrecord,
1595 			    ("soreceive: post-control, nextrecord !sync"));
1596 			if (nextrecord == NULL) {
1597 				KASSERT(so->so_rcv.sb_mb == m,
1598 				    ("soreceive: post-control, sb_mb!=m"));
1599 				KASSERT(so->so_rcv.sb_lastrecord == m,
1600 				    ("soreceive: post-control, lastrecord!=m"));
1601 			}
1602 		}
1603 		type = m->m_type;
1604 		if (type == MT_OOBDATA)
1605 			flags |= MSG_OOB;
1606 	} else {
1607 		if ((flags & MSG_PEEK) == 0) {
1608 			KASSERT(so->so_rcv.sb_mb == nextrecord,
1609 			    ("soreceive: sb_mb != nextrecord"));
1610 			if (so->so_rcv.sb_mb == NULL) {
1611 				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1612 				    ("soreceive: sb_lastercord != NULL"));
1613 			}
1614 		}
1615 	}
1616 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1617 	SBLASTRECORDCHK(&so->so_rcv);
1618 	SBLASTMBUFCHK(&so->so_rcv);
1619 
1620 	/*
1621 	 * Now continue to read any data mbufs off of the head of the socket
1622 	 * buffer until the read request is satisfied.  Note that 'type' is
1623 	 * used to store the type of any mbuf reads that have happened so far
1624 	 * such that soreceive() can stop reading if the type changes, which
1625 	 * causes soreceive() to return only one of regular data and inline
1626 	 * out-of-band data in a single socket receive operation.
1627 	 */
1628 	moff = 0;
1629 	offset = 0;
1630 	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1631 		/*
1632 		 * If the type of mbuf has changed since the last mbuf
1633 		 * examined ('type'), end the receive operation.
1634 		 */
1635 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1636 		if (m->m_type == MT_OOBDATA) {
1637 			if (type != MT_OOBDATA)
1638 				break;
1639 		} else if (type == MT_OOBDATA)
1640 			break;
1641 		else
1642 		    KASSERT(m->m_type == MT_DATA,
1643 			("m->m_type == %d", m->m_type));
1644 		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1645 		len = uio->uio_resid;
1646 		if (so->so_oobmark && len > so->so_oobmark - offset)
1647 			len = so->so_oobmark - offset;
1648 		if (len > m->m_len - moff)
1649 			len = m->m_len - moff;
1650 		/*
1651 		 * If mp is set, just pass back the mbufs.  Otherwise copy
1652 		 * them out via the uio, then free.  The sockbuf must be
1653 		 * consistent here (sb_mb points to the current mbuf and
1654 		 * m_nextpkt to the next record) when we drop the lock; we must
1655 		 * note any additions to the sockbuf when we reacquire it.
1656 		 */
1657 		if (mp == NULL) {
1658 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1659 			SBLASTRECORDCHK(&so->so_rcv);
1660 			SBLASTMBUFCHK(&so->so_rcv);
1661 			SOCKBUF_UNLOCK(&so->so_rcv);
1662 #ifdef ZERO_COPY_SOCKETS
1663 			if (so_zero_copy_receive) {
1664 				int disposable;
1665 
1666 				if ((m->m_flags & M_EXT)
1667 				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1668 					disposable = 1;
1669 				else
1670 					disposable = 0;
1671 
1672 				error = uiomoveco(mtod(m, char *) + moff,
1673 						  (int)len, uio,
1674 						  disposable);
1675 			} else
1676 #endif /* ZERO_COPY_SOCKETS */
1677 			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1678 			SOCKBUF_LOCK(&so->so_rcv);
1679 			if (error) {
1680 				/*
1681 				 * If any part of the record has been removed
1682 				 * (such as the MT_SONAME mbuf, which will
1683 				 * happen when PR_ADDR, and thus also
1684 				 * PR_ATOMIC, is set), then drop the entire
1685 				 * record to maintain the atomicity of the
1686 				 * receive operation.
1687 				 */
1688 				if (m && mbuf_removed &&
1689 				    (pr->pr_flags & PR_ATOMIC))
1690 					(void)sbdroprecord_locked(&so->so_rcv);
1691 				goto release;
1692 			}
1693 		} else
1694 			uio->uio_resid -= len;
1695 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1696 		if (len == m->m_len - moff) {
1697 			if (m->m_flags & M_EOR)
1698 				flags |= MSG_EOR;
1699 			if (flags & MSG_PEEK) {
1700 				m = m->m_next;
1701 				moff = 0;
1702 			} else {
1703 				nextrecord = m->m_nextpkt;
1704 				sbfree(&so->so_rcv, m);
1705 				if (mp != NULL) {
1706 					*mp = m;
1707 					mp = &m->m_next;
1708 					so->so_rcv.sb_mb = m = m->m_next;
1709 					*mp = NULL;
1710 				} else {
1711 					so->so_rcv.sb_mb = m_free(m);
1712 					m = so->so_rcv.sb_mb;
1713 				}
1714 				sockbuf_pushsync(&so->so_rcv, nextrecord);
1715 				SBLASTRECORDCHK(&so->so_rcv);
1716 				SBLASTMBUFCHK(&so->so_rcv);
1717 			}
1718 		} else {
1719 			if (flags & MSG_PEEK)
1720 				moff += len;
1721 			else {
1722 				if (mp != NULL) {
1723 					int copy_flag;
1724 
1725 					if (flags & MSG_DONTWAIT)
1726 						copy_flag = M_DONTWAIT;
1727 					else
1728 						copy_flag = M_TRYWAIT;
1729 					if (copy_flag == M_TRYWAIT)
1730 						SOCKBUF_UNLOCK(&so->so_rcv);
1731 					*mp = m_copym(m, 0, len, copy_flag);
1732 					if (copy_flag == M_TRYWAIT)
1733 						SOCKBUF_LOCK(&so->so_rcv);
1734 					if (*mp == NULL) {
1735 						/*
1736 						 * m_copym() couldn't
1737 						 * allocate an mbuf.  Adjust
1738 						 * uio_resid back (it was
1739 						 * adjusted down by len
1740 						 * bytes, which we didn't end
1741 						 * up "copying" over).
1742 						 */
1743 						uio->uio_resid += len;
1744 						break;
1745 					}
1746 				}
1747 				m->m_data += len;
1748 				m->m_len -= len;
1749 				so->so_rcv.sb_cc -= len;
1750 			}
1751 		}
1752 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1753 		if (so->so_oobmark) {
1754 			if ((flags & MSG_PEEK) == 0) {
1755 				so->so_oobmark -= len;
1756 				if (so->so_oobmark == 0) {
1757 					so->so_rcv.sb_state |= SBS_RCVATMARK;
1758 					break;
1759 				}
1760 			} else {
1761 				offset += len;
1762 				if (offset == so->so_oobmark)
1763 					break;
1764 			}
1765 		}
1766 		if (flags & MSG_EOR)
1767 			break;
1768 		/*
1769 		 * If the MSG_WAITALL flag is set (for a non-atomic socket), we
1770 		 * must not quit until "uio->uio_resid == 0" or an error
1771 		 * termination.  If a signal/timeout occurs, return with a
1772 		 * short count but without error.  Keep sockbuf locked
1773 		 * against other readers.
1774 		 */
1775 		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1776 		    !sosendallatonce(so) && nextrecord == NULL) {
1777 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1778 			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1779 				break;
1780 			/*
1781 			 * Notify the protocol that some data has been
1782 			 * drained before blocking.
1783 			 */
1784 			if (pr->pr_flags & PR_WANTRCVD) {
1785 				SOCKBUF_UNLOCK(&so->so_rcv);
1786 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1787 				SOCKBUF_LOCK(&so->so_rcv);
1788 			}
1789 			SBLASTRECORDCHK(&so->so_rcv);
1790 			SBLASTMBUFCHK(&so->so_rcv);
1791 			error = sbwait(&so->so_rcv);
1792 			if (error)
1793 				goto release;
1794 			m = so->so_rcv.sb_mb;
1795 			if (m != NULL)
1796 				nextrecord = m->m_nextpkt;
1797 		}
1798 	}
1799 
1800 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1801 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1802 		flags |= MSG_TRUNC;
1803 		if ((flags & MSG_PEEK) == 0)
1804 			(void) sbdroprecord_locked(&so->so_rcv);
1805 	}
1806 	if ((flags & MSG_PEEK) == 0) {
1807 		if (m == NULL) {
1808 			/*
1809 			 * First part is an inline SB_EMPTY_FIXUP().  Second
1810 			 * part makes sure sb_lastrecord is up-to-date if
1811 			 * there is still data in the socket buffer.
1812 			 */
1813 			so->so_rcv.sb_mb = nextrecord;
1814 			if (so->so_rcv.sb_mb == NULL) {
1815 				so->so_rcv.sb_mbtail = NULL;
1816 				so->so_rcv.sb_lastrecord = NULL;
1817 			} else if (nextrecord->m_nextpkt == NULL)
1818 				so->so_rcv.sb_lastrecord = nextrecord;
1819 		}
1820 		SBLASTRECORDCHK(&so->so_rcv);
1821 		SBLASTMBUFCHK(&so->so_rcv);
1822 		/*
1823 		 * If soreceive() is being done from the socket callback,
1824 		 * we don't need to generate an ACK to the peer to update
1825 		 * the window, since an ACK will be generated on return to TCP.
1826 		 */
1827 		if (!(flags & MSG_SOCALLBCK) &&
1828 		    (pr->pr_flags & PR_WANTRCVD)) {
1829 			SOCKBUF_UNLOCK(&so->so_rcv);
1830 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1831 			SOCKBUF_LOCK(&so->so_rcv);
1832 		}
1833 	}
1834 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1835 	if (orig_resid == uio->uio_resid && orig_resid &&
1836 	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1837 		sbunlock(&so->so_rcv);
1838 		goto restart;
1839 	}
1840 
1841 	if (flagsp != NULL)
1842 		*flagsp |= flags;
1843 release:
1844 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1845 	sbunlock(&so->so_rcv);
1846 out:
1847 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1848 	SOCKBUF_UNLOCK(&so->so_rcv);
1849 	return (error);
1850 }
1851 
1852 int
1853 soreceive(so, psa, uio, mp0, controlp, flagsp)
1854 	struct socket *so;
1855 	struct sockaddr **psa;
1856 	struct uio *uio;
1857 	struct mbuf **mp0;
1858 	struct mbuf **controlp;
1859 	int *flagsp;
1860 {
1861 
1862 	/* XXXRW: Temporary debugging. */
1863 	KASSERT(so->so_proto->pr_usrreqs->pru_soreceive != soreceive,
1864 	    ("soreceive: protocol calls soreceive"));
1865 
1866 	return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
1867 	    controlp, flagsp));
1868 }
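
/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * kernel consumer draining up to 'len' bytes from a socket into a kernel
 * buffer via soreceive().  The uio is marked UIO_SYSSPACE so that
 * uiomove() copies with bcopy() rather than copyout().  The function
 * name and its callers are assumptions, shown for illustration only.
 */
static __inline int
example_soreceive_kbuf(struct socket *so, void *buf, size_t len)
{
	struct uio auio;
	struct iovec aiov;
	int flags = 0;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = curthread;

	/* NULL psa/mp0/controlp: only the data bytes are wanted. */
	return (soreceive(so, NULL, &auio, NULL, NULL, &flags));
}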
1869 
1870 int
1871 soshutdown(so, how)
1872 	struct socket *so;
1873 	int how;
1874 {
1875 	struct protosw *pr = so->so_proto;
1876 
1877 	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1878 		return (EINVAL);
1879 
1880 	if (how != SHUT_WR)
1881 		sorflush(so);
1882 	if (how != SHUT_RD)
1883 		return ((*pr->pr_usrreqs->pru_shutdown)(so));
1884 	return (0);
1885 }
1886 
1887 void
1888 sorflush(so)
1889 	struct socket *so;
1890 {
1891 	struct sockbuf *sb = &so->so_rcv;
1892 	struct protosw *pr = so->so_proto;
1893 	struct sockbuf asb;
1894 
1895 	/*
1896 	 * XXXRW: This is quite ugly.  Previously, this code made a copy of
1897 	 * the socket buffer, then zeroed the original to clear the buffer
1898 	 * fields.  However, with mutexes in the socket buffer, this causes
1899 	 * problems.  We only clear the zeroable bits of the original;
1900 	 * however, we have to initialize and destroy the mutex in the copy
1901 	 * so that dom_dispose() and sbrelease() can lock it as needed.
1902 	 */
1903 	SOCKBUF_LOCK(sb);
1904 	sb->sb_flags |= SB_NOINTR;
1905 	(void) sblock(sb, M_WAITOK);
1906 	/*
1907 	 * socantrcvmore_locked() drops the socket buffer mutex so that it
1908 	 * can safely perform wakeups.  Re-acquire the mutex before
1909 	 * continuing.
1910 	 */
1911 	socantrcvmore_locked(so);
1912 	SOCKBUF_LOCK(sb);
1913 	sbunlock(sb);
1914 	/*
1915 	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
1916 	 * and mutex data unchanged.
1917 	 */
1918 	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1919 	bcopy(&sb->sb_startzero, &asb.sb_startzero,
1920 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1921 	bzero(&sb->sb_startzero,
1922 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1923 	SOCKBUF_UNLOCK(sb);
1924 
1925 	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1926 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1927 		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
1928 	sbrelease(&asb, so);
1929 	SOCKBUF_LOCK_DESTROY(&asb);
1930 }
1931 
1932 /*
1933  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
1934  * additional variant to handle the case where the option value needs to be
1935  * some kind of integer, but not a specific size.  In addition to their use
1936  * here, these functions are also called by the protocol-level pr_ctloutput()
1937  * routines.
1938  */
1939 int
1940 sooptcopyin(sopt, buf, len, minlen)
1941 	struct	sockopt *sopt;
1942 	void	*buf;
1943 	size_t	len;
1944 	size_t	minlen;
1945 {
1946 	size_t	valsize;
1947 
1948 	/*
1949 	 * If the user gives us more than we wanted, we ignore it, but if we
1950 	 * don't get the minimum length the caller wants, we return EINVAL.
1951 	 * On success, sopt->sopt_valsize is set to however much we actually
1952 	 * retrieved.
1953 	 */
1954 	if ((valsize = sopt->sopt_valsize) < minlen)
1955 		return (EINVAL);
1956 	if (valsize > len)
1957 		sopt->sopt_valsize = valsize = len;
1958 
1959 	if (sopt->sopt_td != NULL)
1960 		return (copyin(sopt->sopt_val, buf, valsize));
1961 
1962 	bcopy(sopt->sopt_val, buf, valsize);
1963 	return (0);
1964 }
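
/*
 * Hypothetical sketch (illustration only): how a protocol pr_ctloutput()
 * routine would pull an integer-sized option value out of a sockopt with
 * sooptcopyin(), requiring at least sizeof(int) and ignoring any excess
 * the caller supplied.
 */
static __inline int
example_sooptcopyin_int(struct sockopt *sopt, int *valp)
{

	return (sooptcopyin(sopt, valp, sizeof(*valp), sizeof(*valp)));
}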
1965 
1966 /*
1967  * Kernel version of setsockopt(2).
1968  *
1969  * XXX: optlen is size_t, not socklen_t
1970  */
1971 int
1972 so_setsockopt(struct socket *so, int level, int optname, void *optval,
1973     size_t optlen)
1974 {
1975 	struct sockopt sopt;
1976 
1977 	sopt.sopt_level = level;
1978 	sopt.sopt_name = optname;
1979 	sopt.sopt_dir = SOPT_SET;
1980 	sopt.sopt_val = optval;
1981 	sopt.sopt_valsize = optlen;
1982 	sopt.sopt_td = NULL;
1983 	return (sosetopt(so, &sopt));
1984 }
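
/*
 * Example (hypothetical, for illustration): a kernel consumer enabling
 * SO_KEEPALIVE on a socket it owns.  so_setsockopt() leaves sopt_td as
 * NULL, so sooptcopyin() will bcopy() the value rather than copyin()
 * it from user space.
 */
static __inline int
example_enable_keepalive(struct socket *so)
{
	int one = 1;

	return (so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one,
	    sizeof(one)));
}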
1985 
1986 int
1987 sosetopt(so, sopt)
1988 	struct socket *so;
1989 	struct sockopt *sopt;
1990 {
1991 	int	error, optval;
1992 	struct	linger l;
1993 	struct	timeval tv;
1994 	u_long  val;
1995 #ifdef MAC
1996 	struct mac extmac;
1997 #endif
1998 
1999 	error = 0;
2000 	if (sopt->sopt_level != SOL_SOCKET) {
2001 		if (so->so_proto && so->so_proto->pr_ctloutput)
2002 			return ((*so->so_proto->pr_ctloutput)
2003 				  (so, sopt));
2004 		error = ENOPROTOOPT;
2005 	} else {
2006 		switch (sopt->sopt_name) {
2007 #ifdef INET
2008 		case SO_ACCEPTFILTER:
2009 			error = do_setopt_accept_filter(so, sopt);
2010 			if (error)
2011 				goto bad;
2012 			break;
2013 #endif
2014 		case SO_LINGER:
2015 			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2016 			if (error)
2017 				goto bad;
2018 
2019 			SOCK_LOCK(so);
2020 			so->so_linger = l.l_linger;
2021 			if (l.l_onoff)
2022 				so->so_options |= SO_LINGER;
2023 			else
2024 				so->so_options &= ~SO_LINGER;
2025 			SOCK_UNLOCK(so);
2026 			break;
2027 
2028 		case SO_DEBUG:
2029 		case SO_KEEPALIVE:
2030 		case SO_DONTROUTE:
2031 		case SO_USELOOPBACK:
2032 		case SO_BROADCAST:
2033 		case SO_REUSEADDR:
2034 		case SO_REUSEPORT:
2035 		case SO_OOBINLINE:
2036 		case SO_TIMESTAMP:
2037 		case SO_BINTIME:
2038 		case SO_NOSIGPIPE:
2039 			error = sooptcopyin(sopt, &optval, sizeof optval,
2040 					    sizeof optval);
2041 			if (error)
2042 				goto bad;
2043 			SOCK_LOCK(so);
2044 			if (optval)
2045 				so->so_options |= sopt->sopt_name;
2046 			else
2047 				so->so_options &= ~sopt->sopt_name;
2048 			SOCK_UNLOCK(so);
2049 			break;
2050 
2051 		case SO_SNDBUF:
2052 		case SO_RCVBUF:
2053 		case SO_SNDLOWAT:
2054 		case SO_RCVLOWAT:
2055 			error = sooptcopyin(sopt, &optval, sizeof optval,
2056 					    sizeof optval);
2057 			if (error)
2058 				goto bad;
2059 
2060 			/*
2061 			 * Values < 1 make no sense for any of these options,
2062 			 * so disallow them.
2063 			 */
2064 			if (optval < 1) {
2065 				error = EINVAL;
2066 				goto bad;
2067 			}
2068 
2069 			switch (sopt->sopt_name) {
2070 			case SO_SNDBUF:
2071 			case SO_RCVBUF:
2072 				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2073 				    &so->so_snd : &so->so_rcv, (u_long)optval,
2074 				    so, curthread) == 0) {
2075 					error = ENOBUFS;
2076 					goto bad;
2077 				}
2078 				break;
2079 
2080 			/*
2081 			 * Make sure the low-water is never greater than the
2082 			 * high-water.
2083 			 */
2084 			case SO_SNDLOWAT:
2085 				SOCKBUF_LOCK(&so->so_snd);
2086 				so->so_snd.sb_lowat =
2087 				    (optval > so->so_snd.sb_hiwat) ?
2088 				    so->so_snd.sb_hiwat : optval;
2089 				SOCKBUF_UNLOCK(&so->so_snd);
2090 				break;
2091 			case SO_RCVLOWAT:
2092 				SOCKBUF_LOCK(&so->so_rcv);
2093 				so->so_rcv.sb_lowat =
2094 				    (optval > so->so_rcv.sb_hiwat) ?
2095 				    so->so_rcv.sb_hiwat : optval;
2096 				SOCKBUF_UNLOCK(&so->so_rcv);
2097 				break;
2098 			}
2099 			break;
2100 
2101 		case SO_SNDTIMEO:
2102 		case SO_RCVTIMEO:
2103 #ifdef COMPAT_IA32
2104 			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2105 				struct timeval32 tv32;
2106 
2107 				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2108 				    sizeof tv32);
2109 				CP(tv32, tv, tv_sec);
2110 				CP(tv32, tv, tv_usec);
2111 			} else
2112 #endif
2113 				error = sooptcopyin(sopt, &tv, sizeof tv,
2114 				    sizeof tv);
2115 			if (error)
2116 				goto bad;
2117 
2118 			/* assert(hz > 0); */
2119 			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2120 			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2121 				error = EDOM;
2122 				goto bad;
2123 			}
2124 			/* assert(tick > 0); */
2125 			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
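			/*
			 * Convert the timeval to scheduler ticks: e.g. with
			 * hz = 1000 (so tick = 1000 us), a request of
			 * { 2, 500000 } yields 2 * 1000 + 500000 / 1000 =
			 * 2500 ticks.  Tick counts that would not fit in an
			 * int are rejected below.
			 */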
2126 			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2127 			if (val > INT_MAX) {
2128 				error = EDOM;
2129 				goto bad;
2130 			}
2131 			if (val == 0 && tv.tv_usec != 0)
2132 				val = 1;
2133 
2134 			switch (sopt->sopt_name) {
2135 			case SO_SNDTIMEO:
2136 				so->so_snd.sb_timeo = val;
2137 				break;
2138 			case SO_RCVTIMEO:
2139 				so->so_rcv.sb_timeo = val;
2140 				break;
2141 			}
2142 			break;
2143 
2144 		case SO_LABEL:
2145 #ifdef MAC
2146 			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2147 			    sizeof extmac);
2148 			if (error)
2149 				goto bad;
2150 			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2151 			    so, &extmac);
2152 #else
2153 			error = EOPNOTSUPP;
2154 #endif
2155 			break;
2156 
2157 		default:
2158 			error = ENOPROTOOPT;
2159 			break;
2160 		}
2161 		if (error == 0 && so->so_proto != NULL &&
2162 		    so->so_proto->pr_ctloutput != NULL) {
2163 			(void) ((*so->so_proto->pr_ctloutput)
2164 				  (so, sopt));
2165 		}
2166 	}
2167 bad:
2168 	return (error);
2169 }
2170 
2171 /*
2172  * Helper routine for getsockopt.
2173  */
2174 int
2175 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2176 {
2177 	int	error;
2178 	size_t	valsize;
2179 
2180 	error = 0;
2181 
2182 	/*
2183 	 * Documented get behavior is that we always return a value, possibly
2184 	 * truncated to fit in the user's buffer.  Traditional behavior is
2185 	 * that we always tell the user precisely how much we copied, rather
2186 	 * than something more useful, such as the total amount we had
2187 	 * available.  Note that this interface is not idempotent; the
2188 	 * entire answer must be generated ahead of time.
2189 	 */
2190 	valsize = min(len, sopt->sopt_valsize);
2191 	sopt->sopt_valsize = valsize;
2192 	if (sopt->sopt_val != NULL) {
2193 		if (sopt->sopt_td != NULL)
2194 			error = copyout(buf, sopt->sopt_val, valsize);
2195 		else
2196 			bcopy(buf, sopt->sopt_val, valsize);
2197 	}
2198 	return (error);
2199 }
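
/*
 * Hypothetical sketch (illustration only): the conventional way a
 * getsockopt handler returns an integer option value through
 * sooptcopyout(), mirroring the 'integer:' label in sogetopt() below.
 */
static __inline int
example_sooptcopyout_int(struct sockopt *sopt, int val)
{

	return (sooptcopyout(sopt, &val, sizeof(val)));
}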
2200 
2201 int
2202 sogetopt(so, sopt)
2203 	struct socket *so;
2204 	struct sockopt *sopt;
2205 {
2206 	int	error, optval;
2207 	struct	linger l;
2208 	struct	timeval tv;
2209 #ifdef MAC
2210 	struct mac extmac;
2211 #endif
2212 
2213 	error = 0;
2214 	if (sopt->sopt_level != SOL_SOCKET) {
2215 		if (so->so_proto && so->so_proto->pr_ctloutput) {
2216 			return ((*so->so_proto->pr_ctloutput)
2217 				  (so, sopt));
2218 		} else
2219 			return (ENOPROTOOPT);
2220 	} else {
2221 		switch (sopt->sopt_name) {
2222 #ifdef INET
2223 		case SO_ACCEPTFILTER:
2224 			error = do_getopt_accept_filter(so, sopt);
2225 			break;
2226 #endif
2227 		case SO_LINGER:
2228 			SOCK_LOCK(so);
2229 			l.l_onoff = so->so_options & SO_LINGER;
2230 			l.l_linger = so->so_linger;
2231 			SOCK_UNLOCK(so);
2232 			error = sooptcopyout(sopt, &l, sizeof l);
2233 			break;
2234 
2235 		case SO_USELOOPBACK:
2236 		case SO_DONTROUTE:
2237 		case SO_DEBUG:
2238 		case SO_KEEPALIVE:
2239 		case SO_REUSEADDR:
2240 		case SO_REUSEPORT:
2241 		case SO_BROADCAST:
2242 		case SO_OOBINLINE:
2243 		case SO_ACCEPTCONN:
2244 		case SO_TIMESTAMP:
2245 		case SO_BINTIME:
2246 		case SO_NOSIGPIPE:
2247 			optval = so->so_options & sopt->sopt_name;
2248 integer:
2249 			error = sooptcopyout(sopt, &optval, sizeof optval);
2250 			break;
2251 
2252 		case SO_TYPE:
2253 			optval = so->so_type;
2254 			goto integer;
2255 
2256 		case SO_ERROR:
2257 			SOCK_LOCK(so);
2258 			optval = so->so_error;
2259 			so->so_error = 0;
2260 			SOCK_UNLOCK(so);
2261 			goto integer;
2262 
2263 		case SO_SNDBUF:
2264 			optval = so->so_snd.sb_hiwat;
2265 			goto integer;
2266 
2267 		case SO_RCVBUF:
2268 			optval = so->so_rcv.sb_hiwat;
2269 			goto integer;
2270 
2271 		case SO_SNDLOWAT:
2272 			optval = so->so_snd.sb_lowat;
2273 			goto integer;
2274 
2275 		case SO_RCVLOWAT:
2276 			optval = so->so_rcv.sb_lowat;
2277 			goto integer;
2278 
2279 		case SO_SNDTIMEO:
2280 		case SO_RCVTIMEO:
2281 			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2282 				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2283 
2284 			tv.tv_sec = optval / hz;
2285 			tv.tv_usec = (optval % hz) * tick;
2286 #ifdef COMPAT_IA32
2287 			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2288 				struct timeval32 tv32;
2289 
2290 				CP(tv, tv32, tv_sec);
2291 				CP(tv, tv32, tv_usec);
2292 				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2293 			} else
2294 #endif
2295 				error = sooptcopyout(sopt, &tv, sizeof tv);
2296 			break;
2297 
2298 		case SO_LABEL:
2299 #ifdef MAC
2300 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2301 			    sizeof(extmac));
2302 			if (error)
2303 				return (error);
2304 			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2305 			    so, &extmac);
2306 			if (error)
2307 				return (error);
2308 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2309 #else
2310 			error = EOPNOTSUPP;
2311 #endif
2312 			break;
2313 
2314 		case SO_PEERLABEL:
2315 #ifdef MAC
2316 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2317 			    sizeof(extmac));
2318 			if (error)
2319 				return (error);
2320 			error = mac_getsockopt_peerlabel(
2321 			    sopt->sopt_td->td_ucred, so, &extmac);
2322 			if (error)
2323 				return (error);
2324 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2325 #else
2326 			error = EOPNOTSUPP;
2327 #endif
2328 			break;
2329 
2330 		case SO_LISTENQLIMIT:
2331 			optval = so->so_qlimit;
2332 			goto integer;
2333 
2334 		case SO_LISTENQLEN:
2335 			optval = so->so_qlen;
2336 			goto integer;
2337 
2338 		case SO_LISTENINCQLEN:
2339 			optval = so->so_incqlen;
2340 			goto integer;
2341 
2342 		default:
2343 			error = ENOPROTOOPT;
2344 			break;
2345 		}
2346 		return (error);
2347 	}
2348 }
2349 
2350 /* XXX: Prepare an mbuf chain for (__FreeBSD__ < 3) routines. */
2351 int
2352 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2353 {
2354 	struct mbuf *m, *m_prev;
2355 	int sopt_size = sopt->sopt_valsize;
2356 
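	/*
	 * Allocate the head of the chain, upgrading to a cluster when the
	 * option value will not fit in a plain mbuf; the loop below grows
	 * the chain the same way until sopt_valsize bytes are covered.
	 */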
2357 	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2358 	if (m == NULL)
2359 		return (ENOBUFS);
2360 	if (sopt_size > MLEN) {
2361 		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2362 		if ((m->m_flags & M_EXT) == 0) {
2363 			m_free(m);
2364 			return (ENOBUFS);
2365 		}
2366 		m->m_len = min(MCLBYTES, sopt_size);
2367 	} else {
2368 		m->m_len = min(MLEN, sopt_size);
2369 	}
2370 	sopt_size -= m->m_len;
2371 	*mp = m;
2372 	m_prev = m;
2373 
2374 	while (sopt_size) {
2375 		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2376 		if (m == NULL) {
2377 			m_freem(*mp);
2378 			return (ENOBUFS);
2379 		}
2380 		if (sopt_size > MLEN) {
2381 			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2382 			    M_DONTWAIT);
2383 			if ((m->m_flags & M_EXT) == 0) {
2384 				m_freem(m);
2385 				m_freem(*mp);
2386 				return (ENOBUFS);
2387 			}
2388 			m->m_len = min(MCLBYTES, sopt_size);
2389 		} else {
2390 			m->m_len = min(MLEN, sopt_size);
2391 		}
2392 		sopt_size -= m->m_len;
2393 		m_prev->m_next = m;
2394 		m_prev = m;
2395 	}
2396 	return (0);
2397 }
2398 
2399 /* XXX: Copy sopt data into an mbuf chain, for (__FreeBSD__ < 3) routines. */
2400 int
2401 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2402 {
2403 	struct mbuf *m0 = m;
2404 
2405 	if (sopt->sopt_val == NULL)
2406 		return (0);
2407 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2408 		if (sopt->sopt_td != NULL) {
2409 			int error;
2410 
2411 			error = copyin(sopt->sopt_val, mtod(m, char *),
2412 				       m->m_len);
2413 			if (error != 0) {
2414 				m_freem(m0);
2415 				return (error);
2416 			}
2417 		} else
2418 			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2419 		sopt->sopt_valsize -= m->m_len;
2420 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2421 		m = m->m_next;
2422 	}
2423 	if (m != NULL)	/* chain should have been allocated large enough */
2424 		panic("ip6_sooptmcopyin");
2425 	return (0);
2426 }
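
/*
 * Hypothetical sketch (illustration only): pairing soopt_getm() with
 * soopt_mcopyin(), as the pre-3.0-style option routines do.  The chain
 * is sized from sopt_valsize and then filled from the option buffer; on
 * success the caller owns the chain and must m_freem() it when done.
 */
static __inline int
example_sopt_to_mbuf(struct sockopt *sopt, struct mbuf **mp)
{
	int error;

	error = soopt_getm(sopt, mp);
	if (error != 0)
		return (error);
	/* soopt_mcopyin() frees the chain itself on copyin() failure. */
	return (soopt_mcopyin(sopt, *mp));
}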
2427 
2428 /* XXX: Copy mbuf chain data into soopt, for (__FreeBSD__ < 3) routines. */
2429 int
2430 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2431 {
2432 	struct mbuf *m0 = m;
2433 	size_t valsize = 0;
2434 
2435 	if (sopt->sopt_val == NULL)
2436 		return (0);
2437 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2438 		if (sopt->sopt_td != NULL) {
2439 			int error;
2440 
2441 			error = copyout(mtod(m, char *), sopt->sopt_val,
2442 				       m->m_len);
2443 			if (error != 0) {
2444 				m_freem(m0);
2445 				return (error);
2446 			}
2447 		} else
2448 			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2449 		sopt->sopt_valsize -= m->m_len;
2450 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2451 		valsize += m->m_len;
2452 		m = m->m_next;
2453 	}
2454 	if (m != NULL) {
2455 		/* Enough buffer space should have been supplied from userland. */
2456 		m_freem(m0);
2457 		return (EINVAL);
2458 	}
2459 	sopt->sopt_valsize = valsize;
2460 	return (0);
2461 }
2462 
2463 /*
2464  * sohasoutofband(): protocol notifies socket layer of the arrival of new
2465  * out-of-band data, which will then notify socket consumers.
2466  */
2467 void
2468 sohasoutofband(so)
2469 	struct socket *so;
2470 {
2471 	if (so->so_sigio != NULL)
2472 		pgsigio(&so->so_sigio, SIGURG, 0);
2473 	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2474 }
2475 
2476 int
2477 sopoll(struct socket *so, int events, struct ucred *active_cred,
2478     struct thread *td)
2479 {
2480 
2481 	/* XXXRW: Temporary debugging. */
2482 	KASSERT(so->so_proto->pr_usrreqs->pru_sopoll != sopoll,
2483 	    ("sopoll: protocol calls sopoll"));
2484 
2485 	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2486 	    td));
2487 }
2488 
2489 int
2490 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2491     struct thread *td)
2492 {
2493 	int revents = 0;
2494 
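	/*
	 * Both socket buffer locks are held across the checks so that the
	 * readable/writable tests and any selrecord() registration happen
	 * atomically with respect to protocol activity; note that the
	 * release order (rcv, then snd) is the reverse of the acquire
	 * order.
	 */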
2495 	SOCKBUF_LOCK(&so->so_snd);
2496 	SOCKBUF_LOCK(&so->so_rcv);
2497 	if (events & (POLLIN | POLLRDNORM))
2498 		if (soreadable(so))
2499 			revents |= events & (POLLIN | POLLRDNORM);
2500 
2501 	if (events & POLLINIGNEOF)
2502 		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2503 		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2504 			revents |= POLLINIGNEOF;
2505 
2506 	if (events & (POLLOUT | POLLWRNORM))
2507 		if (sowriteable(so))
2508 			revents |= events & (POLLOUT | POLLWRNORM);
2509 
2510 	if (events & (POLLPRI | POLLRDBAND))
2511 		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2512 			revents |= events & (POLLPRI | POLLRDBAND);
2513 
2514 	if (revents == 0) {
2515 		if (events &
2516 		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2517 		     POLLRDBAND)) {
2518 			selrecord(td, &so->so_rcv.sb_sel);
2519 			so->so_rcv.sb_flags |= SB_SEL;
2520 		}
2521 
2522 		if (events & (POLLOUT | POLLWRNORM)) {
2523 			selrecord(td, &so->so_snd.sb_sel);
2524 			so->so_snd.sb_flags |= SB_SEL;
2525 		}
2526 	}
2527 
2528 	SOCKBUF_UNLOCK(&so->so_rcv);
2529 	SOCKBUF_UNLOCK(&so->so_snd);
2530 	return (revents);
2531 }
2532 
2533 int
2534 soo_kqfilter(struct file *fp, struct knote *kn)
2535 {
2536 	struct socket *so = kn->kn_fp->f_data;
2537 	struct sockbuf *sb;
2538 
2539 	switch (kn->kn_filter) {
2540 	case EVFILT_READ:
2541 		if (so->so_options & SO_ACCEPTCONN)
2542 			kn->kn_fop = &solisten_filtops;
2543 		else
2544 			kn->kn_fop = &soread_filtops;
2545 		sb = &so->so_rcv;
2546 		break;
2547 	case EVFILT_WRITE:
2548 		kn->kn_fop = &sowrite_filtops;
2549 		sb = &so->so_snd;
2550 		break;
2551 	default:
2552 		return (EINVAL);
2553 	}
2554 
2555 	SOCKBUF_LOCK(sb);
2556 	knlist_add(&sb->sb_sel.si_note, kn, 1);
2557 	sb->sb_flags |= SB_KNOTE;
2558 	SOCKBUF_UNLOCK(sb);
2559 	return (0);
2560 }
2561 
2562 static void
2563 filt_sordetach(struct knote *kn)
2564 {
2565 	struct socket *so = kn->kn_fp->f_data;
2566 
2567 	SOCKBUF_LOCK(&so->so_rcv);
2568 	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2569 	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2570 		so->so_rcv.sb_flags &= ~SB_KNOTE;
2571 	SOCKBUF_UNLOCK(&so->so_rcv);
2572 }
2573 
2574 /*ARGSUSED*/
2575 static int
2576 filt_soread(struct knote *kn, long hint)
2577 {
2578 	struct socket *so;
2579 
2580 	so = kn->kn_fp->f_data;
2581 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2582 
2583 	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2584 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2585 		kn->kn_flags |= EV_EOF;
2586 		kn->kn_fflags = so->so_error;
2587 		return (1);
2588 	} else if (so->so_error)	/* temporary udp error */
2589 		return (1);
2590 	else if (kn->kn_sfflags & NOTE_LOWAT)
2591 		return (kn->kn_data >= kn->kn_sdata);
2592 	else
2593 		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2594 }
2595 
2596 static void
2597 filt_sowdetach(struct knote *kn)
2598 {
2599 	struct socket *so = kn->kn_fp->f_data;
2600 
2601 	SOCKBUF_LOCK(&so->so_snd);
2602 	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2603 	if (knlist_empty(&so->so_snd.sb_sel.si_note))
2604 		so->so_snd.sb_flags &= ~SB_KNOTE;
2605 	SOCKBUF_UNLOCK(&so->so_snd);
2606 }
2607 
2608 /*ARGSUSED*/
2609 static int
2610 filt_sowrite(struct knote *kn, long hint)
2611 {
2612 	struct socket *so;
2613 
2614 	so = kn->kn_fp->f_data;
2615 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
2616 	kn->kn_data = sbspace(&so->so_snd);
2617 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2618 		kn->kn_flags |= EV_EOF;
2619 		kn->kn_fflags = so->so_error;
2620 		return (1);
2621 	} else if (so->so_error)	/* temporary udp error */
2622 		return (1);
2623 	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2624 	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
2625 		return (0);
2626 	else if (kn->kn_sfflags & NOTE_LOWAT)
2627 		return (kn->kn_data >= kn->kn_sdata);
2628 	else
2629 		return (kn->kn_data >= so->so_snd.sb_lowat);
2630 }
2631 
2632 /*ARGSUSED*/
2633 static int
2634 filt_solisten(struct knote *kn, long hint)
2635 {
2636 	struct socket *so = kn->kn_fp->f_data;
2637 
2638 	kn->kn_data = so->so_qlen;
2639 	return (!TAILQ_EMPTY(&so->so_comp));
2640 }
2641 
2642 int
2643 socheckuid(struct socket *so, uid_t uid)
2644 {
2645 
2646 	if (so == NULL)
2647 		return (EPERM);
2648 	if (so->so_cred->cr_uid != uid)
2649 		return (EPERM);
2650 	return (0);
2651 }
2652 
2653 static int
2654 somaxconn_sysctl(SYSCTL_HANDLER_ARGS)
2655 {
2656 	int error;
2657 	int val;
2658 
2659 	val = somaxconn;
2660 	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
2661 	if (error || !req->newptr)
2662 		return (error);
2663 
2664 	if (val < 1 || val > USHRT_MAX)
2665 		return (EINVAL);
2666 
2667 	somaxconn = val;
2668 	return (0);
2669 }
2670