xref: /freebsd/sys/kern/uipc_socket.c (revision bfe691b2f75de2224c7ceb304ebcdef2b42d4179)
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *	The Regents of the University of California.
4  * Copyright (c) 2004 The FreeBSD Foundation
5  * Copyright (c) 2004-2006 Robert N. M. Watson
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
33  */
34 
35 /*
36  * Comments on the socket life cycle:
37  *
38  * soalloc() sets up socket layer state for a socket, called only by
39  * socreate() and sonewconn().  Socket layer private.
40  *
41  * sodealloc() tears down socket layer state for a socket, called only by
42  * sofree(), socreate(), and sonewconn().  Socket layer private.
43  *
44  * pru_attach() associates protocol layer state with an allocated socket;
45  * called only once, may fail, aborting socket allocation.  This is called
46  * from socreate() and sonewconn().  Socket layer private.
47  *
48  * pru_detach() disassociates protocol layer state from an attached socket,
49  * and will be called exactly once for sockets in which pru_attach() has
50  * been successfully called.  If pru_attach() returned an error,
51  * pru_detach() will not be called.  Socket layer private.
52  *
53  * pru_abort() and pru_close() notify the protocol layer that the last
54  * consumer of a socket is starting to tear down the socket, and that the
55  * protocol should terminate the connection.  Historically, pru_abort() also
56  * detached protocol state from the socket state, but this is no longer the
57  * case.
58  *
59  * socreate() creates a socket and attaches protocol state.  This is a public
60  * interface that may be used by socket layer consumers to create new
61  * sockets.
62  *
63  * sonewconn() creates a socket and attaches protocol state.  This is a
64  * public interface that may be used by protocols to create new sockets when
65  * a new connection is received and will be available for accept() on a
66  * listen socket.
67  *
68  * soclose() destroys a socket after possibly waiting for it to disconnect.
69  * This is a public interface that socket consumers should use to close and
70  * release a socket when done with it.
71  *
72  * soabort() destroys a socket without waiting for it to disconnect (used
73  * only for incoming connections that are already partially or fully
74  * connected).  This is used internally by the socket layer when clearing
75  * listen socket queues (due to overflow or close on the listen socket), but
76  * is also a public interface protocols may use to abort connections in
77  * their incomplete listen queues should they no longer be required.  Sockets
78  * placed in completed connection listen queues should not be aborted for
79  * reasons described in the comment above the soclose() implementation.  This
80  * is not a general purpose close routine, and except in the specific
81  * circumstances described here, should not be used.
82  *
83  * sofree() will free a socket and its protocol state if all references on
84  * the socket have been released; it is the routine through which sorele()
85  * attempts to free a socket when a reference is removed.  Socket layer
86  * private; consumers should use soclose() or sorele() instead.
87  *
88  * NOTE: In addition to socreate() and soclose(), which provide a single
89  * socket reference to the consumer to be managed as required, there are two
90  * calls to explicitly manage socket references, soref(), and sorele().
91  * Currently, these are generally required only when transitioning a socket
92  * from a listen queue to a file descriptor, in order to prevent garbage
93  * collection of the socket at an untimely moment.  For a number of reasons,
94  * these interfaces are not preferred, and should be avoided.
95  */
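/*
 * Illustrative sketch (not compiled) of the consumer life cycle described
 * above, assuming an in-kernel consumer that already holds valid 'cred'
 * and 'td' references:
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP, cred, td);
 *	if (error)
 *		return (error);
 *	... use sobind()/soconnect()/sosend()/soreceive() on 'so' ...
 *	error = soclose(so);
 *
 * socreate() hands back the single managed reference; soclose() releases
 * it, after which 'so' may no longer be dereferenced.
 */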
96 
97 #include <sys/cdefs.h>
98 __FBSDID("$FreeBSD$");
99 
100 #include "opt_inet.h"
101 #include "opt_mac.h"
102 #include "opt_zero.h"
103 #include "opt_compat.h"
104 
105 #include <sys/param.h>
106 #include <sys/systm.h>
107 #include <sys/fcntl.h>
108 #include <sys/limits.h>
109 #include <sys/lock.h>
110 #include <sys/mac.h>
111 #include <sys/malloc.h>
112 #include <sys/mbuf.h>
113 #include <sys/mutex.h>
114 #include <sys/domain.h>
115 #include <sys/file.h>			/* for struct knote */
116 #include <sys/kernel.h>
117 #include <sys/event.h>
118 #include <sys/eventhandler.h>
119 #include <sys/poll.h>
120 #include <sys/proc.h>
121 #include <sys/protosw.h>
122 #include <sys/socket.h>
123 #include <sys/socketvar.h>
124 #include <sys/resourcevar.h>
125 #include <sys/signalvar.h>
126 #include <sys/stat.h>
127 #include <sys/sysctl.h>
128 #include <sys/uio.h>
129 #include <sys/jail.h>
130 
131 #include <security/mac/mac_framework.h>
132 
133 #include <vm/uma.h>
134 
135 #ifdef COMPAT_IA32
136 #include <sys/mount.h>
137 #include <compat/freebsd32/freebsd32.h>
138 
139 extern struct sysentvec ia32_freebsd_sysvec;
140 #endif
141 
142 static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
143 		    int flags);
144 
145 static void	filt_sordetach(struct knote *kn);
146 static int	filt_soread(struct knote *kn, long hint);
147 static void	filt_sowdetach(struct knote *kn);
148 static int	filt_sowrite(struct knote *kn, long hint);
149 static int	filt_solisten(struct knote *kn, long hint);
150 
151 static struct filterops solisten_filtops =
152 	{ 1, NULL, filt_sordetach, filt_solisten };
153 static struct filterops soread_filtops =
154 	{ 1, NULL, filt_sordetach, filt_soread };
155 static struct filterops sowrite_filtops =
156 	{ 1, NULL, filt_sowdetach, filt_sowrite };
157 
158 uma_zone_t socket_zone;
159 so_gen_t	so_gencnt;	/* generation count for sockets */
160 
161 int	maxsockets;
162 
163 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
164 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
165 
166 static int somaxconn = SOMAXCONN;
167 static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
168 /* XXX: we don't have SYSCTL_USHORT */
169 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
170     0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
171     "queue size");
172 static int numopensockets;
173 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
174     &numopensockets, 0, "Number of open sockets");
175 #ifdef ZERO_COPY_SOCKETS
176 /* These aren't static because they're used in other files. */
177 int so_zero_copy_send = 1;
178 int so_zero_copy_receive = 1;
179 SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
180     "Zero copy controls");
181 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
182     &so_zero_copy_receive, 0, "Enable zero copy receive");
183 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
184     &so_zero_copy_send, 0, "Enable zero copy send");
185 #endif /* ZERO_COPY_SOCKETS */
186 
187 /*
188  * accept_mtx locks down per-socket fields relating to accept queues.  See
189  * socketvar.h for an annotation of the protected fields of struct socket.
190  */
191 struct mtx accept_mtx;
192 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
193 
194 /*
195  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
196  * so_gencnt field.
197  */
198 static struct mtx so_global_mtx;
199 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_global", MTX_DEF);
200 
201 /*
202  * General IPC sysctl name space, used by sockets and a variety of other IPC
203  * types.
204  */
205 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
206 
207 /*
208  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
209  * of the change so that they can update their dependent limits as required.
210  */
211 static int
212 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
213 {
214 	int error, newmaxsockets;
215 
216 	newmaxsockets = maxsockets;
217 	error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req);
218 	if (error == 0 && req->newptr) {
219 		if (newmaxsockets > maxsockets) {
220 			maxsockets = newmaxsockets;
221 			if (maxsockets > ((maxfiles / 4) * 3)) {
222 				maxfiles = (maxsockets * 5) / 4;
223 				maxfilesperproc = (maxfiles * 9) / 10;
224 			}
225 			EVENTHANDLER_INVOKE(maxsockets_change);
226 		} else
227 			error = EINVAL;
228 	}
229 	return (error);
230 }
231 
232 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
233     &maxsockets, 0, sysctl_maxsockets, "IU",
234     "Maximum number of sockets avaliable");
235 
236 /*
237  * Initialize maxsockets.
238  */
239 static void init_maxsockets(void *ignored)
240 {
241 	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
242 	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
243 }
244 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
245 
246 /*
247  * Socket operation routines.  These routines are called by the routines in
248  * sys_socket.c or from a system process, and implement the semantics of
249  * socket operations by switching out to the protocol specific routines.
250  */
251 
252 /*
253  * Get a socket structure from our zone, and initialize it.  Note that it
254  * would probably be better to allocate socket and PCB at the same time, but
255  * I'm not convinced that all the protocols can be easily modified to do
256  * this.
257  *
258  * soalloc() returns a socket with a ref count of 0.
259  */
260 static struct socket *
261 soalloc(void)
262 {
263 	struct socket *so;
264 
265 	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
266 	if (so == NULL)
267 		return (NULL);
268 #ifdef MAC
269 	if (mac_init_socket(so, M_NOWAIT) != 0) {
270 		uma_zfree(socket_zone, so);
271 		return (NULL);
272 	}
273 #endif
274 	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
275 	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
276 	TAILQ_INIT(&so->so_aiojobq);
277 	mtx_lock(&so_global_mtx);
278 	so->so_gencnt = ++so_gencnt;
279 	++numopensockets;
280 	mtx_unlock(&so_global_mtx);
281 	return (so);
282 }
283 
284 /*
285  * Free the storage associated with a socket at the socket layer, tear down
286  * locks, labels, etc.  All protocol state is assumed already to have been
287  * torn down (and possibly never set up) by the caller.
288  */
289 static void
290 sodealloc(struct socket *so)
291 {
292 
293 	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
294 	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
295 
296 	mtx_lock(&so_global_mtx);
297 	so->so_gencnt = ++so_gencnt;
298 	--numopensockets;	/* Could be below, but faster here. */
299 	mtx_unlock(&so_global_mtx);
300 	if (so->so_rcv.sb_hiwat)
301 		(void)chgsbsize(so->so_cred->cr_uidinfo,
302 		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
303 	if (so->so_snd.sb_hiwat)
304 		(void)chgsbsize(so->so_cred->cr_uidinfo,
305 		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
306 #ifdef INET
307 	/* Remove accept filter if one is present. */
308 	if (so->so_accf != NULL)
309 		do_setopt_accept_filter(so, NULL);
310 #endif
311 #ifdef MAC
312 	mac_destroy_socket(so);
313 #endif
314 	crfree(so->so_cred);
315 	SOCKBUF_LOCK_DESTROY(&so->so_snd);
316 	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
317 	uma_zfree(socket_zone, so);
318 }
319 
320 /*
321  * socreate returns a socket with a ref count of 1.  The socket should be
322  * closed with soclose().
323  */
324 int
325 socreate(dom, aso, type, proto, cred, td)
326 	int dom;
327 	struct socket **aso;
328 	int type;
329 	int proto;
330 	struct ucred *cred;
331 	struct thread *td;
332 {
333 	struct protosw *prp;
334 	struct socket *so;
335 	int error;
336 
337 	if (proto)
338 		prp = pffindproto(dom, proto, type);
339 	else
340 		prp = pffindtype(dom, type);
341 
342 	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
343 	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
344 		return (EPROTONOSUPPORT);
345 
346 	if (jailed(cred) && jail_socket_unixiproute_only &&
347 	    prp->pr_domain->dom_family != PF_LOCAL &&
348 	    prp->pr_domain->dom_family != PF_INET &&
349 	    prp->pr_domain->dom_family != PF_ROUTE) {
350 		return (EPROTONOSUPPORT);
351 	}
352 
353 	if (prp->pr_type != type)
354 		return (EPROTOTYPE);
355 	so = soalloc();
356 	if (so == NULL)
357 		return (ENOBUFS);
358 
359 	TAILQ_INIT(&so->so_incomp);
360 	TAILQ_INIT(&so->so_comp);
361 	so->so_type = type;
362 	so->so_cred = crhold(cred);
363 	so->so_proto = prp;
364 #ifdef MAC
365 	mac_create_socket(cred, so);
366 #endif
367 	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
368 	    NULL, NULL, NULL);
369 	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
370 	    NULL, NULL, NULL);
371 	so->so_count = 1;
372 	/*
373 	 * Auto-sizing of socket buffers is managed by the protocols and
374 	 * the appropriate flags must be set in the pru_attach function.
375 	 */
376 	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
377 	if (error) {
378 		KASSERT(so->so_count == 1, ("socreate: so_count %d",
379 		    so->so_count));
380 		so->so_count = 0;
381 		sodealloc(so);
382 		return (error);
383 	}
384 	*aso = so;
385 	return (0);
386 }
387 
388 #ifdef REGRESSION
389 static int regression_sonewconn_earlytest = 1;
390 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
391     &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
392 #endif
393 
394 /*
395  * When an attempt at a new connection is noted on a socket which accepts
396  * connections, sonewconn is called.  If the connection is possible (subject
397  * to space constraints, etc.) then we allocate a new structure, properly
398  * linked into the data structure of the original socket, and return this.
399  * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
400  *
401  * Note: the ref count on the socket is 0 on return.
402  */
403 struct socket *
404 sonewconn(head, connstatus)
405 	register struct socket *head;
406 	int connstatus;
407 {
408 	register struct socket *so;
409 	int over;
410 
411 	ACCEPT_LOCK();
412 	over = (head->so_qlen > 3 * head->so_qlimit / 2);
413 	ACCEPT_UNLOCK();
414 #ifdef REGRESSION
415 	if (regression_sonewconn_earlytest && over)
416 #else
417 	if (over)
418 #endif
419 		return (NULL);
420 	so = soalloc();
421 	if (so == NULL)
422 		return (NULL);
423 	if ((head->so_options & SO_ACCEPTFILTER) != 0)
424 		connstatus = 0;
425 	so->so_head = head;
426 	so->so_type = head->so_type;
427 	so->so_options = head->so_options &~ SO_ACCEPTCONN;
428 	so->so_linger = head->so_linger;
429 	so->so_state = head->so_state | SS_NOFDREF;
430 	so->so_proto = head->so_proto;
431 	so->so_cred = crhold(head->so_cred);
432 #ifdef MAC
433 	SOCK_LOCK(head);
434 	mac_create_socket_from_socket(head, so);
435 	SOCK_UNLOCK(head);
436 #endif
437 	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
438 	    NULL, NULL, NULL);
439 	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
440 	    NULL, NULL, NULL);
441 	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
442 	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
443 		sodealloc(so);
444 		return (NULL);
445 	}
446 	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
447 	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
448 	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
449 	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
450 	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
451 	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
452 	so->so_state |= connstatus;
453 	ACCEPT_LOCK();
454 	if (connstatus) {
455 		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
456 		so->so_qstate |= SQ_COMP;
457 		head->so_qlen++;
458 	} else {
459 		/*
460 		 * Keep removing sockets from the head until there's room for
461 		 * us to insert on the tail.  In pre-locking revisions, this
462 		 * was a simple if(), but as we could be racing with other
463 		 * threads and soabort() requires dropping locks, we must
464 		 * loop waiting for the condition to be true.
465 		 */
466 		while (head->so_incqlen > head->so_qlimit) {
467 			struct socket *sp;
468 			sp = TAILQ_FIRST(&head->so_incomp);
469 			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
470 			head->so_incqlen--;
471 			sp->so_qstate &= ~SQ_INCOMP;
472 			sp->so_head = NULL;
473 			ACCEPT_UNLOCK();
474 			soabort(sp);
475 			ACCEPT_LOCK();
476 		}
477 		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
478 		so->so_qstate |= SQ_INCOMP;
479 		head->so_incqlen++;
480 	}
481 	ACCEPT_UNLOCK();
482 	if (connstatus) {
483 		sorwakeup(head);
484 		wakeup_one(&head->so_timeo);
485 	}
486 	return (so);
487 }
488 
489 int
490 sobind(so, nam, td)
491 	struct socket *so;
492 	struct sockaddr *nam;
493 	struct thread *td;
494 {
495 
496 	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
497 }
498 
499 /*
500  * solisten() transitions a socket from a non-listening state to a listening
501  * state, but can also be used to update the listen queue depth on an
502  * existing listen socket.  The protocol will call back into the sockets
503  * layer using solisten_proto_check() and solisten_proto() to check and set
504  * socket-layer listen state.  Call backs are used so that the protocol can
505  * acquire both protocol and socket layer locks in whatever order is required
506  * by the protocol.
507  *
508  * Protocol implementors are advised to hold the socket lock across the
509  * socket-layer test and set to avoid races at the socket layer.
510  */
511 int
512 solisten(so, backlog, td)
513 	struct socket *so;
514 	int backlog;
515 	struct thread *td;
516 {
517 
518 	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
519 }
520 
521 int
522 solisten_proto_check(so)
523 	struct socket *so;
524 {
525 
526 	SOCK_LOCK_ASSERT(so);
527 
528 	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
529 	    SS_ISDISCONNECTING))
530 		return (EINVAL);
531 	return (0);
532 }
533 
534 void
535 solisten_proto(so, backlog)
536 	struct socket *so;
537 	int backlog;
538 {
539 
540 	SOCK_LOCK_ASSERT(so);
541 
542 	if (backlog < 0 || backlog > somaxconn)
543 		backlog = somaxconn;
544 	so->so_qlimit = backlog;
545 	so->so_options |= SO_ACCEPTCONN;
546 }
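/*
 * Sketch (hypothetical protocol, not the literal code of any in tree) of
 * how a pru_listen implementation is expected to use the two callbacks
 * above, holding the socket lock across the check and the set:
 *
 *	static int
 *	foo_listen(struct socket *so, int backlog, struct thread *td)
 *	{
 *		struct foopcb *fp = sotofoopcb(so);
 *		int error;
 *
 *		FOO_LOCK(fp);
 *		SOCK_LOCK(so);
 *		error = solisten_proto_check(so);
 *		if (error == 0)
 *			solisten_proto(so, backlog);
 *		SOCK_UNLOCK(so);
 *		FOO_UNLOCK(fp);
 *		return (error);
 *	}
 *
 * 'foo_listen', 'struct foopcb', 'sotofoopcb' and the FOO_* macros are
 * invented for illustration; the ordering of locks and callbacks is the
 * contract this file defines.
 */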
547 
548 /*
549  * Attempt to free a socket.  This should really be sotryfree().
550  *
551  * sofree() will succeed if:
552  *
553  * - There are no outstanding file descriptor references or related consumers
554  *   (so_count == 0).
555  *
556  * - The socket has been closed by user space, if ever open (SS_NOFDREF).
557  *
558  * - The protocol does not have an outstanding strong reference on the socket
559  *   (SS_PROTOREF).
560  *
561  * - The socket is not in a completed connection queue, where a process has
562  *   been notified that it is present.  If it were removed, the user process
563  *   may block in accept() despite select() saying the socket was ready.
564  *
565  * Otherwise, it will quietly abort so that a future call to sofree(), when
566  * conditions are right, can succeed.
567  */
568 void
569 sofree(so)
570 	struct socket *so;
571 {
572 	struct protosw *pr = so->so_proto;
573 	struct socket *head;
574 
575 	ACCEPT_LOCK_ASSERT();
576 	SOCK_LOCK_ASSERT(so);
577 
578 	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
579 	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
580 		SOCK_UNLOCK(so);
581 		ACCEPT_UNLOCK();
582 		return;
583 	}
584 
585 	head = so->so_head;
586 	if (head != NULL) {
587 		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
588 		    (so->so_qstate & SQ_INCOMP) != 0,
589 		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
590 		    "SQ_INCOMP"));
591 		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
592 		    (so->so_qstate & SQ_INCOMP) == 0,
593 		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
594 		TAILQ_REMOVE(&head->so_incomp, so, so_list);
595 		head->so_incqlen--;
596 		so->so_qstate &= ~SQ_INCOMP;
597 		so->so_head = NULL;
598 	}
599 	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
600 	    (so->so_qstate & SQ_INCOMP) == 0,
601 	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
602 	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
603 	if (so->so_options & SO_ACCEPTCONN) {
604 		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
605 		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
606 	}
607 	SOCK_UNLOCK(so);
608 	ACCEPT_UNLOCK();
609 
610 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
611 		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
612 	if (pr->pr_usrreqs->pru_detach != NULL)
613 		(*pr->pr_usrreqs->pru_detach)(so);
614 
615 	/*
616 	 * From this point on, we assume that no other references to this
617 	 * socket exist anywhere else in the stack.  Therefore, no locks need
618 	 * to be acquired or held.
619 	 *
620 	 * We used to do a lot of socket buffer and socket locking here, as
621 	 * well as invoke sorflush() and perform wakeups.  The direct call to
622 	 * dom_dispose() and sbrelease_internal() are an inlining of what was
623 	 * necessary from sorflush().
624 	 *
625 	 * Notice that the socket buffer and kqueue state are torn down
626 	 * before calling pru_detach.  This means that protocols should not
627 	 * assume they can perform socket wakeups, etc., in their detach
628 	 * code.
629 	 */
630 	KASSERT((so->so_snd.sb_flags & SB_LOCK) == 0, ("sofree: snd sblock"));
631 	KASSERT((so->so_rcv.sb_flags & SB_LOCK) == 0, ("sofree: rcv sblock"));
632 	sbdestroy(&so->so_snd, so);
633 	sbdestroy(&so->so_rcv, so);
634 	knlist_destroy(&so->so_rcv.sb_sel.si_note);
635 	knlist_destroy(&so->so_snd.sb_sel.si_note);
636 	sodealloc(so);
637 }
638 
639 /*
640  * Close a socket on last file table reference removal.  Initiate disconnect
641  * if connected.  Free socket when disconnect complete.
642  *
643  * This function will sorele() the socket.  Note that soclose() may be called
644  * prior to the ref count reaching zero.  The actual socket structure will
645  * not be freed until the ref count reaches zero.
646  */
647 int
648 soclose(so)
649 	struct socket *so;
650 {
651 	int error = 0;
652 
653 	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
654 
655 	funsetown(&so->so_sigio);
656 	if (so->so_state & SS_ISCONNECTED) {
657 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
658 			error = sodisconnect(so);
659 			if (error)
660 				goto drop;
661 		}
662 		if (so->so_options & SO_LINGER) {
663 			if ((so->so_state & SS_ISDISCONNECTING) &&
664 			    (so->so_state & SS_NBIO))
665 				goto drop;
666 			while (so->so_state & SS_ISCONNECTED) {
667 				error = tsleep(&so->so_timeo,
668 				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
669 				if (error)
670 					break;
671 			}
672 		}
673 	}
674 
675 drop:
676 	if (so->so_proto->pr_usrreqs->pru_close != NULL)
677 		(*so->so_proto->pr_usrreqs->pru_close)(so);
678 	if (so->so_options & SO_ACCEPTCONN) {
679 		struct socket *sp;
680 		ACCEPT_LOCK();
681 		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
682 			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
683 			so->so_incqlen--;
684 			sp->so_qstate &= ~SQ_INCOMP;
685 			sp->so_head = NULL;
686 			ACCEPT_UNLOCK();
687 			soabort(sp);
688 			ACCEPT_LOCK();
689 		}
690 		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
691 			TAILQ_REMOVE(&so->so_comp, sp, so_list);
692 			so->so_qlen--;
693 			sp->so_qstate &= ~SQ_COMP;
694 			sp->so_head = NULL;
695 			ACCEPT_UNLOCK();
696 			soabort(sp);
697 			ACCEPT_LOCK();
698 		}
699 		ACCEPT_UNLOCK();
700 	}
701 	ACCEPT_LOCK();
702 	SOCK_LOCK(so);
703 	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
704 	so->so_state |= SS_NOFDREF;
705 	sorele(so);
706 	return (error);
707 }
708 
709 /*
710  * soabort() is used to abruptly tear down a connection, such as when a
711  * resource limit is reached (listen queue depth exceeded), or if a listen
712  * socket is closed while there are sockets waiting to be accepted.
713  *
714  * This interface is tricky, because it is called on an unreferenced socket,
715  * and must be called only by a thread that has actually removed the socket
716  * from the listen queue it was on, or races with other threads are risked.
717  *
718  * This interface will call into the protocol code, so must not be called
719  * with any socket locks held.  Protocols do call it while holding their own
720  * recursible protocol mutexes, but this is something that should be subject
721  * to review in the future.
722  */
723 void
724 soabort(so)
725 	struct socket *so;
726 {
727 
728 	/*
729 	 * In as much as is possible, assert that no references to this
730 	 * socket are held.  This is not quite the same as asserting that the
731 	 * current thread is responsible for arranging for no references, but
732 	 * is as close as we can get for now.
733 	 */
734 	KASSERT(so->so_count == 0, ("soabort: so_count"));
735 	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
736 	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
737 	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
738 	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
739 
740 	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
741 		(*so->so_proto->pr_usrreqs->pru_abort)(so);
742 	ACCEPT_LOCK();
743 	SOCK_LOCK(so);
744 	sofree(so);
745 }
746 
747 int
748 soaccept(so, nam)
749 	struct socket *so;
750 	struct sockaddr **nam;
751 {
752 	int error;
753 
754 	SOCK_LOCK(so);
755 	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
756 	so->so_state &= ~SS_NOFDREF;
757 	SOCK_UNLOCK(so);
758 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
759 	return (error);
760 }
761 
762 int
763 soconnect(so, nam, td)
764 	struct socket *so;
765 	struct sockaddr *nam;
766 	struct thread *td;
767 {
768 	int error;
769 
770 	if (so->so_options & SO_ACCEPTCONN)
771 		return (EOPNOTSUPP);
772 	/*
773 	 * If protocol is connection-based, can only connect once.
774 	 * Otherwise, if connected, try to disconnect first.  This allows
775 	 * user to disconnect by connecting to, e.g., a null address.
776 	 */
777 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
778 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
779 	    (error = sodisconnect(so)))) {
780 		error = EISCONN;
781 	} else {
782 		/*
783 		 * Prevent accumulated error from previous connection from
784 		 * biting us.
785 		 */
786 		so->so_error = 0;
787 		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
788 	}
789 
790 	return (error);
791 }
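/*
 * For example (userland, illustrative): the disconnect-by-reconnecting
 * behavior described above is commonly driven from user space by
 * connecting a datagram socket to an address of family AF_UNSPEC:
 *
 *	struct sockaddr sa;
 *
 *	memset(&sa, 0, sizeof(sa));
 *	sa.sa_len = sizeof(sa);
 *	sa.sa_family = AF_UNSPEC;
 *	(void)connect(s, &sa, sizeof(sa));
 *
 * The sodisconnect() above is what dissolves the association; the
 * subsequent pru_connect may fail harmlessly depending on the protocol.
 */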
792 
793 int
794 soconnect2(so1, so2)
795 	struct socket *so1;
796 	struct socket *so2;
797 {
798 
799 	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
800 }
801 
802 int
803 sodisconnect(so)
804 	struct socket *so;
805 {
806 	int error;
807 
808 	if ((so->so_state & SS_ISCONNECTED) == 0)
809 		return (ENOTCONN);
810 	if (so->so_state & SS_ISDISCONNECTING)
811 		return (EALREADY);
812 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
813 	return (error);
814 }
815 
816 #ifdef ZERO_COPY_SOCKETS
817 struct so_zerocopy_stats {
818 	int size_ok;
819 	int align_ok;
820 	int found_ifp;
821 };
822 struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
823 #include <netinet/in.h>
824 #include <net/route.h>
825 #include <netinet/in_pcb.h>
826 #include <vm/vm.h>
827 #include <vm/vm_page.h>
828 #include <vm/vm_object.h>
829 
830 /*
831  * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
832  * sosend_dgram() and sosend_generic() use m_uiotombuf().
833  *
834  * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
835  * all of the data referenced by the uio.  If desired, it uses zero-copy.
836  * *space will be updated to reflect data copied in.
837  *
838  * NB: If atomic I/O is requested, the caller must already have checked that
839  * space can hold resid bytes.
840  *
841  * NB: In the event of an error, the caller may need to free the partial
842  * chain pointed to by *retmp.  The contents of both *uio and *space may be
843  * modified even in the case of an error.
844  */
845 static int
846 sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
847     int flags)
848 {
849 	struct mbuf *m, **mp, *top;
850 	long len, resid;
851 	int error;
852 #ifdef ZERO_COPY_SOCKETS
853 	int cow_send;
854 #endif
855 
856 	*retmp = top = NULL;
857 	mp = &top;
858 	len = 0;
859 	resid = uio->uio_resid;
860 	error = 0;
861 	do {
862 #ifdef ZERO_COPY_SOCKETS
863 		cow_send = 0;
864 #endif /* ZERO_COPY_SOCKETS */
865 		if (resid >= MINCLSIZE) {
866 #ifdef ZERO_COPY_SOCKETS
867 			if (top == NULL) {
868 				m = m_gethdr(M_WAITOK, MT_DATA);
869 				m->m_pkthdr.len = 0;
870 				m->m_pkthdr.rcvif = NULL;
871 			} else
872 				m = m_get(M_WAITOK, MT_DATA);
873 			if (so_zero_copy_send &&
874 			    resid>=PAGE_SIZE &&
875 			    *space>=PAGE_SIZE &&
876 			    uio->uio_iov->iov_len>=PAGE_SIZE) {
877 				so_zerocp_stats.size_ok++;
878 				so_zerocp_stats.align_ok++;
879 				cow_send = socow_setup(m, uio);
880 				len = cow_send;
881 			}
882 			if (!cow_send) {
883 				m_clget(m, M_WAITOK);
884 				len = min(min(MCLBYTES, resid), *space);
885 			}
886 #else /* ZERO_COPY_SOCKETS */
887 			if (top == NULL) {
888 				m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
889 				m->m_pkthdr.len = 0;
890 				m->m_pkthdr.rcvif = NULL;
891 			} else
892 				m = m_getcl(M_TRYWAIT, MT_DATA, 0);
893 			len = min(min(MCLBYTES, resid), *space);
894 #endif /* ZERO_COPY_SOCKETS */
895 		} else {
896 			if (top == NULL) {
897 				m = m_gethdr(M_TRYWAIT, MT_DATA);
898 				m->m_pkthdr.len = 0;
899 				m->m_pkthdr.rcvif = NULL;
900 
901 				len = min(min(MHLEN, resid), *space);
902 				/*
903 				 * For datagram protocols, leave room
904 				 * for protocol headers in first mbuf.
905 				 */
906 				if (atomic && m && len < MHLEN)
907 					MH_ALIGN(m, len);
908 			} else {
909 				m = m_get(M_TRYWAIT, MT_DATA);
910 				len = min(min(MLEN, resid), *space);
911 			}
912 		}
913 		if (m == NULL) {
914 			error = ENOBUFS;
915 			goto out;
916 		}
917 
918 		*space -= len;
919 #ifdef ZERO_COPY_SOCKETS
920 		if (cow_send)
921 			error = 0;
922 		else
923 #endif /* ZERO_COPY_SOCKETS */
924 		error = uiomove(mtod(m, void *), (int)len, uio);
925 		resid = uio->uio_resid;
926 		m->m_len = len;
927 		*mp = m;
928 		top->m_pkthdr.len += len;
929 		if (error)
930 			goto out;
931 		mp = &m->m_next;
932 		if (resid <= 0) {
933 			if (flags & MSG_EOR)
934 				top->m_flags |= M_EOR;
935 			break;
936 		}
937 	} while (*space > 0 && atomic);
938 out:
939 	*retmp = top;
940 	return (error);
941 }
942 #endif /*ZERO_COPY_SOCKETS*/
943 
944 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
945 
946 int
947 sosend_dgram(so, addr, uio, top, control, flags, td)
948 	struct socket *so;
949 	struct sockaddr *addr;
950 	struct uio *uio;
951 	struct mbuf *top;
952 	struct mbuf *control;
953 	int flags;
954 	struct thread *td;
955 {
956 	long space, resid;
957 	int clen = 0, error, dontroute;
958 #ifdef ZERO_COPY_SOCKETS
959 	int atomic = sosendallatonce(so) || top;
960 #endif
961 
962 	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
963 	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
964 	    ("sosend_dgram: !PR_ATOMIC"));
965 
966 	if (uio != NULL)
967 		resid = uio->uio_resid;
968 	else
969 		resid = top->m_pkthdr.len;
970 	/*
971 	 * In theory resid should be unsigned.  However, space must be
972 	 * signed, as it might be less than 0 if we over-committed, and we
973 	 * must use a signed comparison of space and resid.  On the other
974 	 * hand, a negative resid causes us to loop sending 0-length
975 	 * segments to the protocol.
976 	 *
977 	 * Unlike in sosend_generic(), no MSG_EOR/SOCK_STREAM check is
978 	 * needed here, as the socket is asserted to be SOCK_DGRAM.
979 	 */
980 	if (resid < 0) {
981 		error = EINVAL;
982 		goto out;
983 	}
984 
985 	dontroute =
986 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
987 	if (td != NULL)
988 		td->td_proc->p_stats->p_ru.ru_msgsnd++;
989 	if (control != NULL)
990 		clen = control->m_len;
991 
992 	SOCKBUF_LOCK(&so->so_snd);
993 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
994 		SOCKBUF_UNLOCK(&so->so_snd);
995 		error = EPIPE;
996 		goto out;
997 	}
998 	if (so->so_error) {
999 		error = so->so_error;
1000 		so->so_error = 0;
1001 		SOCKBUF_UNLOCK(&so->so_snd);
1002 		goto out;
1003 	}
1004 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1005 		/*
1006 		 * `sendto' and `sendmsg' are allowed on a connection-based
1007 		 * socket if it supports implied connect.  Return ENOTCONN if
1008 		 * not connected and no address is supplied.
1009 		 */
1010 		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1011 		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1012 			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1013 			    !(resid == 0 && clen != 0)) {
1014 				SOCKBUF_UNLOCK(&so->so_snd);
1015 				error = ENOTCONN;
1016 				goto out;
1017 			}
1018 		} else if (addr == NULL) {
1019 			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1020 				error = ENOTCONN;
1021 			else
1022 				error = EDESTADDRREQ;
1023 			SOCKBUF_UNLOCK(&so->so_snd);
1024 			goto out;
1025 		}
1026 	}
1027 
1028 	/*
1029 	 * Do we need MSG_OOB support in SOCK_DGRAM?  The signedness
1030 	 * handling of 'space' here may be a problem and need fixing.
1031 	 */
1032 	space = sbspace(&so->so_snd);
1033 	if (flags & MSG_OOB)
1034 		space += 1024;
1035 	space -= clen;
1036 	SOCKBUF_UNLOCK(&so->so_snd);
1037 	if (resid > space) {
1038 		error = EMSGSIZE;
1039 		goto out;
1040 	}
1041 	if (uio == NULL) {
1042 		resid = 0;
1043 		if (flags & MSG_EOR)
1044 			top->m_flags |= M_EOR;
1045 	} else {
1046 #ifdef ZERO_COPY_SOCKETS
1047 		error = sosend_copyin(uio, &top, atomic, &space, flags);
1048 		if (error)
1049 			goto out;
1050 #else
1051 		/*
1052 		 * Copy the data from userland into an mbuf chain.
1053 		 * If no data is to be copied in, a single empty mbuf
1054 		 * is returned.
1055 		 */
1056 		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1057 		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1058 		if (top == NULL) {
1059 			error = EFAULT;	/* only possible error */
1060 			goto out;
1061 		}
1062 		space -= resid - uio->uio_resid;
1063 #endif
1064 		resid = uio->uio_resid;
1065 	}
1066 	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1067 	/*
1068 	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1069 	 * than with.
1070 	 */
1071 	if (dontroute) {
1072 		SOCK_LOCK(so);
1073 		so->so_options |= SO_DONTROUTE;
1074 		SOCK_UNLOCK(so);
1075 	}
1076 	/*
1077 	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1078 	 * of date.  We could have received a reset packet in an interrupt or
1079 	 * maybe we slept while doing page faults in uiomove() etc.  We could
1080 	 * probably recheck again inside the locking protection here, but
1081 	 * there are probably other places that this also happens.  We must
1082 	 * rethink this.
1083 	 */
1084 	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1085 	    (flags & MSG_OOB) ? PRUS_OOB :
1086 	/*
1087 	 * If the user set MSG_EOF, the protocol understands this flag, and
1088 	 * there is nothing left to send, then pass PRUS_EOF to pru_send().
1089 	 */
1090 	    ((flags & MSG_EOF) &&
1091 	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1092 	     (resid <= 0)) ?
1093 		PRUS_EOF :
1094 		/* If there is more to send set PRUS_MORETOCOME */
1095 		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1096 		top, addr, control, td);
1097 	if (dontroute) {
1098 		SOCK_LOCK(so);
1099 		so->so_options &= ~SO_DONTROUTE;
1100 		SOCK_UNLOCK(so);
1101 	}
1102 	clen = 0;
1103 	control = NULL;
1104 	top = NULL;
1105 out:
1106 	if (top != NULL)
1107 		m_freem(top);
1108 	if (control != NULL)
1109 		m_freem(control);
1110 	return (error);
1111 }
1112 
1113 /*
1114  * Send on a socket.  If send must go all at once and message is larger than
1115  * send buffering, then hard error.  Lock against other senders.  If must go
1116  * all at once and not enough room now, then inform user that this would
1117  * block and do nothing.  Otherwise, if nonblocking, send as much as
1118  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1119  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1120  * in mbuf chain must be small enough to send all at once.
1121  *
1122  * Returns nonzero on error, timeout or signal; callers must check for short
1123  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1124  * on return.
1125  */
1126 #define	snderr(errno)	{ error = (errno); goto release; }
1127 int
1128 sosend_generic(so, addr, uio, top, control, flags, td)
1129 	struct socket *so;
1130 	struct sockaddr *addr;
1131 	struct uio *uio;
1132 	struct mbuf *top;
1133 	struct mbuf *control;
1134 	int flags;
1135 	struct thread *td;
1136 {
1137 	long space, resid;
1138 	int clen = 0, error, dontroute;
1139 	int atomic = sosendallatonce(so) || top;
1140 
1141 	if (uio != NULL)
1142 		resid = uio->uio_resid;
1143 	else
1144 		resid = top->m_pkthdr.len;
1145 	/*
1146 	 * In theory resid should be unsigned.  However, space must be
1147 	 * signed, as it might be less than 0 if we over-committed, and we
1148 	 * must use a signed comparison of space and resid.  On the other
1149 	 * hand, a negative resid causes us to loop sending 0-length
1150 	 * segments to the protocol.
1151 	 *
1152 	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1153 	 * type sockets since that's an error.
1154 	 */
1155 	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1156 		error = EINVAL;
1157 		goto out;
1158 	}
1159 
1160 	dontroute =
1161 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1162 	    (so->so_proto->pr_flags & PR_ATOMIC);
1163 	if (td != NULL)
1164 		td->td_proc->p_stats->p_ru.ru_msgsnd++;
1165 	if (control != NULL)
1166 		clen = control->m_len;
1167 
1168 	SOCKBUF_LOCK(&so->so_snd);
1169 restart:
1170 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1171 	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1172 	if (error)
1173 		goto out_locked;
1174 	do {
1175 		SOCKBUF_LOCK_ASSERT(&so->so_snd);
1176 		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
1177 			snderr(EPIPE);
1178 		if (so->so_error) {
1179 			error = so->so_error;
1180 			so->so_error = 0;
1181 			goto release;
1182 		}
1183 		if ((so->so_state & SS_ISCONNECTED) == 0) {
1184 			/*
1185 			 * `sendto' and `sendmsg' are allowed on a connection-
1186 			 * based socket if it supports implied connect.
1187 			 * Return ENOTCONN if not connected and no address is
1188 			 * supplied.
1189 			 */
1190 			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1191 			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1192 				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1193 				    !(resid == 0 && clen != 0))
1194 					snderr(ENOTCONN);
1195 			} else if (addr == NULL)
1196 			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
1197 				   ENOTCONN : EDESTADDRREQ);
1198 		}
1199 		space = sbspace(&so->so_snd);
1200 		if (flags & MSG_OOB)
1201 			space += 1024;
1202 		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1203 		    clen > so->so_snd.sb_hiwat)
1204 			snderr(EMSGSIZE);
1205 		if (space < resid + clen &&
1206 		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1207 			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
1208 				snderr(EWOULDBLOCK);
1209 			sbunlock(&so->so_snd);
1210 			error = sbwait(&so->so_snd);
1211 			if (error)
1212 				goto out_locked;
1213 			goto restart;
1214 		}
1215 		SOCKBUF_UNLOCK(&so->so_snd);
1216 		space -= clen;
1217 		do {
1218 			if (uio == NULL) {
1219 				resid = 0;
1220 				if (flags & MSG_EOR)
1221 					top->m_flags |= M_EOR;
1222 			} else {
1223 #ifdef ZERO_COPY_SOCKETS
1224 				error = sosend_copyin(uio, &top, atomic,
1225 				    &space, flags);
1226 				if (error != 0) {
1227 					SOCKBUF_LOCK(&so->so_snd);
1228 					goto release;
1229 				}
1230 #else
1231 				/*
1232 				 * Copy the data from userland into an mbuf
1233 				 * chain.  If no data is to be copied in,
1234 				 * a single empty mbuf is returned.
1235 				 */
1236 				top = m_uiotombuf(uio, M_WAITOK, space,
1237 				    (atomic ? max_hdr : 0),
1238 				    (atomic ? M_PKTHDR : 0) |
1239 				    ((flags & MSG_EOR) ? M_EOR : 0));
1240 				if (top == NULL) {
1241 					SOCKBUF_LOCK(&so->so_snd);
1242 					error = EFAULT; /* only possible error */
1243 					goto release;
1244 				}
1245 				space -= resid - uio->uio_resid;
1246 #endif
1247 				resid = uio->uio_resid;
1248 			}
1249 			if (dontroute) {
1250 				SOCK_LOCK(so);
1251 				so->so_options |= SO_DONTROUTE;
1252 				SOCK_UNLOCK(so);
1253 			}
1254 			/*
1255 			 * XXX all the SBS_CANTSENDMORE checks previously
1256 			 * done could be out of date.  We could have received
1257 			 * a reset packet in an interrupt or maybe we slept
1258 			 * while doing page faults in uiomove() etc.  We
1259 			 * could probably recheck again inside the locking
1260 			 * protection here, but there are probably other
1261 			 * places that this also happens.  We must rethink
1262 			 * this.
1263 			 */
1264 			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1265 			    (flags & MSG_OOB) ? PRUS_OOB :
1266 			/*
1267 			 * If the user set MSG_EOF, the protocol understands
1268 			 * this flag, and there is nothing left to send, then
1269 			 * pass PRUS_EOF to pru_send().
1270 			 */
1271 			    ((flags & MSG_EOF) &&
1272 			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1273 			     (resid <= 0)) ?
1274 				PRUS_EOF :
1275 			/* If there is more to send set PRUS_MORETOCOME. */
1276 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1277 			    top, addr, control, td);
1278 			if (dontroute) {
1279 				SOCK_LOCK(so);
1280 				so->so_options &= ~SO_DONTROUTE;
1281 				SOCK_UNLOCK(so);
1282 			}
1283 			clen = 0;
1284 			control = NULL;
1285 			top = NULL;
1286 			if (error) {
1287 				SOCKBUF_LOCK(&so->so_snd);
1288 				goto release;
1289 			}
1290 		} while (resid && space > 0);
1291 		SOCKBUF_LOCK(&so->so_snd);
1292 	} while (resid);
1293 
1294 release:
1295 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1296 	sbunlock(&so->so_snd);
1297 out_locked:
1298 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1299 	SOCKBUF_UNLOCK(&so->so_snd);
1300 out:
1301 	if (top != NULL)
1302 		m_freem(top);
1303 	if (control != NULL)
1304 		m_freem(control);
1305 	return (error);
1306 }
1307 #undef snderr
1308 
1309 int
1310 sosend(so, addr, uio, top, control, flags, td)
1311 	struct socket *so;
1312 	struct sockaddr *addr;
1313 	struct uio *uio;
1314 	struct mbuf *top;
1315 	struct mbuf *control;
1316 	int flags;
1317 	struct thread *td;
1318 {
1319 
1320 	/* XXXRW: Temporary debugging. */
1321 	KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend,
1322 	    ("sosend: protocol calls sosend"));
1323 
1324 	return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1325 	    control, flags, td));
1326 }
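/*
 * Illustrative (hypothetical) wiring: protocols select a send
 * implementation by pointing pr_usrreqs->pru_sosend at one of the
 * routines above, e.g.:
 *
 *	struct pr_usrreqs foo_usrreqs = {
 *		...
 *		.pru_sosend = sosend_generic,
 *		...
 *	};
 *
 * Datagram protocols that satisfy the SOCK_DGRAM/PR_ATOMIC requirements
 * of sosend_dgram() may substitute it as a cheaper alternative.
 */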
1327 
1328 /*
1329  * The part of soreceive() that implements reading non-inline out-of-band
1330  * data from a socket.  For more complete comments, see soreceive(), from
1331  * which this code originated.
1332  *
1333  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1334  * unable to return an mbuf chain to the caller.
1335  */
1336 static int
1337 soreceive_rcvoob(so, uio, flags)
1338 	struct socket *so;
1339 	struct uio *uio;
1340 	int flags;
1341 {
1342 	struct protosw *pr = so->so_proto;
1343 	struct mbuf *m;
1344 	int error;
1345 
1346 	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1347 
1348 	m = m_get(M_TRYWAIT, MT_DATA);
1349 	if (m == NULL)
1350 		return (ENOBUFS);
1351 	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1352 	if (error)
1353 		goto bad;
1354 	do {
1355 #ifdef ZERO_COPY_SOCKETS
1356 		if (so_zero_copy_receive) {
1357 			int disposable;
1358 
1359 			if ((m->m_flags & M_EXT)
1360 			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1361 				disposable = 1;
1362 			else
1363 				disposable = 0;
1364 
1365 			error = uiomoveco(mtod(m, void *),
1366 					  min(uio->uio_resid, m->m_len),
1367 					  uio, disposable);
1368 		} else
1369 #endif /* ZERO_COPY_SOCKETS */
1370 		error = uiomove(mtod(m, void *),
1371 		    (int) min(uio->uio_resid, m->m_len), uio);
1372 		m = m_free(m);
1373 	} while (uio->uio_resid && error == 0 && m);
1374 bad:
1375 	if (m != NULL)
1376 		m_freem(m);
1377 	return (error);
1378 }
1379 
1380 /*
1381  * Following replacement or removal of the first mbuf on the first mbuf chain
1382  * of a socket buffer, push necessary state changes back into the socket
1383  * buffer so that other consumers see the values consistently.  'nextrecord'
1384  * is the callers locally stored value of the original value of
1385  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1386  * NOTE: 'nextrecord' may be NULL.
1387  */
1388 static __inline void
1389 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1390 {
1391 
1392 	SOCKBUF_LOCK_ASSERT(sb);
1393 	/*
1394 	 * First, update for the new value of nextrecord.  If necessary, make
1395 	 * it the first record.
1396 	 */
1397 	if (sb->sb_mb != NULL)
1398 		sb->sb_mb->m_nextpkt = nextrecord;
1399 	else
1400 		sb->sb_mb = nextrecord;
1401 
1402 	/*
1403 	 * Now update any dependent socket buffer fields to reflect the new
1404 	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1405 	 * addition of a second clause that takes care of the case where
1406 	 * sb_mb has been updated, but remains the last record.
1407 	 */
1408 	if (sb->sb_mb == NULL) {
1409 		sb->sb_mbtail = NULL;
1410 		sb->sb_lastrecord = NULL;
1411 	} else if (sb->sb_mb->m_nextpkt == NULL)
1412 		sb->sb_lastrecord = sb->sb_mb;
1413 }
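/*
 * For illustration: if the leading record was originally
 *
 *	sb_mb -> m1 -> m2	(with m1->m_nextpkt == nextrecord)
 *
 * and the caller has freed m1 so that sb_mb now points at m2, the record
 * linkage is stale until sockbuf_pushsync() stores 'nextrecord' into
 * m2->m_nextpkt and fixes up sb_mbtail/sb_lastrecord for the empty and
 * final-record cases.
 */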
1414 
1416 /*
1417  * Implement receive operations on a socket.  We depend on the way that
1418  * records are added to the sockbuf by sbappend.  In particular, each record
1419  * (mbufs linked through m_next) must begin with an address if the protocol
1420  * so specifies, followed by an optional mbuf or mbufs containing ancillary
1421  * data, and then zero or more mbufs of data.  In order to allow parallelism
1422  * between network receive and copying to user space, as well as avoid
1423  * sleeping with a mutex held, we release the socket buffer mutex during the
1424  * user space copy.  Although the sockbuf is locked, new data may still be
1425  * appended, and thus we must maintain consistency of the sockbuf during that
1426  * time.
1427  *
1428  * The caller may receive the data as a single mbuf chain by supplying an
1429  * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1430  * the count in uio_resid.
1431  */
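/*
 * For example, a record appended by a PR_ADDR protocol such as UDP is
 * laid out (illustratively) as:
 *
 *	sb_mb -> [MT_SONAME] -> [MT_CONTROL] ... -> [MT_DATA] ...
 *	             |
 *	         m_nextpkt -> (next record, or NULL)
 *
 * soreceive_generic() consumes the address mbuf, then any control mbufs,
 * then the data mbufs, keeping its cached 'nextrecord' in sync with the
 * socket buffer at each step.
 */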
1432 int
1433 soreceive_generic(so, psa, uio, mp0, controlp, flagsp)
1434 	struct socket *so;
1435 	struct sockaddr **psa;
1436 	struct uio *uio;
1437 	struct mbuf **mp0;
1438 	struct mbuf **controlp;
1439 	int *flagsp;
1440 {
1441 	struct mbuf *m, **mp;
1442 	int flags, len, error, offset;
1443 	struct protosw *pr = so->so_proto;
1444 	struct mbuf *nextrecord;
1445 	int moff, type = 0;
1446 	int orig_resid = uio->uio_resid;
1447 
1448 	mp = mp0;
1449 	if (psa != NULL)
1450 		*psa = NULL;
1451 	if (controlp != NULL)
1452 		*controlp = NULL;
1453 	if (flagsp != NULL)
1454 		flags = *flagsp &~ MSG_EOR;
1455 	else
1456 		flags = 0;
1457 	if (flags & MSG_OOB)
1458 		return (soreceive_rcvoob(so, uio, flags));
1459 	if (mp != NULL)
1460 		*mp = NULL;
1461 	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1462 	    && uio->uio_resid)
1463 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1464 
1465 	SOCKBUF_LOCK(&so->so_rcv);
1466 restart:
1467 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1468 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1469 	if (error)
1470 		goto out;
1471 
1472 	m = so->so_rcv.sb_mb;
1473 	/*
1474 	 * If we have less data than requested, block awaiting more (subject
1475 	 * to any timeout) if:
1476 	 *   1. the current count is less than the low water mark, or
1477 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1478 	 *	receive operation at once if we block (resid <= hiwat), and
1479 	 *   3. MSG_DONTWAIT is not set.
1480 	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1481 	 * we have to do the receive in sections, and thus risk returning a
1482 	 * short count if a timeout or signal occurs after we start.
1483 	 */
1484 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1485 	    so->so_rcv.sb_cc < uio->uio_resid) &&
1486 	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1487 	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1488 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1489 		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1490 		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1491 		    m, so->so_rcv.sb_cc));
1492 		if (so->so_error) {
1493 			if (m != NULL)
1494 				goto dontblock;
1495 			error = so->so_error;
1496 			if ((flags & MSG_PEEK) == 0)
1497 				so->so_error = 0;
1498 			goto release;
1499 		}
1500 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1501 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1502 			if (m)
1503 				goto dontblock;
1504 			else
1505 				goto release;
1506 		}
1507 		for (; m != NULL; m = m->m_next)
1508 			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1509 				m = so->so_rcv.sb_mb;
1510 				goto dontblock;
1511 			}
1512 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1513 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1514 			error = ENOTCONN;
1515 			goto release;
1516 		}
1517 		if (uio->uio_resid == 0)
1518 			goto release;
1519 		if ((so->so_state & SS_NBIO) ||
1520 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1521 			error = EWOULDBLOCK;
1522 			goto release;
1523 		}
1524 		SBLASTRECORDCHK(&so->so_rcv);
1525 		SBLASTMBUFCHK(&so->so_rcv);
1526 		sbunlock(&so->so_rcv);
1527 		error = sbwait(&so->so_rcv);
1528 		if (error)
1529 			goto out;
1530 		goto restart;
1531 	}
1532 dontblock:
1533 	/*
1534 	 * From this point onward, we maintain 'nextrecord' as a cache of the
1535 	 * pointer to the next record in the socket buffer.  We must keep the
1536 	 * various socket buffer pointers and local stack versions of the
1537 	 * pointers in sync, pushing out modifications before dropping the
1538 	 * socket buffer mutex, and re-reading them when picking it up.
1539 	 *
1540 	 * Otherwise, we will race with the network stack appending new data
1541 	 * or records onto the socket buffer by using inconsistent/stale
1542 	 * versions of the field, possibly resulting in socket buffer
1543 	 * corruption.
1544 	 *
1545 	 * By holding the high-level sblock(), we prevent simultaneous
1546 	 * readers from pulling off the front of the socket buffer.
1547 	 */
1548 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1549 	if (uio->uio_td)
1550 		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1551 	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1552 	SBLASTRECORDCHK(&so->so_rcv);
1553 	SBLASTMBUFCHK(&so->so_rcv);
1554 	nextrecord = m->m_nextpkt;
1555 	if (pr->pr_flags & PR_ADDR) {
1556 		KASSERT(m->m_type == MT_SONAME,
1557 		    ("m->m_type == %d", m->m_type));
1558 		orig_resid = 0;
1559 		if (psa != NULL)
1560 			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1561 			    M_NOWAIT);
1562 		if (flags & MSG_PEEK) {
1563 			m = m->m_next;
1564 		} else {
1565 			sbfree(&so->so_rcv, m);
1566 			so->so_rcv.sb_mb = m_free(m);
1567 			m = so->so_rcv.sb_mb;
1568 			sockbuf_pushsync(&so->so_rcv, nextrecord);
1569 		}
1570 	}
1571 
1572 	/*
1573 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1574 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1575 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1576 	 * perform externalization (or freeing if controlp == NULL).
1577 	 */
1578 	if (m != NULL && m->m_type == MT_CONTROL) {
1579 		struct mbuf *cm = NULL, *cmn;
1580 		struct mbuf **cme = &cm;
1581 
1582 		do {
1583 			if (flags & MSG_PEEK) {
1584 				if (controlp != NULL) {
1585 					*controlp = m_copy(m, 0, m->m_len);
1586 					controlp = &(*controlp)->m_next;
1587 				}
1588 				m = m->m_next;
1589 			} else {
1590 				sbfree(&so->so_rcv, m);
1591 				so->so_rcv.sb_mb = m->m_next;
1592 				m->m_next = NULL;
1593 				*cme = m;
1594 				cme = &(*cme)->m_next;
1595 				m = so->so_rcv.sb_mb;
1596 			}
1597 		} while (m != NULL && m->m_type == MT_CONTROL);
1598 		if ((flags & MSG_PEEK) == 0)
1599 			sockbuf_pushsync(&so->so_rcv, nextrecord);
1600 		while (cm != NULL) {
1601 			cmn = cm->m_next;
1602 			cm->m_next = NULL;
1603 			if (pr->pr_domain->dom_externalize != NULL) {
1604 				SOCKBUF_UNLOCK(&so->so_rcv);
1605 				error = (*pr->pr_domain->dom_externalize)
1606 				    (cm, controlp);
1607 				SOCKBUF_LOCK(&so->so_rcv);
1608 			} else if (controlp != NULL)
1609 				*controlp = cm;
1610 			else
1611 				m_freem(cm);
1612 			if (controlp != NULL) {
1613 				orig_resid = 0;
1614 				while (*controlp != NULL)
1615 					controlp = &(*controlp)->m_next;
1616 			}
1617 			cm = cmn;
1618 		}
1619 		if (m != NULL)
1620 			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1621 		else
1622 			nextrecord = so->so_rcv.sb_mb;
1623 		orig_resid = 0;
1624 	}
1625 	if (m != NULL) {
1626 		if ((flags & MSG_PEEK) == 0) {
1627 			KASSERT(m->m_nextpkt == nextrecord,
1628 			    ("soreceive: post-control, nextrecord !sync"));
1629 			if (nextrecord == NULL) {
1630 				KASSERT(so->so_rcv.sb_mb == m,
1631 				    ("soreceive: post-control, sb_mb!=m"));
1632 				KASSERT(so->so_rcv.sb_lastrecord == m,
1633 				    ("soreceive: post-control, lastrecord!=m"));
1634 			}
1635 		}
1636 		type = m->m_type;
1637 		if (type == MT_OOBDATA)
1638 			flags |= MSG_OOB;
1639 	} else {
1640 		if ((flags & MSG_PEEK) == 0) {
1641 			KASSERT(so->so_rcv.sb_mb == nextrecord,
1642 			    ("soreceive: sb_mb != nextrecord"));
1643 			if (so->so_rcv.sb_mb == NULL) {
1644 				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1645 				    ("soreceive: sb_lastercord != NULL"));
1646 			}
1647 		}
1648 	}
1649 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1650 	SBLASTRECORDCHK(&so->so_rcv);
1651 	SBLASTMBUFCHK(&so->so_rcv);
1652 
1653 	/*
1654 	 * Now continue to read any data mbufs off of the head of the socket
1655 	 * buffer until the read request is satisfied.  Note that 'type' is
1656 	 * used to store the type of any mbuf reads that have happened so far
1657 	 * such that soreceive() can stop reading if the type changes, which
1658 	 * causes soreceive() to return only one of regular data and inline
1659 	 * out-of-band data in a single socket receive operation.
1660 	 */
1661 	moff = 0;
1662 	offset = 0;
1663 	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1664 		/*
1665 		 * If the type of mbuf has changed since the last mbuf
1666 		 * examined ('type'), end the receive operation.
1667 		 */
1668 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1669 		if (m->m_type == MT_OOBDATA) {
1670 			if (type != MT_OOBDATA)
1671 				break;
1672 		} else if (type == MT_OOBDATA)
1673 			break;
1674 		else
1675 		    KASSERT(m->m_type == MT_DATA,
1676 			("m->m_type == %d", m->m_type));
1677 		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1678 		len = uio->uio_resid;
1679 		if (so->so_oobmark && len > so->so_oobmark - offset)
1680 			len = so->so_oobmark - offset;
1681 		if (len > m->m_len - moff)
1682 			len = m->m_len - moff;
1683 		/*
1684 		 * If mp is set, just pass back the mbufs.  Otherwise copy
1685 		 * them out via the uio, then free.  The sockbuf must be
1686 		 * consistent here (sb_mb points to the current mbuf and
1687 		 * m_nextpkt to the next record) when we drop the sockbuf
1688 		 * lock; we must note any additions made while it was unlocked.
1689 		 */
1690 		if (mp == NULL) {
1691 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1692 			SBLASTRECORDCHK(&so->so_rcv);
1693 			SBLASTMBUFCHK(&so->so_rcv);
1694 			SOCKBUF_UNLOCK(&so->so_rcv);
1695 #ifdef ZERO_COPY_SOCKETS
1696 			if (so_zero_copy_receive) {
1697 				int disposable;
1698 
1699 				if ((m->m_flags & M_EXT)
1700 				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1701 					disposable = 1;
1702 				else
1703 					disposable = 0;
1704 
1705 				error = uiomoveco(mtod(m, char *) + moff,
1706 						  (int)len, uio,
1707 						  disposable);
1708 			} else
1709 #endif /* ZERO_COPY_SOCKETS */
1710 			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1711 			SOCKBUF_LOCK(&so->so_rcv);
1712 			if (error) {
1713 				/*
1714 				 * The MT_SONAME mbuf has already been removed
1715 				 * from the record, so it is necessary to
1716 				 * remove the data mbufs, if any, to preserve
1717 				 * the invariant in the case of PR_ADDR that
1718 				 * requires MT_SONAME mbufs at the head of
1719 				 * each record.
1720 				 */
1721 				if (m && pr->pr_flags & PR_ATOMIC &&
1722 				    ((flags & MSG_PEEK) == 0))
1723 					(void)sbdroprecord_locked(&so->so_rcv);
1724 				goto release;
1725 			}
1726 		} else
1727 			uio->uio_resid -= len;
1728 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1729 		if (len == m->m_len - moff) {
1730 			if (m->m_flags & M_EOR)
1731 				flags |= MSG_EOR;
1732 			if (flags & MSG_PEEK) {
1733 				m = m->m_next;
1734 				moff = 0;
1735 			} else {
1736 				nextrecord = m->m_nextpkt;
1737 				sbfree(&so->so_rcv, m);
1738 				if (mp != NULL) {
1739 					*mp = m;
1740 					mp = &m->m_next;
1741 					so->so_rcv.sb_mb = m = m->m_next;
1742 					*mp = NULL;
1743 				} else {
1744 					so->so_rcv.sb_mb = m_free(m);
1745 					m = so->so_rcv.sb_mb;
1746 				}
1747 				sockbuf_pushsync(&so->so_rcv, nextrecord);
1748 				SBLASTRECORDCHK(&so->so_rcv);
1749 				SBLASTMBUFCHK(&so->so_rcv);
1750 			}
1751 		} else {
1752 			if (flags & MSG_PEEK)
1753 				moff += len;
1754 			else {
1755 				if (mp != NULL) {
1756 					int copy_flag;
1757 
1758 					if (flags & MSG_DONTWAIT)
1759 						copy_flag = M_DONTWAIT;
1760 					else
1761 						copy_flag = M_TRYWAIT;
1762 					if (copy_flag == M_TRYWAIT)
1763 						SOCKBUF_UNLOCK(&so->so_rcv);
1764 					*mp = m_copym(m, 0, len, copy_flag);
1765 					if (copy_flag == M_TRYWAIT)
1766 						SOCKBUF_LOCK(&so->so_rcv);
1767 					if (*mp == NULL) {
1768 						/*
1769 						 * m_copym() couldn't
1770 						 * allocate an mbuf.  Adjust
1771 						 * uio_resid back (it was
1772 						 * adjusted down by len
1773 						 * bytes, which we didn't end
1774 						 * up "copying" over).
1775 						 */
1776 						uio->uio_resid += len;
1777 						break;
1778 					}
1779 				}
1780 				m->m_data += len;
1781 				m->m_len -= len;
1782 				so->so_rcv.sb_cc -= len;
1783 			}
1784 		}
1785 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1786 		if (so->so_oobmark) {
1787 			if ((flags & MSG_PEEK) == 0) {
1788 				so->so_oobmark -= len;
1789 				if (so->so_oobmark == 0) {
1790 					so->so_rcv.sb_state |= SBS_RCVATMARK;
1791 					break;
1792 				}
1793 			} else {
1794 				offset += len;
1795 				if (offset == so->so_oobmark)
1796 					break;
1797 			}
1798 		}
1799 		if (flags & MSG_EOR)
1800 			break;
1801 		/*
1802 		 * If the MSG_WAITALL flag is set (for non-atomic sockets), we
1803 		 * must not quit until "uio->uio_resid == 0" or an error
1804 		 * terminates the receive.  If a signal/timeout occurs, return
1805 		 * with a short count but without error.  Keep the sockbuf
1806 		 * locked against other readers.
1807 		 */
1808 		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1809 		    !sosendallatonce(so) && nextrecord == NULL) {
1810 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1811 			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1812 				break;
1813 			/*
1814 			 * Notify the protocol that some data has been
1815 			 * drained before blocking.
1816 			 */
1817 			if (pr->pr_flags & PR_WANTRCVD) {
1818 				SOCKBUF_UNLOCK(&so->so_rcv);
1819 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1820 				SOCKBUF_LOCK(&so->so_rcv);
1821 			}
1822 			SBLASTRECORDCHK(&so->so_rcv);
1823 			SBLASTMBUFCHK(&so->so_rcv);
1824 			error = sbwait(&so->so_rcv);
1825 			if (error)
1826 				goto release;
1827 			m = so->so_rcv.sb_mb;
1828 			if (m != NULL)
1829 				nextrecord = m->m_nextpkt;
1830 		}
1831 	}
1832 
1833 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1834 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1835 		flags |= MSG_TRUNC;
1836 		if ((flags & MSG_PEEK) == 0)
1837 			(void) sbdroprecord_locked(&so->so_rcv);
1838 	}
1839 	if ((flags & MSG_PEEK) == 0) {
1840 		if (m == NULL) {
1841 			/*
1842 			 * First part is an inline SB_EMPTY_FIXUP().  Second
1843 			 * part makes sure sb_lastrecord is up-to-date if
1844 			 * there is still data in the socket buffer.
1845 			 */
1846 			so->so_rcv.sb_mb = nextrecord;
1847 			if (so->so_rcv.sb_mb == NULL) {
1848 				so->so_rcv.sb_mbtail = NULL;
1849 				so->so_rcv.sb_lastrecord = NULL;
1850 			} else if (nextrecord->m_nextpkt == NULL)
1851 				so->so_rcv.sb_lastrecord = nextrecord;
1852 		}
1853 		SBLASTRECORDCHK(&so->so_rcv);
1854 		SBLASTMBUFCHK(&so->so_rcv);
1855 		/*
1856 		 * If soreceive() is being done from the socket callback,
1857 		 * then we don't need to generate an ACK to the peer to update
1858 		 * the window, since the ACK will be generated on return to TCP.
1859 		 */
1860 		if (!(flags & MSG_SOCALLBCK) &&
1861 		    (pr->pr_flags & PR_WANTRCVD)) {
1862 			SOCKBUF_UNLOCK(&so->so_rcv);
1863 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1864 			SOCKBUF_LOCK(&so->so_rcv);
1865 		}
1866 	}
1867 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1868 	if (orig_resid == uio->uio_resid && orig_resid &&
1869 	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1870 		sbunlock(&so->so_rcv);
1871 		goto restart;
1872 	}
1873 
1874 	if (flagsp != NULL)
1875 		*flagsp |= flags;
1876 release:
1877 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1878 	sbunlock(&so->so_rcv);
1879 out:
1880 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1881 	SOCKBUF_UNLOCK(&so->so_rcv);
1882 	return (error);
1883 }
1884 
1885 int
1886 soreceive(so, psa, uio, mp0, controlp, flagsp)
1887 	struct socket *so;
1888 	struct sockaddr **psa;
1889 	struct uio *uio;
1890 	struct mbuf **mp0;
1891 	struct mbuf **controlp;
1892 	int *flagsp;
1893 {
1894 
1895 	/* XXXRW: Temporary debugging. */
1896 	KASSERT(so->so_proto->pr_usrreqs->pru_soreceive != soreceive,
1897 	    ("soreceive: protocol calls soreceive"));
1898 
1899 	return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
1900 	    controlp, flagsp));
1901 }
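/*
 * Usage sketch (hypothetical in-kernel caller, not part of this file):
 * receive up to buflen bytes from a socket into a kernel buffer by
 * building a UIO_SYSSPACE uio and dispatching through soreceive().
 *
 *	struct iovec iov;
 *	struct uio auio;
 *	int error, flags = 0;
 *
 *	iov.iov_base = buf;
 *	iov.iov_len = buflen;
 *	auio.uio_iov = &iov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = buflen;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_READ;
 *	auio.uio_td = curthread;
 *	error = soreceive(so, NULL, &auio, NULL, NULL, &flags);
 */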
1902 
1903 int
1904 soshutdown(so, how)
1905 	struct socket *so;
1906 	int how;
1907 {
1908 	struct protosw *pr = so->so_proto;
1909 
1910 	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1911 		return (EINVAL);
1912 
1913 	if (how != SHUT_WR)
1914 		sorflush(so);
1915 	if (how != SHUT_RD)
1916 		return ((*pr->pr_usrreqs->pru_shutdown)(so));
1917 	return (0);
1918 }
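/*
 * Usage sketch (hypothetical caller): shut down both directions of a
 * connected socket.  Per the logic above, the read side is handled
 * entirely in the socket layer by sorflush(), while only the write side
 * is passed down to the protocol via pru_shutdown().
 *
 *	error = soshutdown(so, SHUT_RDWR);
 */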
1919 
1920 void
1921 sorflush(so)
1922 	struct socket *so;
1923 {
1924 	struct sockbuf *sb = &so->so_rcv;
1925 	struct protosw *pr = so->so_proto;
1926 	struct sockbuf asb;
1927 
1928 	/*
1929 	 * XXXRW: This is quite ugly.  Previously, this code made a copy of
1930 	 * the socket buffer, then zero'd the original to clear the buffer
1931 	 * fields.  However, with mutexes in the socket buffer, this causes
1932 	 * problems.  We only clear the zeroable bits of the original;
1933 	 * however, we have to initialize and destroy the mutex in the copy
1934 	 * so that dom_dispose() and sbrelease() can lock it as needed.
1935 	 */
1936 	SOCKBUF_LOCK(sb);
1937 	sb->sb_flags |= SB_NOINTR;
1938 	(void) sblock(sb, M_WAITOK);
1939 	/*
1940 	 * socantrcvmore_locked() drops the socket buffer mutex so that it
1941 	 * can safely perform wakeups.  Re-acquire the mutex before
1942 	 * continuing.
1943 	 */
1944 	socantrcvmore_locked(so);
1945 	SOCKBUF_LOCK(sb);
1946 	sbunlock(sb);
1947 	/*
1948 	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
1949 	 * and mutex data unchanged.
1950 	 */
1951 	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1952 	bcopy(&sb->sb_startzero, &asb.sb_startzero,
1953 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1954 	bzero(&sb->sb_startzero,
1955 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1956 	SOCKBUF_UNLOCK(sb);
1957 
1958 	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1959 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1960 		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
1961 	sbrelease(&asb, so);
1962 	SOCKBUF_LOCK_DESTROY(&asb);
1963 }
1964 
1965 /*
1966  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
1967  * additional variant to handle the case where the option value needs to be
1968  * some kind of integer, but not a specific size.  In addition to their use
1969  * here, these functions are also called by the protocol-level pr_ctloutput()
1970  * routines.
1971  */
1972 int
1973 sooptcopyin(sopt, buf, len, minlen)
1974 	struct	sockopt *sopt;
1975 	void	*buf;
1976 	size_t	len;
1977 	size_t	minlen;
1978 {
1979 	size_t	valsize;
1980 
1981 	/*
1982 	 * If the user gives us more than we wanted, we ignore it, but if we
1983 	 * don't get the minimum length the caller wants, we return EINVAL.
1984 	 * On success, sopt->sopt_valsize is set to however much we actually
1985 	 * retrieved.
1986 	 */
1987 	if ((valsize = sopt->sopt_valsize) < minlen)
1988 		return EINVAL;
1989 	if (valsize > len)
1990 		sopt->sopt_valsize = valsize = len;
1991 
1992 	if (sopt->sopt_td != NULL)
1993 		return (copyin(sopt->sopt_val, buf, valsize));
1994 
1995 	bcopy(sopt->sopt_val, buf, valsize);
1996 	return (0);
1997 }
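/*
 * Typical consumer sketch (hypothetical pr_ctloutput fragment): copy in an
 * integer-sized option value; values shorter than minlen are rejected with
 * EINVAL, and any excess is ignored, per the contract above.
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error)
 *		return (error);
 */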
1998 
1999 /*
2000  * Kernel version of setsockopt(2).
2001  *
2002  * XXX: optlen is size_t, not socklen_t
2003  */
2004 int
2005 so_setsockopt(struct socket *so, int level, int optname, void *optval,
2006     size_t optlen)
2007 {
2008 	struct sockopt sopt;
2009 
2010 	sopt.sopt_level = level;
2011 	sopt.sopt_name = optname;
2012 	sopt.sopt_dir = SOPT_SET;
2013 	sopt.sopt_val = optval;
2014 	sopt.sopt_valsize = optlen;
2015 	sopt.sopt_td = NULL;
2016 	return (sosetopt(so, &sopt));
2017 }
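/*
 * Usage sketch (hypothetical in-kernel caller): enable SO_REUSEADDR on a
 * socket.  Because sopt_td is set to NULL above, sooptcopyin() treats the
 * value as a kernel pointer and uses bcopy() rather than copyin().
 *
 *	int on = 1;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
 */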
2018 
2019 int
2020 sosetopt(so, sopt)
2021 	struct socket *so;
2022 	struct sockopt *sopt;
2023 {
2024 	int	error, optval;
2025 	struct	linger l;
2026 	struct	timeval tv;
2027 	u_long  val;
2028 #ifdef MAC
2029 	struct mac extmac;
2030 #endif
2031 
2032 	error = 0;
2033 	if (sopt->sopt_level != SOL_SOCKET) {
2034 		if (so->so_proto && so->so_proto->pr_ctloutput)
2035 			return ((*so->so_proto->pr_ctloutput)
2036 				  (so, sopt));
2037 		error = ENOPROTOOPT;
2038 	} else {
2039 		switch (sopt->sopt_name) {
2040 #ifdef INET
2041 		case SO_ACCEPTFILTER:
2042 			error = do_setopt_accept_filter(so, sopt);
2043 			if (error)
2044 				goto bad;
2045 			break;
2046 #endif
2047 		case SO_LINGER:
2048 			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2049 			if (error)
2050 				goto bad;
2051 
2052 			SOCK_LOCK(so);
2053 			so->so_linger = l.l_linger;
2054 			if (l.l_onoff)
2055 				so->so_options |= SO_LINGER;
2056 			else
2057 				so->so_options &= ~SO_LINGER;
2058 			SOCK_UNLOCK(so);
2059 			break;
2060 
2061 		case SO_DEBUG:
2062 		case SO_KEEPALIVE:
2063 		case SO_DONTROUTE:
2064 		case SO_USELOOPBACK:
2065 		case SO_BROADCAST:
2066 		case SO_REUSEADDR:
2067 		case SO_REUSEPORT:
2068 		case SO_OOBINLINE:
2069 		case SO_TIMESTAMP:
2070 		case SO_BINTIME:
2071 		case SO_NOSIGPIPE:
2072 			error = sooptcopyin(sopt, &optval, sizeof optval,
2073 					    sizeof optval);
2074 			if (error)
2075 				goto bad;
2076 			SOCK_LOCK(so);
2077 			if (optval)
2078 				so->so_options |= sopt->sopt_name;
2079 			else
2080 				so->so_options &= ~sopt->sopt_name;
2081 			SOCK_UNLOCK(so);
2082 			break;
2083 
2084 		case SO_SNDBUF:
2085 		case SO_RCVBUF:
2086 		case SO_SNDLOWAT:
2087 		case SO_RCVLOWAT:
2088 			error = sooptcopyin(sopt, &optval, sizeof optval,
2089 					    sizeof optval);
2090 			if (error)
2091 				goto bad;
2092 
2093 			/*
2094 			 * Values < 1 make no sense for any of these options,
2095 			 * so disallow them.
2096 			 */
2097 			if (optval < 1) {
2098 				error = EINVAL;
2099 				goto bad;
2100 			}
2101 
2102 			switch (sopt->sopt_name) {
2103 			case SO_SNDBUF:
2104 			case SO_RCVBUF:
2105 				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2106 				    &so->so_snd : &so->so_rcv, (u_long)optval,
2107 				    so, curthread) == 0) {
2108 					error = ENOBUFS;
2109 					goto bad;
2110 				}
2111 				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2112 				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2113 				break;
2114 
2115 			/*
2116 			 * Make sure the low-water is never greater than the
2117 			 * high-water.
2118 			 */
2119 			case SO_SNDLOWAT:
2120 				SOCKBUF_LOCK(&so->so_snd);
2121 				so->so_snd.sb_lowat =
2122 				    (optval > so->so_snd.sb_hiwat) ?
2123 				    so->so_snd.sb_hiwat : optval;
2124 				SOCKBUF_UNLOCK(&so->so_snd);
2125 				break;
2126 			case SO_RCVLOWAT:
2127 				SOCKBUF_LOCK(&so->so_rcv);
2128 				so->so_rcv.sb_lowat =
2129 				    (optval > so->so_rcv.sb_hiwat) ?
2130 				    so->so_rcv.sb_hiwat : optval;
2131 				SOCKBUF_UNLOCK(&so->so_rcv);
2132 				break;
2133 			}
2134 			break;
2135 
2136 		case SO_SNDTIMEO:
2137 		case SO_RCVTIMEO:
2138 #ifdef COMPAT_IA32
2139 			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2140 				struct timeval32 tv32;
2141 
2142 				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2143 				    sizeof tv32);
2144 				CP(tv32, tv, tv_sec);
2145 				CP(tv32, tv, tv_usec);
2146 			} else
2147 #endif
2148 				error = sooptcopyin(sopt, &tv, sizeof tv,
2149 				    sizeof tv);
2150 			if (error)
2151 				goto bad;
2152 
2153 			/* assert(hz > 0); */
2154 			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2155 			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2156 				error = EDOM;
2157 				goto bad;
2158 			}
2159 			/* assert(tick > 0); */
2160 			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2161 			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2162 			if (val > INT_MAX) {
2163 				error = EDOM;
2164 				goto bad;
2165 			}
2166 			if (val == 0 && tv.tv_usec != 0)
2167 				val = 1;
2168 
2169 			switch (sopt->sopt_name) {
2170 			case SO_SNDTIMEO:
2171 				so->so_snd.sb_timeo = val;
2172 				break;
2173 			case SO_RCVTIMEO:
2174 				so->so_rcv.sb_timeo = val;
2175 				break;
2176 			}
2177 			break;
2178 
2179 		case SO_LABEL:
2180 #ifdef MAC
2181 			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2182 			    sizeof extmac);
2183 			if (error)
2184 				goto bad;
2185 			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2186 			    so, &extmac);
2187 #else
2188 			error = EOPNOTSUPP;
2189 #endif
2190 			break;
2191 
2192 		default:
2193 			error = ENOPROTOOPT;
2194 			break;
2195 		}
2196 		if (error == 0 && so->so_proto != NULL &&
2197 		    so->so_proto->pr_ctloutput != NULL) {
2198 			(void) ((*so->so_proto->pr_ctloutput)
2199 				  (so, sopt));
2200 		}
2201 	}
2202 bad:
2203 	return (error);
2204 }
2205 
2206 /*
2207  * Helper routine for getsockopt.
2208  */
2209 int
2210 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2211 {
2212 	int	error;
2213 	size_t	valsize;
2214 
2215 	error = 0;
2216 
2217 	/*
2218 	 * Documented get behavior is that we always return a value, possibly
2219 	 * truncated to fit in the user's buffer.  Traditional behavior is
2220 	 * that we always tell the user precisely how much we copied, rather
2221 	 * than something useful like the total amount we had available for
2222 	 * her.  Note that this interface is not idempotent; the entire
2223 	 * answer must be generated ahead of time.
2224 	 */
2225 	valsize = min(len, sopt->sopt_valsize);
2226 	sopt->sopt_valsize = valsize;
2227 	if (sopt->sopt_val != NULL) {
2228 		if (sopt->sopt_td != NULL)
2229 			error = copyout(buf, sopt->sopt_val, valsize);
2230 		else
2231 			bcopy(buf, sopt->sopt_val, valsize);
2232 	}
2233 	return (error);
2234 }
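/*
 * Typical consumer sketch (hypothetical pr_ctloutput fragment): return an
 * integer option value; if the caller's buffer is smaller than the value,
 * the result is silently truncated, as documented above.
 *
 *	int optval;
 *
 *	optval = 1;		/* hypothetical option state */
 *	error = sooptcopyout(sopt, &optval, sizeof(optval));
 */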
2235 
2236 int
2237 sogetopt(so, sopt)
2238 	struct socket *so;
2239 	struct sockopt *sopt;
2240 {
2241 	int	error, optval;
2242 	struct	linger l;
2243 	struct	timeval tv;
2244 #ifdef MAC
2245 	struct mac extmac;
2246 #endif
2247 
2248 	error = 0;
2249 	if (sopt->sopt_level != SOL_SOCKET) {
2250 		if (so->so_proto && so->so_proto->pr_ctloutput) {
2251 			return ((*so->so_proto->pr_ctloutput)
2252 				  (so, sopt));
2253 		} else
2254 			return (ENOPROTOOPT);
2255 	} else {
2256 		switch (sopt->sopt_name) {
2257 #ifdef INET
2258 		case SO_ACCEPTFILTER:
2259 			error = do_getopt_accept_filter(so, sopt);
2260 			break;
2261 #endif
2262 		case SO_LINGER:
2263 			SOCK_LOCK(so);
2264 			l.l_onoff = so->so_options & SO_LINGER;
2265 			l.l_linger = so->so_linger;
2266 			SOCK_UNLOCK(so);
2267 			error = sooptcopyout(sopt, &l, sizeof l);
2268 			break;
2269 
2270 		case SO_USELOOPBACK:
2271 		case SO_DONTROUTE:
2272 		case SO_DEBUG:
2273 		case SO_KEEPALIVE:
2274 		case SO_REUSEADDR:
2275 		case SO_REUSEPORT:
2276 		case SO_BROADCAST:
2277 		case SO_OOBINLINE:
2278 		case SO_ACCEPTCONN:
2279 		case SO_TIMESTAMP:
2280 		case SO_BINTIME:
2281 		case SO_NOSIGPIPE:
2282 			optval = so->so_options & sopt->sopt_name;
2283 integer:
2284 			error = sooptcopyout(sopt, &optval, sizeof optval);
2285 			break;
2286 
2287 		case SO_TYPE:
2288 			optval = so->so_type;
2289 			goto integer;
2290 
2291 		case SO_ERROR:
2292 			SOCK_LOCK(so);
2293 			optval = so->so_error;
2294 			so->so_error = 0;
2295 			SOCK_UNLOCK(so);
2296 			goto integer;
2297 
2298 		case SO_SNDBUF:
2299 			optval = so->so_snd.sb_hiwat;
2300 			goto integer;
2301 
2302 		case SO_RCVBUF:
2303 			optval = so->so_rcv.sb_hiwat;
2304 			goto integer;
2305 
2306 		case SO_SNDLOWAT:
2307 			optval = so->so_snd.sb_lowat;
2308 			goto integer;
2309 
2310 		case SO_RCVLOWAT:
2311 			optval = so->so_rcv.sb_lowat;
2312 			goto integer;
2313 
2314 		case SO_SNDTIMEO:
2315 		case SO_RCVTIMEO:
2316 			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2317 				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2318 
2319 			tv.tv_sec = optval / hz;
2320 			tv.tv_usec = (optval % hz) * tick;
2321 #ifdef COMPAT_IA32
2322 			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2323 				struct timeval32 tv32;
2324 
2325 				CP(tv, tv32, tv_sec);
2326 				CP(tv, tv32, tv_usec);
2327 				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2328 			} else
2329 #endif
2330 				error = sooptcopyout(sopt, &tv, sizeof tv);
2331 			break;
2332 
2333 		case SO_LABEL:
2334 #ifdef MAC
2335 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2336 			    sizeof(extmac));
2337 			if (error)
2338 				return (error);
2339 			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2340 			    so, &extmac);
2341 			if (error)
2342 				return (error);
2343 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2344 #else
2345 			error = EOPNOTSUPP;
2346 #endif
2347 			break;
2348 
2349 		case SO_PEERLABEL:
2350 #ifdef MAC
2351 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2352 			    sizeof(extmac));
2353 			if (error)
2354 				return (error);
2355 			error = mac_getsockopt_peerlabel(
2356 			    sopt->sopt_td->td_ucred, so, &extmac);
2357 			if (error)
2358 				return (error);
2359 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2360 #else
2361 			error = EOPNOTSUPP;
2362 #endif
2363 			break;
2364 
2365 		case SO_LISTENQLIMIT:
2366 			optval = so->so_qlimit;
2367 			goto integer;
2368 
2369 		case SO_LISTENQLEN:
2370 			optval = so->so_qlen;
2371 			goto integer;
2372 
2373 		case SO_LISTENINCQLEN:
2374 			optval = so->so_incqlen;
2375 			goto integer;
2376 
2377 		default:
2378 			error = ENOPROTOOPT;
2379 			break;
2380 		}
2381 		return (error);
2382 	}
2383 }
2384 
2385 /* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
2386 int
2387 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2388 {
2389 	struct mbuf *m, *m_prev;
2390 	int sopt_size = sopt->sopt_valsize;
2391 
2392 	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2393 	if (m == NULL)
2394 		return ENOBUFS;
2395 	if (sopt_size > MLEN) {
2396 		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2397 		if ((m->m_flags & M_EXT) == 0) {
2398 			m_free(m);
2399 			return ENOBUFS;
2400 		}
2401 		m->m_len = min(MCLBYTES, sopt_size);
2402 	} else {
2403 		m->m_len = min(MLEN, sopt_size);
2404 	}
2405 	sopt_size -= m->m_len;
2406 	*mp = m;
2407 	m_prev = m;
2408 
2409 	while (sopt_size) {
2410 		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2411 		if (m == NULL) {
2412 			m_freem(*mp);
2413 			return ENOBUFS;
2414 		}
2415 		if (sopt_size > MLEN) {
2416 			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2417 			    M_DONTWAIT);
2418 			if ((m->m_flags & M_EXT) == 0) {
2419 				m_freem(m);
2420 				m_freem(*mp);
2421 				return ENOBUFS;
2422 			}
2423 			m->m_len = min(MCLBYTES, sopt_size);
2424 		} else {
2425 			m->m_len = min(MLEN, sopt_size);
2426 		}
2427 		sopt_size -= m->m_len;
2428 		m_prev->m_next = m;
2429 		m_prev = m;
2430 	}
2431 	return (0);
2432 }
2433 
2434 /* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2435 int
2436 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2437 {
2438 	struct mbuf *m0 = m;
2439 
2440 	if (sopt->sopt_val == NULL)
2441 		return (0);
2442 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2443 		if (sopt->sopt_td != NULL) {
2444 			int error;
2445 
2446 			error = copyin(sopt->sopt_val, mtod(m, char *),
2447 				       m->m_len);
2448 			if (error != 0) {
2449 				m_freem(m0);
2450 				return (error);
2451 			}
2452 		} else
2453 			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2454 		sopt->sopt_valsize -= m->m_len;
2455 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2456 		m = m->m_next;
2457 	}
2458 	if (m != NULL) /* enough space should be allocated at ip6_sooptmcopyin() */
2459 		panic("ip6_sooptmcopyin");
2460 	return (0);
2461 }
2462 
2463 /* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2464 int
2465 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2466 {
2467 	struct mbuf *m0 = m;
2468 	size_t valsize = 0;
2469 
2470 	if (sopt->sopt_val == NULL)
2471 		return (0);
2472 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2473 		if (sopt->sopt_td != NULL) {
2474 			int error;
2475 
2476 			error = copyout(mtod(m, char *), sopt->sopt_val,
2477 				       m->m_len);
2478 			if (error != 0) {
2479 				m_freem(m0);
2480 				return(error);
2481 				return (error);
2482 		} else
2483 			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2484 		sopt->sopt_valsize -= m->m_len;
2485 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2486 		valsize += m->m_len;
2487 		m = m->m_next;
2488 	}
2489 	if (m != NULL) {
2490 		/* the caller should have supplied a large enough buffer */
2491 		m_freem(m0);
2492 		return (EINVAL);
2493 	}
2494 	sopt->sopt_valsize = valsize;
2495 	return (0);
2496 }
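/*
 * Usage sketch for the compatibility helpers above (hypothetical
 * legacy-style set-option path): allocate an mbuf chain sized to
 * sopt_valsize, then fill it from the caller's buffer.  Note that
 * soopt_mcopyin() advances sopt_val and consumes sopt_valsize as it
 * copies, and frees the chain on error; soopt_mcopyout() is the analogous
 * inverse used on the get-option path.
 *
 *	struct mbuf *m;
 *
 *	error = soopt_getm(sopt, &m);
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);
 */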
2497 
2498 /*
2499  * sohasoutofband(): protocol notifies socket layer of the arrival of new
2500  * out-of-band data, which will then notify socket consumers.
2501  */
2502 void
2503 sohasoutofband(so)
2504 	struct socket *so;
2505 {
2506 	if (so->so_sigio != NULL)
2507 		pgsigio(&so->so_sigio, SIGURG, 0);
2508 	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2509 }
2510 
2511 int
2512 sopoll(struct socket *so, int events, struct ucred *active_cred,
2513     struct thread *td)
2514 {
2515 
2516 	/* XXXRW: Temporary debugging. */
2517 	KASSERT(so->so_proto->pr_usrreqs->pru_sopoll != sopoll,
2518 	    ("sopoll: protocol calls sopoll"));
2519 
2520 	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2521 	    td));
2522 }
2523 
2524 int
2525 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2526     struct thread *td)
2527 {
2528 	int revents = 0;
2529 
2530 	SOCKBUF_LOCK(&so->so_snd);
2531 	SOCKBUF_LOCK(&so->so_rcv);
2532 	if (events & (POLLIN | POLLRDNORM))
2533 		if (soreadable(so))
2534 			revents |= events & (POLLIN | POLLRDNORM);
2535 
2536 	if (events & POLLINIGNEOF)
2537 		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2538 		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2539 			revents |= POLLINIGNEOF;
2540 
2541 	if (events & (POLLOUT | POLLWRNORM))
2542 		if (sowriteable(so))
2543 			revents |= events & (POLLOUT | POLLWRNORM);
2544 
2545 	if (events & (POLLPRI | POLLRDBAND))
2546 		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2547 			revents |= events & (POLLPRI | POLLRDBAND);
2548 
2549 	if (revents == 0) {
2550 		if (events &
2551 		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2552 		     POLLRDBAND)) {
2553 			selrecord(td, &so->so_rcv.sb_sel);
2554 			so->so_rcv.sb_flags |= SB_SEL;
2555 		}
2556 
2557 		if (events & (POLLOUT | POLLWRNORM)) {
2558 			selrecord(td, &so->so_snd.sb_sel);
2559 			so->so_snd.sb_flags |= SB_SEL;
2560 		}
2561 	}
2562 
2563 	SOCKBUF_UNLOCK(&so->so_rcv);
2564 	SOCKBUF_UNLOCK(&so->so_snd);
2565 	return (revents);
2566 }
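/*
 * Userland counterpart (sketch): a poll(2) on a socket descriptor arrives
 * here via sopoll(); POLLIN maps to soreadable() and POLLPRI to the
 * out-of-band mark checks above.  'sockfd' and 'timeout_ms' are assumed.
 *
 *	struct pollfd pfd;
 *
 *	pfd.fd = sockfd;
 *	pfd.events = POLLIN | POLLPRI;
 *	(void)poll(&pfd, 1, timeout_ms);
 */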
2567 
2568 int
2569 soo_kqfilter(struct file *fp, struct knote *kn)
2570 {
2571 	struct socket *so = kn->kn_fp->f_data;
2572 	struct sockbuf *sb;
2573 
2574 	switch (kn->kn_filter) {
2575 	case EVFILT_READ:
2576 		if (so->so_options & SO_ACCEPTCONN)
2577 			kn->kn_fop = &solisten_filtops;
2578 		else
2579 			kn->kn_fop = &soread_filtops;
2580 		sb = &so->so_rcv;
2581 		break;
2582 	case EVFILT_WRITE:
2583 		kn->kn_fop = &sowrite_filtops;
2584 		sb = &so->so_snd;
2585 		break;
2586 	default:
2587 		return (EINVAL);
2588 	}
2589 
2590 	SOCKBUF_LOCK(sb);
2591 	knlist_add(&sb->sb_sel.si_note, kn, 1);
2592 	sb->sb_flags |= SB_KNOTE;
2593 	SOCKBUF_UNLOCK(sb);
2594 	return (0);
2595 }
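/*
 * Userland counterpart (sketch): registering a read filter on a socket
 * descriptor lands here, selecting solisten_filtops for listening sockets
 * and soread_filtops otherwise.  'kq' and 'sockfd' are assumed.
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, sockfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void)kevent(kq, &kev, 1, NULL, 0, NULL);
 */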
2596 
2597 /*
2598  * Some routines that return EOPNOTSUPP for entry points that are not
2599  * supported by a protocol.  Fill in as needed.
2600  */
2601 int
2602 pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
2603 {
2604 	return EOPNOTSUPP;
2605 }
2606 
2607 int
2608 pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
2609 {
2610 	return EOPNOTSUPP;
2611 }
2612 
2613 int
2614 pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2615 {
2616 	return EOPNOTSUPP;
2617 }
2618 
2619 int
2620 pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2621 {
2622 	return EOPNOTSUPP;
2623 }
2624 
2625 int
2626 pru_connect2_notsupp(struct socket *so1, struct socket *so2)
2627 {
2628 	return EOPNOTSUPP;
2629 }
2630 
2631 int
2632 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
2633 	struct ifnet *ifp, struct thread *td)
2634 {
2635 	return EOPNOTSUPP;
2636 }
2637 
2638 int
2639 pru_disconnect_notsupp(struct socket *so)
2640 {
2641 	return EOPNOTSUPP;
2642 }
2643 
2644 int
2645 pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
2646 {
2647 	return EOPNOTSUPP;
2648 }
2649 
2650 int
2651 pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
2652 {
2653 	return EOPNOTSUPP;
2654 }
2655 
2656 int
2657 pru_rcvd_notsupp(struct socket *so, int flags)
2658 {
2659 	return EOPNOTSUPP;
2660 }
2661 
2662 int
2663 pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
2664 {
2665 	return EOPNOTSUPP;
2666 }
2667 
2668 int
2669 pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
2670 	struct sockaddr *addr, struct mbuf *control, struct thread *td)
2671 {
2672 	return EOPNOTSUPP;
2673 }
2674 
2675 /*
2676  * This isn't really a ``null'' operation, but it's the default one and
2677  * doesn't do anything destructive.
2678  */
2679 int
2680 pru_sense_null(struct socket *so, struct stat *sb)
2681 {
2682 	sb->st_blksize = so->so_snd.sb_hiwat;
2683 	return 0;
2684 }
2685 
2686 int
2687 pru_shutdown_notsupp(struct socket *so)
2688 {
2689 	return EOPNOTSUPP;
2690 }
2691 
2692 int
2693 pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
2694 {
2695 	return EOPNOTSUPP;
2696 }
2697 
2698 int
2699 pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
2700 	struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
2701 {
2702 	return EOPNOTSUPP;
2703 }
2704 
2705 int
2706 pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
2707 	struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
2708 	int *flagsp)
2709 {
2710 	return EOPNOTSUPP;
2711 }
2712 
2713 int
2714 pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
2715 	struct thread *td)
2716 {
2717 	return EOPNOTSUPP;
2718 }
2719 
2720 static void
2721 filt_sordetach(struct knote *kn)
2722 {
2723 	struct socket *so = kn->kn_fp->f_data;
2724 
2725 	SOCKBUF_LOCK(&so->so_rcv);
2726 	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2727 	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2728 		so->so_rcv.sb_flags &= ~SB_KNOTE;
2729 	SOCKBUF_UNLOCK(&so->so_rcv);
2730 }
2731 
2732 /*ARGSUSED*/
2733 static int
2734 filt_soread(struct knote *kn, long hint)
2735 {
2736 	struct socket *so;
2737 
2738 	so = kn->kn_fp->f_data;
2739 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2740 
2741 	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2742 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2743 		kn->kn_flags |= EV_EOF;
2744 		kn->kn_fflags = so->so_error;
2745 		return (1);
2746 	} else if (so->so_error)	/* temporary udp error */
2747 		return (1);
2748 	else if (kn->kn_sfflags & NOTE_LOWAT)
2749 		return (kn->kn_data >= kn->kn_sdata);
2750 	else
2751 		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2752 }
2753 
2754 static void
2755 filt_sowdetach(struct knote *kn)
2756 {
2757 	struct socket *so = kn->kn_fp->f_data;
2758 
2759 	SOCKBUF_LOCK(&so->so_snd);
2760 	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2761 	if (knlist_empty(&so->so_snd.sb_sel.si_note))
2762 		so->so_snd.sb_flags &= ~SB_KNOTE;
2763 	SOCKBUF_UNLOCK(&so->so_snd);
2764 }
2765 
2766 /*ARGSUSED*/
2767 static int
2768 filt_sowrite(struct knote *kn, long hint)
2769 {
2770 	struct socket *so;
2771 
2772 	so = kn->kn_fp->f_data;
2773 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
2774 	kn->kn_data = sbspace(&so->so_snd);
2775 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2776 		kn->kn_flags |= EV_EOF;
2777 		kn->kn_fflags = so->so_error;
2778 		return (1);
2779 	} else if (so->so_error)	/* temporary udp error */
2780 		return (1);
2781 	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2782 	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
2783 		return (0);
2784 	else if (kn->kn_sfflags & NOTE_LOWAT)
2785 		return (kn->kn_data >= kn->kn_sdata);
2786 	else
2787 		return (kn->kn_data >= so->so_snd.sb_lowat);
2788 }
2789 
2790 /*ARGSUSED*/
2791 static int
2792 filt_solisten(struct knote *kn, long hint)
2793 {
2794 	struct socket *so = kn->kn_fp->f_data;
2795 
2796 	kn->kn_data = so->so_qlen;
2797 	return (!TAILQ_EMPTY(&so->so_comp));
2798 }
2799 
2800 int
2801 socheckuid(struct socket *so, uid_t uid)
2802 {
2803 
2804 	if (so == NULL)
2805 		return (EPERM);
2806 	if (so->so_cred->cr_uid != uid)
2807 		return (EPERM);
2808 	return (0);
2809 }
2810 
2811 static int
2812 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
2813 {
2814 	int error;
2815 	int val;
2816 
2817 	val = somaxconn;
2818 	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
2819 	if (error || !req->newptr )
2820 		return (error);
2821 
2822 	if (val < 1 || val > USHRT_MAX)
2823 		return (EINVAL);
2824 
2825 	somaxconn = val;
2826 	return (0);
2827 }
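/*
 * This handler backs the kern.ipc.somaxconn sysctl (declared elsewhere in
 * this file); for example, from userland:
 *
 *	sysctl kern.ipc.somaxconn=1024
 *
 * Values outside [1, USHRT_MAX] are rejected with EINVAL, as above.
 */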
2828 
2829 /*
2830  * These functions are used by protocols to notify the socket layer (and its
2831  * consumers) of state changes in the sockets driven by protocol-side events.
2832  */
2833 
2834 /*
2835  * Procedures to manipulate state flags of socket and do appropriate wakeups.
2836  *
2837  * Normal sequence from the active (originating) side is that
2838  * soisconnecting() is called during processing of connect() call, resulting
2839  * soisconnecting() is called during processing of a connect() call,
2840  * resulting in an eventual call to soisconnected() if/when the connection
2841  * is established.  When the connection is torn down, soisdisconnecting() is
2842  * called during processing of a disconnect() call, and soisdisconnected() is
2843  * of these routines are such that connectionless protocols can call
2844  * soisconnected() and soisdisconnected() only, bypassing the in-progress
2845  * calls when setting up a ``connection'' takes no time.
2846  *
2847  * From the passive side, a socket is created with two queues of sockets:
2848  * so_incomp for connections in progress and so_comp for connections already
2849  * made and awaiting user acceptance.  As a protocol is preparing incoming
2850  * connections, it creates a socket structure queued on so_incomp by calling
2851  * sonewconn().  When the connection is established, soisconnected() is
2852  * called, and transfers the socket structure to so_comp, making it available
2853  * to accept().
2854  *
2855  * If a socket is closed with sockets on either so_incomp or so_comp, these
2856  * sockets are dropped.
2857  *
2858  * If higher-level protocols are implemented in the kernel, the wakeups done
2859  * here will sometimes cause software-interrupt process scheduling.
2860  */
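/*
 * Call-order sketch for a typical connection-oriented protocol (an assumed
 * usage pattern, not a requirement imposed by this file):
 *
 *	soisconnecting(so);	(connect() request issued)
 *	soisconnected(so);	(handshake completed)
 *	soisdisconnecting(so);	(disconnect() initiated)
 *	soisdisconnected(so);	(connection fully torn down)
 *
 * A connectionless protocol may call only soisconnected() and
 * soisdisconnected(), as noted above.
 */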
2861 void
2862 soisconnecting(so)
2863 	register struct socket *so;
2864 {
2865 
2866 	SOCK_LOCK(so);
2867 	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
2868 	so->so_state |= SS_ISCONNECTING;
2869 	SOCK_UNLOCK(so);
2870 }
2871 
2872 void
2873 soisconnected(so)
2874 	struct socket *so;
2875 {
2876 	struct socket *head;
2877 
2878 	ACCEPT_LOCK();
2879 	SOCK_LOCK(so);
2880 	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
2881 	so->so_state |= SS_ISCONNECTED;
2882 	head = so->so_head;
2883 	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
2884 		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
2885 			SOCK_UNLOCK(so);
2886 			TAILQ_REMOVE(&head->so_incomp, so, so_list);
2887 			head->so_incqlen--;
2888 			so->so_qstate &= ~SQ_INCOMP;
2889 			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
2890 			head->so_qlen++;
2891 			so->so_qstate |= SQ_COMP;
2892 			ACCEPT_UNLOCK();
2893 			sorwakeup(head);
2894 			wakeup_one(&head->so_timeo);
2895 		} else {
2896 			ACCEPT_UNLOCK();
2897 			so->so_upcall =
2898 			    head->so_accf->so_accept_filter->accf_callback;
2899 			so->so_upcallarg = head->so_accf->so_accept_filter_arg;
2900 			so->so_rcv.sb_flags |= SB_UPCALL;
2901 			so->so_options &= ~SO_ACCEPTFILTER;
2902 			SOCK_UNLOCK(so);
2903 			so->so_upcall(so, so->so_upcallarg, M_DONTWAIT);
2904 		}
2905 		return;
2906 	}
2907 	SOCK_UNLOCK(so);
2908 	ACCEPT_UNLOCK();
2909 	wakeup(&so->so_timeo);
2910 	sorwakeup(so);
2911 	sowwakeup(so);
2912 }
2913 
2914 void
2915 soisdisconnecting(so)
2916 	register struct socket *so;
2917 {
2918 
2919 	/*
2920 	 * Note: This code assumes that SOCK_LOCK(so) and
2921 	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
2922 	 */
2923 	SOCKBUF_LOCK(&so->so_rcv);
2924 	so->so_state &= ~SS_ISCONNECTING;
2925 	so->so_state |= SS_ISDISCONNECTING;
2926 	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
2927 	sorwakeup_locked(so);
2928 	SOCKBUF_LOCK(&so->so_snd);
2929 	so->so_snd.sb_state |= SBS_CANTSENDMORE;
2930 	sowwakeup_locked(so);
2931 	wakeup(&so->so_timeo);
2932 }
2933 
2934 void
2935 soisdisconnected(so)
2936 	register struct socket *so;
2937 {
2938 
2939 	/*
2940 	 * Note: This code assumes that SOCK_LOCK(so) and
2941 	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
2942 	 */
2943 	SOCKBUF_LOCK(&so->so_rcv);
2944 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
2945 	so->so_state |= SS_ISDISCONNECTED;
2946 	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
2947 	sorwakeup_locked(so);
2948 	SOCKBUF_LOCK(&so->so_snd);
2949 	so->so_snd.sb_state |= SBS_CANTSENDMORE;
2950 	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
2951 	sowwakeup_locked(so);
2952 	wakeup(&so->so_timeo);
2953 }
2954 
2955 /*
2956  * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
2957  */
2958 struct sockaddr *
2959 sodupsockaddr(const struct sockaddr *sa, int mflags)
2960 {
2961 	struct sockaddr *sa2;
2962 
2963 	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
2964 	if (sa2)
2965 		bcopy(sa, sa2, sa->sa_len);
2966 	return sa2;
2967 }
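/*
 * Usage sketch (hypothetical pru_peeraddr() fragment): hand a malloc'd
 * copy of a protocol-private address ('sin' here is assumed) back to the
 * socket layer.  M_NOWAIT and M_WAITOK are malloc(9) flags.
 *
 *	*nam = sodupsockaddr((struct sockaddr *)&sin, M_NOWAIT);
 *	if (*nam == NULL)
 *		return (ENOMEM);
 */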
2968 
2969 /*
2970  * Create an external-format (``xsocket'') structure using the information in
2971  * the kernel-format socket structure pointed to by so.  This is done to
2972  * reduce the spew of irrelevant information over this interface, to isolate
2973  * user code from changes in the kernel structure, and potentially to provide
2974  * information-hiding if we decide that some of this information should be
2975  * hidden from users.
2976  */
2977 void
2978 sotoxsocket(struct socket *so, struct xsocket *xso)
2979 {
2980 	xso->xso_len = sizeof *xso;
2981 	xso->xso_so = so;
2982 	xso->so_type = so->so_type;
2983 	xso->so_options = so->so_options;
2984 	xso->so_linger = so->so_linger;
2985 	xso->so_state = so->so_state;
2986 	xso->so_pcb = so->so_pcb;
2987 	xso->xso_protocol = so->so_proto->pr_protocol;
2988 	xso->xso_family = so->so_proto->pr_domain->dom_family;
2989 	xso->so_qlen = so->so_qlen;
2990 	xso->so_incqlen = so->so_incqlen;
2991 	xso->so_qlimit = so->so_qlimit;
2992 	xso->so_timeo = so->so_timeo;
2993 	xso->so_error = so->so_error;
2994 	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
2995 	xso->so_oobmark = so->so_oobmark;
2996 	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
2997 	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
2998 	xso->so_uid = so->so_cred->cr_uid;
2999 }
3000