xref: /freebsd/sys/kern/uipc_socket.c (revision d056fa046c6a91b90cd98165face0e42a33a5173)
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 2004 The FreeBSD Foundation
5  * Copyright (c) 2004-2006 Robert N. M. Watson
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 4. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
32  */
33 
34 /*
35  * Comments on the socket life cycle:
36  *
37  * soalloc() sets up socket layer state for a socket, called only by
38  * socreate() and sonewconn().  Socket layer private.
39  *
40  * sodealloc() tears down socket layer state for a socket; called from
41  * sofree(), and on failure from socreate()/sonewconn().  Socket layer private.
42  *
43  * pru_attach() associates protocol layer state with an allocated socket;
44  * called only once, may fail, aborting socket allocation.  This is called
45  * from socreate() and sonewconn().  Socket layer private.
46  *
47  * pru_detach() disassociates protocol layer state from an attached socket,
48  * and will be called exactly once for sockets in which pru_attach() has
49  * been successfully called.  If pru_attach() returned an error,
50  * pru_detach() will not be called.  Socket layer private.
51  *
52  * socreate() creates a socket and attaches protocol state.  This is a public
53  * interface that may be used by socket layer consumers to create new
54  * sockets.
55  *
56  * sonewconn() creates a socket and attaches protocol state.  This is a
57  * public interface that may be used by protocols to create new sockets when
58  * a new connection is received and will be available for accept() on a
59  * listen socket.
60  *
61  * soclose() destroys a socket after possibly waiting for it to disconnect.
62  * This is a public interface that socket consumers should use to close and
63  * release a socket when done with it.
64  *
65  * soabort() destroys a socket without waiting for it to disconnect (used
66  * only for incoming connections that are already partially or fully
67  * connected).  This is used internally by the socket layer when clearing
68  * listen socket queues (due to overflow or close on the listen socket), but
69  * is also a public interface protocols may use to abort connections in
70  * their incomplete listen queues should they no longer be required.  Sockets
71  * placed in completed connection listen queues should not be aborted.
72  *
73  * sofree() will free a socket and its protocol state if all references on
74  * the socket have been released, and is the interface through which the
75  * socket layer attempts to free a socket when a reference is removed.  It
76  * is a socket layer private interface.
77  *
78  * NOTE: In addition to socreate() and soclose(), which provide a single
79  * socket reference to the consumer to be managed as required, there are two
80  * calls to explicitly manage socket references, soref(), and sorele().
81  * Currently, these are generally required only when transitioning a socket
82  * from a listen queue to a file descriptor, in order to prevent garbage
83  * collection of the socket at an untimely moment.  For a number of reasons,
84  * these interfaces are not preferred, and should be avoided.
85  *
86  * XXXRW: The behavior of sockets after soclose() but before the last
87  * sorele() is poorly defined.  We can probably eliminate this window with
88  * a little work, since consumers are managing references anyway.
89  */
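
#if 0
/*
 * Illustrative sketch (not compiled): the soref()/sorele() pattern
 * described in the comment above, as a consumer might use it to hold a
 * socket across a window where it could otherwise be freed.  The
 * surrounding consumer code is hypothetical.
 */
	SOCK_LOCK(so);
	soref(so);			/* Take a strong reference. */
	SOCK_UNLOCK(so);
	/* ... the socket cannot be garbage collected here ... */
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	sorele(so);			/* Drops both locks; may free so. */
#endif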
90 
91 #include <sys/cdefs.h>
92 __FBSDID("$FreeBSD$");
93 
94 #include "opt_inet.h"
95 #include "opt_mac.h"
96 #include "opt_zero.h"
97 #include "opt_compat.h"
98 
99 #include <sys/param.h>
100 #include <sys/systm.h>
101 #include <sys/fcntl.h>
102 #include <sys/limits.h>
103 #include <sys/lock.h>
104 #include <sys/mac.h>
105 #include <sys/malloc.h>
106 #include <sys/mbuf.h>
107 #include <sys/mutex.h>
108 #include <sys/domain.h>
109 #include <sys/file.h>			/* for struct knote */
110 #include <sys/kernel.h>
111 #include <sys/event.h>
112 #include <sys/eventhandler.h>
113 #include <sys/poll.h>
114 #include <sys/proc.h>
115 #include <sys/protosw.h>
116 #include <sys/socket.h>
117 #include <sys/socketvar.h>
118 #include <sys/resourcevar.h>
119 #include <sys/signalvar.h>
120 #include <sys/sysctl.h>
121 #include <sys/uio.h>
122 #include <sys/jail.h>
123 
124 #include <vm/uma.h>
125 
126 #ifdef COMPAT_IA32
127 #include <sys/mount.h>
128 #include <compat/freebsd32/freebsd32.h>
129 
130 extern struct sysentvec ia32_freebsd_sysvec;
131 #endif
132 
133 static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
134 		    int flags);
135 
136 static void	filt_sordetach(struct knote *kn);
137 static int	filt_soread(struct knote *kn, long hint);
138 static void	filt_sowdetach(struct knote *kn);
139 static int	filt_sowrite(struct knote *kn, long hint);
140 static int	filt_solisten(struct knote *kn, long hint);
141 
142 static struct filterops solisten_filtops =
143 	{ 1, NULL, filt_sordetach, filt_solisten };
144 static struct filterops soread_filtops =
145 	{ 1, NULL, filt_sordetach, filt_soread };
146 static struct filterops sowrite_filtops =
147 	{ 1, NULL, filt_sowdetach, filt_sowrite };
148 
149 uma_zone_t socket_zone;
150 so_gen_t	so_gencnt;	/* generation count for sockets */
151 
152 int	maxsockets;
153 
154 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
155 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
156 
157 static int somaxconn = SOMAXCONN;
158 static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS);
159 /* XXX: we don't have SYSCTL_USHORT */
160 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
161     0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection "
162     "queue size");
163 static int numopensockets;
164 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
165     &numopensockets, 0, "Number of open sockets");
166 #ifdef ZERO_COPY_SOCKETS
167 /* These aren't static because they're used in other files. */
168 int so_zero_copy_send = 1;
169 int so_zero_copy_receive = 1;
170 SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
171     "Zero copy controls");
172 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
173     &so_zero_copy_receive, 0, "Enable zero copy receive");
174 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
175     &so_zero_copy_send, 0, "Enable zero copy send");
176 #endif /* ZERO_COPY_SOCKETS */
177 
178 /*
179  * accept_mtx locks down per-socket fields relating to accept queues.  See
180  * socketvar.h for an annotation of the protected fields of struct socket.
181  */
182 struct mtx accept_mtx;
183 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
184 
185 /*
186  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
187  * so_gencnt field.
188  */
189 static struct mtx so_global_mtx;
190 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_global", MTX_DEF);
191 
192 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
193 
194 static int
195 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
196 {
197 	int error, newmaxsockets;
198 
199 	newmaxsockets = maxsockets;
200 	error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req);
201 	if (error == 0 && req->newptr) {
202 		if (newmaxsockets > maxsockets) {
203 			maxsockets = newmaxsockets;
204 			if (maxsockets > ((maxfiles / 4) * 3)) {
205 				maxfiles = (maxsockets * 5) / 4;
206 				maxfilesperproc = (maxfiles * 9) / 10;
207 			}
208 			EVENTHANDLER_INVOKE(maxsockets_change);
209 		} else
210 			error = EINVAL;
211 	}
212 	return (error);
213 }
214 
215 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
216     &maxsockets, 0, sysctl_maxsockets, "IU",
217     "Maximum number of sockets avaliable");
218 
219 /*
220  * Initialise maxsockets
221  */
222 static void init_maxsockets(void *ignored)
223 {
224 	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
225 	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
226 }
227 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
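
#if 0
/*
 * Illustrative sketch (not compiled): how a subsystem whose resources are
 * sized from maxsockets might subscribe to the maxsockets_change event
 * invoked by sysctl_maxsockets() above.  The handler, its action, and the
 * SYSINIT ordering are hypothetical.
 */
static void
example_maxsockets_change(void *arg)
{

	uma_zone_set_max(socket_zone, maxsockets);
}

static void
example_maxsockets_sub(void *arg)
{

	EVENTHANDLER_REGISTER(maxsockets_change, example_maxsockets_change,
	    NULL, EVENTHANDLER_PRI_ANY);
}
SYSINIT(example_maxsockets_sub, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY,
    example_maxsockets_sub, NULL);
#endif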
228 
229 /*
230  * Socket operation routines.
231  * These routines are called by the routines in
232  * sys_socket.c or from a system process, and
233  * implement the semantics of socket operations by
234  * switching out to the protocol specific routines.
235  */
236 
237 /*
238  * Get a socket structure from our zone, and initialize it.
239  * Note that it would probably be better to allocate socket
240  * and PCB at the same time, but I'm not convinced that all
241  * the protocols can be easily modified to do this.
242  *
243  * soalloc() returns a socket with a ref count of 0.
244  */
245 static struct socket *
246 soalloc(int mflags)
247 {
248 	struct socket *so;
249 
250 	so = uma_zalloc(socket_zone, mflags | M_ZERO);
251 	if (so == NULL)
252 		return (NULL);
253 #ifdef MAC
254 	if (mac_init_socket(so, mflags) != 0) {
255 		uma_zfree(socket_zone, so);
256 		return (NULL);
257 	}
258 #endif
259 	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
260 	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
261 	TAILQ_INIT(&so->so_aiojobq);
262 	mtx_lock(&so_global_mtx);
263 	so->so_gencnt = ++so_gencnt;
264 	++numopensockets;
265 	mtx_unlock(&so_global_mtx);
266 	return (so);
267 }
268 
269 static void
270 sodealloc(struct socket *so)
271 {
272 
273 	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
274 	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
275 
276 	mtx_lock(&so_global_mtx);
277 	so->so_gencnt = ++so_gencnt;
278 	mtx_unlock(&so_global_mtx);
279 	if (so->so_rcv.sb_hiwat)
280 		(void)chgsbsize(so->so_cred->cr_uidinfo,
281 		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
282 	if (so->so_snd.sb_hiwat)
283 		(void)chgsbsize(so->so_cred->cr_uidinfo,
284 		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
285 #ifdef INET
286 	/* remove accept filter if one is present. */
287 	if (so->so_accf != NULL)
288 		do_setopt_accept_filter(so, NULL);
289 #endif
290 #ifdef MAC
291 	mac_destroy_socket(so);
292 #endif
293 	crfree(so->so_cred);
294 	SOCKBUF_LOCK_DESTROY(&so->so_snd);
295 	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
296 	uma_zfree(socket_zone, so);
297 	mtx_lock(&so_global_mtx);
298 	--numopensockets;
299 	mtx_unlock(&so_global_mtx);
300 }
301 
302 /*
303  * socreate returns a socket with a ref count of 1.  The socket should be
304  * closed with soclose().
305  */
306 int
307 socreate(dom, aso, type, proto, cred, td)
308 	int dom;
309 	struct socket **aso;
310 	int type;
311 	int proto;
312 	struct ucred *cred;
313 	struct thread *td;
314 {
315 	struct protosw *prp;
316 	struct socket *so;
317 	int error;
318 
319 	if (proto)
320 		prp = pffindproto(dom, proto, type);
321 	else
322 		prp = pffindtype(dom, type);
323 
324 	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
325 	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
326 		return (EPROTONOSUPPORT);
327 
328 	if (jailed(cred) && jail_socket_unixiproute_only &&
329 	    prp->pr_domain->dom_family != PF_LOCAL &&
330 	    prp->pr_domain->dom_family != PF_INET &&
331 	    prp->pr_domain->dom_family != PF_ROUTE) {
332 		return (EPROTONOSUPPORT);
333 	}
334 
335 	if (prp->pr_type != type)
336 		return (EPROTOTYPE);
337 	so = soalloc(M_WAITOK);
338 	if (so == NULL)
339 		return (ENOBUFS);
340 
341 	TAILQ_INIT(&so->so_incomp);
342 	TAILQ_INIT(&so->so_comp);
343 	so->so_type = type;
344 	so->so_cred = crhold(cred);
345 	so->so_proto = prp;
346 #ifdef MAC
347 	mac_create_socket(cred, so);
348 #endif
349 	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
350 	    NULL, NULL, NULL);
351 	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
352 	    NULL, NULL, NULL);
353 	so->so_count = 1;
354 	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
355 	if (error) {
356 		sodealloc(so);
357 		return (error);
358 	}
359 	*aso = so;
360 	return (0);
361 }
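
#if 0
/*
 * Illustrative sketch (not compiled): a kernel consumer creating and
 * closing a socket via the public interfaces above.  The wrapper is
 * hypothetical, and IPPROTO_TCP assumes <netinet/in.h>; a real caller
 * would use the socket between the two calls.
 */
static int
example_socket_lifecycle(struct thread *td)
{
	struct socket *so;
	int error;

	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
	    td->td_ucred, td);
	if (error != 0)
		return (error);
	/* ... sobind()/soconnect()/sosend()/soreceive() as needed ... */
	return (soclose(so));
}
#endif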
362 
363 #ifdef REGRESSION
364 static int regression_sonewconn_earlytest = 1;
365 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
366     &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
367 #endif
368 
369 /*
370  * When an attempt at a new connection is noted on a socket
371  * which accepts connections, sonewconn is called.  If the
372  * connection is possible (subject to space constraints, etc.)
373  * then we allocate a new structure, properly linked into the
374  * data structure of the original socket, and return this.
375  * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
376  *
377  * note: the ref count on the socket is 0 on return
378  */
379 struct socket *
380 sonewconn(head, connstatus)
381 	register struct socket *head;
382 	int connstatus;
383 {
384 	register struct socket *so;
385 	int over;
386 
387 	ACCEPT_LOCK();
388 	over = (head->so_qlen > 3 * head->so_qlimit / 2);
389 	ACCEPT_UNLOCK();
390 #ifdef REGRESSION
391 	if (regression_sonewconn_earlytest && over)
392 #else
393 	if (over)
394 #endif
395 		return (NULL);
396 	so = soalloc(M_NOWAIT);
397 	if (so == NULL)
398 		return (NULL);
399 	if ((head->so_options & SO_ACCEPTFILTER) != 0)
400 		connstatus = 0;
401 	so->so_head = head;
402 	so->so_type = head->so_type;
403 	so->so_options = head->so_options &~ SO_ACCEPTCONN;
404 	so->so_linger = head->so_linger;
405 	so->so_state = head->so_state | SS_NOFDREF;
406 	so->so_proto = head->so_proto;
407 	so->so_timeo = head->so_timeo;
408 	so->so_cred = crhold(head->so_cred);
409 #ifdef MAC
410 	SOCK_LOCK(head);
411 	mac_create_socket_from_socket(head, so);
412 	SOCK_UNLOCK(head);
413 #endif
414 	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
415 	    NULL, NULL, NULL);
416 	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
417 	    NULL, NULL, NULL);
418 	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
419 	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
420 		sodealloc(so);
421 		return (NULL);
422 	}
423 	so->so_state |= connstatus;
424 	ACCEPT_LOCK();
425 	if (connstatus) {
426 		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
427 		so->so_qstate |= SQ_COMP;
428 		head->so_qlen++;
429 	} else {
430 		/*
431 		 * Keep removing sockets from the head until there's room for
432 		 * us to insert on the tail.  In pre-locking revisions, this
433 		 * was a simple if(), but as we could be racing with other
434 		 * threads and soabort() requires dropping locks, we must
435 		 * loop waiting for the condition to be true.
436 		 */
437 		while (head->so_incqlen > head->so_qlimit) {
438 			struct socket *sp;
439 			sp = TAILQ_FIRST(&head->so_incomp);
440 			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
441 			head->so_incqlen--;
442 			sp->so_qstate &= ~SQ_INCOMP;
443 			sp->so_head = NULL;
444 			ACCEPT_UNLOCK();
445 			soabort(sp);
446 			ACCEPT_LOCK();
447 		}
448 		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
449 		so->so_qstate |= SQ_INCOMP;
450 		head->so_incqlen++;
451 	}
452 	ACCEPT_UNLOCK();
453 	if (connstatus) {
454 		sorwakeup(head);
455 		wakeup_one(&head->so_timeo);
456 	}
457 	return (so);
458 }
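
#if 0
/*
 * Illustrative sketch (not compiled): how a protocol might use sonewconn()
 * when a connection request arrives on a listening socket.  A connstatus
 * of 0 places the new socket on the incomplete queue; the protocol calls
 * soisconnected() later to move it to the complete queue.  The surrounding
 * protocol code and the drop label are hypothetical.
 */
	so = sonewconn(head, 0);
	if (so == NULL)
		goto drop;		/* Listen queue full; drop the request. */
	/* ... attach per-connection protocol state to so ... */
	/* ... and once the handshake completes: */
	soisconnected(so);
#endif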
459 
460 int
461 sobind(so, nam, td)
462 	struct socket *so;
463 	struct sockaddr *nam;
464 	struct thread *td;
465 {
466 
467 	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
468 }
469 
470 /*
471  * solisten() transitions a socket from a non-listening state to a listening
472  * state, but can also be used to update the listen queue depth on an
473  * existing listen socket.  The protocol will call back into the sockets
474  * layer using solisten_proto_check() and solisten_proto() to check and set
475  * socket-layer listen state.  Call backs are used so that the protocol can
476  * acquire both protocol and socket layer locks in whatever order is required
477  * by the protocol.
478  *
479  * Protocol implementors are advised to hold the socket lock across the
480  * socket-layer test and set to avoid races at the socket layer.
481  */
482 int
483 solisten(so, backlog, td)
484 	struct socket *so;
485 	int backlog;
486 	struct thread *td;
487 {
488 
489 	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
490 }
491 
492 int
493 solisten_proto_check(so)
494 	struct socket *so;
495 {
496 
497 	SOCK_LOCK_ASSERT(so);
498 
499 	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
500 	    SS_ISDISCONNECTING))
501 		return (EINVAL);
502 	return (0);
503 }
504 
505 void
506 solisten_proto(so, backlog)
507 	struct socket *so;
508 	int backlog;
509 {
510 
511 	SOCK_LOCK_ASSERT(so);
512 
513 	if (backlog < 0 || backlog > somaxconn)
514 		backlog = somaxconn;
515 	so->so_qlimit = backlog;
516 	so->so_options |= SO_ACCEPTCONN;
517 }
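
#if 0
/*
 * Illustrative sketch (not compiled): the check/set dance described above,
 * roughly as a pru_listen implementation might perform it.  The protocol
 * lock macros and the function name are hypothetical; the point is that
 * the socket lock is held across both the test and the set.
 */
static int
example_pru_listen(struct socket *so, int backlog, struct thread *td)
{
	int error;

	EXAMPLE_PROTO_LOCK();			/* Hypothetical protocol lock. */
	SOCK_LOCK(so);
	error = solisten_proto_check(so);
	if (error == 0)
		solisten_proto(so, backlog);
	SOCK_UNLOCK(so);
	EXAMPLE_PROTO_UNLOCK();
	return (error);
}
#endif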
518 
519 /*
520  * Attempt to free a socket.  This should really be sotryfree().
521  *
522  * sofree() will succeed if:
523  *
524  * - There are no outstanding file descriptor references or related consumers
525  *   (so_count == 0).
526  *
527  * - The socket has been closed by user space, if ever open (SS_NOFDREF).
528  *
529  * - The protocol does not have an outstanding strong reference on the socket
530  *   (SS_PROTOREF).
531  *
532  * - The socket is not in a completed connection queue, where a process may
533  *   already have been notified that it is present.  Removing it there could
534  *   leave a process blocked in accept() despite select() saying it was ready.
535  *
536  * Otherwise, it will quietly abort so that a future call to sofree(), when
537  * conditions are right, can succeed.
538  */
539 void
540 sofree(so)
541 	struct socket *so;
542 {
543 	struct socket *head;
544 
545 	ACCEPT_LOCK_ASSERT();
546 	SOCK_LOCK_ASSERT(so);
547 
548 	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
549 	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
550 		SOCK_UNLOCK(so);
551 		ACCEPT_UNLOCK();
552 		return;
553 	}
554 
555 	head = so->so_head;
556 	if (head != NULL) {
557 		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
558 		    (so->so_qstate & SQ_INCOMP) != 0,
559 		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
560 		    "SQ_INCOMP"));
561 		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
562 		    (so->so_qstate & SQ_INCOMP) == 0,
563 		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
564 		TAILQ_REMOVE(&head->so_incomp, so, so_list);
565 		head->so_incqlen--;
566 		so->so_qstate &= ~SQ_INCOMP;
567 		so->so_head = NULL;
568 	}
569 	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
570 	    (so->so_qstate & SQ_INCOMP) == 0,
571 	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
572 	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
573 	SOCK_UNLOCK(so);
574 	ACCEPT_UNLOCK();
575 
576 	SOCKBUF_LOCK(&so->so_snd);
577 	so->so_snd.sb_flags |= SB_NOINTR;
578 	(void)sblock(&so->so_snd, M_WAITOK);
579 	/*
580 	 * socantsendmore_locked() drops the socket buffer mutex so that it
581 	 * can safely perform wakeups.  Re-acquire the mutex before
582 	 * continuing.
583 	 */
584 	socantsendmore_locked(so);
585 	SOCKBUF_LOCK(&so->so_snd);
586 	sbunlock(&so->so_snd);
587 	sbrelease_locked(&so->so_snd, so);
588 	SOCKBUF_UNLOCK(&so->so_snd);
589 	sorflush(so);
590 	knlist_destroy(&so->so_rcv.sb_sel.si_note);
591 	knlist_destroy(&so->so_snd.sb_sel.si_note);
592 	sodealloc(so);
593 }
594 
595 /*
596  * Close a socket on last file table reference removal.
597  * Initiate disconnect if connected.
598  * Free socket when disconnect complete.
599  *
600  * This function will sorele() the socket.  Note that soclose() may be
601  * called prior to the ref count reaching zero.  The actual socket
602  * structure will not be freed until the ref count reaches zero.
603  */
604 int
605 soclose(so)
606 	struct socket *so;
607 {
608 	int error = 0;
609 
610 	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
611 
612 	funsetown(&so->so_sigio);
613 	if (so->so_options & SO_ACCEPTCONN) {
614 		struct socket *sp;
615 		ACCEPT_LOCK();
616 		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
617 			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
618 			so->so_incqlen--;
619 			sp->so_qstate &= ~SQ_INCOMP;
620 			sp->so_head = NULL;
621 			ACCEPT_UNLOCK();
622 			soabort(sp);
623 			ACCEPT_LOCK();
624 		}
625 		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
626 			TAILQ_REMOVE(&so->so_comp, sp, so_list);
627 			so->so_qlen--;
628 			sp->so_qstate &= ~SQ_COMP;
629 			sp->so_head = NULL;
630 			ACCEPT_UNLOCK();
631 			soabort(sp);
632 			ACCEPT_LOCK();
633 		}
634 		ACCEPT_UNLOCK();
635 	}
636 	if (so->so_state & SS_ISCONNECTED) {
637 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
638 			error = sodisconnect(so);
639 			if (error)
640 				goto drop;
641 		}
642 		if (so->so_options & SO_LINGER) {
643 			if ((so->so_state & SS_ISDISCONNECTING) &&
644 			    (so->so_state & SS_NBIO))
645 				goto drop;
646 			while (so->so_state & SS_ISCONNECTED) {
647 				error = tsleep(&so->so_timeo,
648 				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
649 				if (error)
650 					break;
651 			}
652 		}
653 	}
654 
655 drop:
656 	if (*so->so_proto->pr_usrreqs->pru_detach != NULL)
657 		(*so->so_proto->pr_usrreqs->pru_detach)(so);
658 	ACCEPT_LOCK();
659 	SOCK_LOCK(so);
660 	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
661 	so->so_state |= SS_NOFDREF;
662 	sorele(so);
663 	return (error);
664 }
665 
666 /*
667  * soabort() allows the socket code or protocol code to detach a socket that
668  * has been in an incomplete or completed listen queue, but has not yet been
669  * accepted.
670  *
671  * This interface is tricky, because it is called on an unreferenced socket,
672  * and must be called only by a thread that has actually removed the socket
673  * from the listen queue it was on, or races with other threads are risked.
674  *
675  * This interface will call into the protocol code, so must not be called
676  * with any socket locks held.  Protocols do call it while holding their own
677  * recursible protocol mutexes, but this is something that should be subject
678  * to review in the future.
679  *
680  * XXXRW: Why do we maintain a distinction between pru_abort() and
681  * pru_detach()?
682  */
683 void
684 soabort(so)
685 	struct socket *so;
686 {
687 
688 	/*
689 	 * In as much as is possible, assert that no references to this
690 	 * socket are held.  This is not quite the same as asserting that the
691 	 * current thread is responsible for arranging for no references, but
692 	 * is as close as we can get for now.
693 	 */
694 	KASSERT(so->so_count == 0, ("soabort: so_count"));
695 	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
696 	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
697 	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
698 	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
699 
700 	if (*so->so_proto->pr_usrreqs->pru_abort != NULL)
701 		(*so->so_proto->pr_usrreqs->pru_abort)(so);
702 	ACCEPT_LOCK();
703 	SOCK_LOCK(so);
704 	sofree(so);
705 }
706 
707 int
708 soaccept(so, nam)
709 	struct socket *so;
710 	struct sockaddr **nam;
711 {
712 	int error;
713 
714 	SOCK_LOCK(so);
715 	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
716 	so->so_state &= ~SS_NOFDREF;
717 	SOCK_UNLOCK(so);
718 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
719 	return (error);
720 }
721 
722 int
723 soconnect(so, nam, td)
724 	struct socket *so;
725 	struct sockaddr *nam;
726 	struct thread *td;
727 {
728 	int error;
729 
730 	if (so->so_options & SO_ACCEPTCONN)
731 		return (EOPNOTSUPP);
732 	/*
733 	 * If protocol is connection-based, can only connect once.
734 	 * Otherwise, if connected, try to disconnect first.
735 	 * This allows user to disconnect by connecting to, e.g.,
736 	 * a null address.
737 	 */
738 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
739 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
740 	    (error = sodisconnect(so)))) {
741 		error = EISCONN;
742 	} else {
743 		/*
744 		 * Prevent accumulated error from previous connection
745 		 * from biting us.
746 		 */
747 		so->so_error = 0;
748 		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
749 	}
750 
751 	return (error);
752 }
753 
754 int
755 soconnect2(so1, so2)
756 	struct socket *so1;
757 	struct socket *so2;
758 {
759 
760 	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
761 }
762 
763 int
764 sodisconnect(so)
765 	struct socket *so;
766 {
767 	int error;
768 
769 	if ((so->so_state & SS_ISCONNECTED) == 0)
770 		return (ENOTCONN);
771 	if (so->so_state & SS_ISDISCONNECTING)
772 		return (EALREADY);
773 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
774 	return (error);
775 }
776 
777 #ifdef ZERO_COPY_SOCKETS
778 struct so_zerocopy_stats {
779 	int size_ok;
780 	int align_ok;
781 	int found_ifp;
782 };
783 struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
784 #include <netinet/in.h>
785 #include <net/route.h>
786 #include <netinet/in_pcb.h>
787 #include <vm/vm.h>
788 #include <vm/vm_page.h>
789 #include <vm/vm_object.h>
790 #endif /*ZERO_COPY_SOCKETS*/
791 
792 /*
793  * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
794  * all of the data referenced by the uio.  If desired, it uses zero-copy.
795  * *space will be updated to reflect data copied in.
796  *
797  * NB: If atomic I/O is requested, the caller must already have checked that
798  * space can hold resid bytes.
799  *
800  * NB: In the event of an error, the caller may need to free the partial
801  * chain pointed to by *mpp.  The contents of both *uio and *space may be
802  * modified even in the case of an error.
803  */
804 static int
805 sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
806     int flags)
807 {
808 	struct mbuf *m, **mp, *top;
809 	long len, resid;
810 	int error;
811 #ifdef ZERO_COPY_SOCKETS
812 	int cow_send;
813 #endif
814 
815 	*retmp = top = NULL;
816 	mp = &top;
817 	len = 0;
818 	resid = uio->uio_resid;
819 	error = 0;
820 	do {
821 #ifdef ZERO_COPY_SOCKETS
822 		cow_send = 0;
823 #endif /* ZERO_COPY_SOCKETS */
824 		if (resid >= MINCLSIZE) {
825 #ifdef ZERO_COPY_SOCKETS
826 			if (top == NULL) {
827 				MGETHDR(m, M_TRYWAIT, MT_DATA);
828 				if (m == NULL) {
829 					error = ENOBUFS;
830 					goto out;
831 				}
832 				m->m_pkthdr.len = 0;
833 				m->m_pkthdr.rcvif = NULL;
834 			} else {
835 				MGET(m, M_TRYWAIT, MT_DATA);
836 				if (m == NULL) {
837 					error = ENOBUFS;
838 					goto out;
839 				}
840 			}
841 			if (so_zero_copy_send &&
842 		    resid >= PAGE_SIZE &&
843 		    *space >= PAGE_SIZE &&
844 		    uio->uio_iov->iov_len >= PAGE_SIZE) {
845 				so_zerocp_stats.size_ok++;
846 				so_zerocp_stats.align_ok++;
847 				cow_send = socow_setup(m, uio);
848 				len = cow_send;
849 			}
850 			if (!cow_send) {
851 				MCLGET(m, M_TRYWAIT);
852 				if ((m->m_flags & M_EXT) == 0) {
853 					m_free(m);
854 					m = NULL;
855 				} else {
856 					len = min(min(MCLBYTES, resid),
857 					    *space);
858 				}
859 			}
860 #else /* ZERO_COPY_SOCKETS */
861 			if (top == NULL) {
862 				m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
863 				m->m_pkthdr.len = 0;
864 				m->m_pkthdr.rcvif = NULL;
865 			} else
866 				m = m_getcl(M_TRYWAIT, MT_DATA, 0);
867 			len = min(min(MCLBYTES, resid), *space);
868 #endif /* ZERO_COPY_SOCKETS */
869 		} else {
870 			if (top == NULL) {
871 				m = m_gethdr(M_TRYWAIT, MT_DATA);
872 				m->m_pkthdr.len = 0;
873 				m->m_pkthdr.rcvif = NULL;
874 
875 				len = min(min(MHLEN, resid), *space);
876 				/*
877 				 * For datagram protocols, leave room
878 				 * for protocol headers in first mbuf.
879 				 */
880 				if (atomic && m && len < MHLEN)
881 					MH_ALIGN(m, len);
882 			} else {
883 				m = m_get(M_TRYWAIT, MT_DATA);
884 				len = min(min(MLEN, resid), *space);
885 			}
886 		}
887 		if (m == NULL) {
888 			error = ENOBUFS;
889 			goto out;
890 		}
891 
892 		*space -= len;
893 #ifdef ZERO_COPY_SOCKETS
894 		if (cow_send)
895 			error = 0;
896 		else
897 #endif /* ZERO_COPY_SOCKETS */
898 		error = uiomove(mtod(m, void *), (int)len, uio);
899 		resid = uio->uio_resid;
900 		m->m_len = len;
901 		*mp = m;
902 		top->m_pkthdr.len += len;
903 		if (error)
904 			goto out;
905 		mp = &m->m_next;
906 		if (resid <= 0) {
907 			if (flags & MSG_EOR)
908 				top->m_flags |= M_EOR;
909 			break;
910 		}
911 	} while (*space > 0 && atomic);
912 out:
913 	*retmp = top;
914 	return (error);
915 }
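
#if 0
/*
 * Illustrative sketch (not compiled): the caller contract noted above.  On
 * error, sosend_copyin() may hand back a partial chain via *retmp that the
 * caller must free; this fragment is hypothetical, and sosend_dgram() and
 * sosend() below show the real usage.
 */
	error = sosend_copyin(uio, &top, atomic, &space, flags);
	if (error != 0 && top != NULL) {
		m_freem(top);
		top = NULL;
	}
#endif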
916 
917 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
918 
919 int
920 sosend_dgram(so, addr, uio, top, control, flags, td)
921 	struct socket *so;
922 	struct sockaddr *addr;
923 	struct uio *uio;
924 	struct mbuf *top;
925 	struct mbuf *control;
926 	int flags;
927 	struct thread *td;
928 {
929 	long space, resid;
930 	int clen = 0, error, dontroute;
931 	int atomic = sosendallatonce(so) || top;
932 
933 	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
934 	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
935 	    ("sosend_dgram: !PR_ATOMIC"));
936 
937 	if (uio != NULL)
938 		resid = uio->uio_resid;
939 	else
940 		resid = top->m_pkthdr.len;
941 	/*
942 	 * In theory resid should be unsigned.
943 	 * However, space must be signed, as it might be less than 0
944 	 * if we over-committed, and we must use a signed comparison
945 	 * of space and resid.  On the other hand, a negative resid
946 	 * causes us to loop sending 0-length segments to the protocol.
947 	 *
948 	 * A SOCK_STREAM MSG_EOR check is not needed here: this routine
949 	 * handles only SOCK_DGRAM sockets, as asserted above.
950 	 */
951 	if (resid < 0) {
952 		error = EINVAL;
953 		goto out;
954 	}
955 
956 	dontroute =
957 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
958 	if (td != NULL)
959 		td->td_proc->p_stats->p_ru.ru_msgsnd++;
960 	if (control != NULL)
961 		clen = control->m_len;
962 
963 	SOCKBUF_LOCK(&so->so_snd);
964 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
965 		SOCKBUF_UNLOCK(&so->so_snd);
966 		error = EPIPE;
967 		goto out;
968 	}
969 	if (so->so_error) {
970 		error = so->so_error;
971 		so->so_error = 0;
972 		SOCKBUF_UNLOCK(&so->so_snd);
973 		goto out;
974 	}
975 	if ((so->so_state & SS_ISCONNECTED) == 0) {
976 		/*
977 		 * `sendto' and `sendmsg' are allowed on a connection-
978 		 * based socket if it supports implied connect.
979 		 * Return ENOTCONN if not connected and no address is
980 		 * supplied.
981 		 */
982 		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
983 		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
984 			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
985 			    !(resid == 0 && clen != 0)) {
986 				SOCKBUF_UNLOCK(&so->so_snd);
987 				error = ENOTCONN;
988 				goto out;
989 			}
990 		} else if (addr == NULL) {
991 			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
992 				error = ENOTCONN;
993 			else
994 				error = EDESTADDRREQ;
995 			SOCKBUF_UNLOCK(&so->so_snd);
996 			goto out;
997 		}
998 	}
999 
1000 	/*
1001 	 * Do we need MSG_OOB support in SOCK_DGRAM?  The signedness of the
1002 	 * space arithmetic here may be a problem and need fixing.
1003 	 */
1004 	space = sbspace(&so->so_snd);
1005 	if (flags & MSG_OOB)
1006 		space += 1024;
1007 	space -= clen;
1008 	SOCKBUF_UNLOCK(&so->so_snd);	/* Unlock before any error return. */
1009 	if (resid > space) {
1010 		error = EMSGSIZE;
1011 		goto out;
1012 	}
1013 	if (uio == NULL) {
1014 		resid = 0;
1015 		if (flags & MSG_EOR)
1016 			top->m_flags |= M_EOR;
1017 	} else {
1018 		error = sosend_copyin(uio, &top, atomic, &space, flags);
1019 		if (error)
1020 			goto out;
1021 		resid = uio->uio_resid;
1022 	}
1023 	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1024 	/*
1025 	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1026 	 * than with.
1027 	 */
1028 	if (dontroute) {
1029 		SOCK_LOCK(so);
1030 		so->so_options |= SO_DONTROUTE;
1031 		SOCK_UNLOCK(so);
1032 	}
1033 	/*
1034 	 * XXX all the SBS_CANTSENDMORE checks previously
1035 	 * done could be out of date.  We could have received
1036 	 * a reset packet in an interrupt or maybe we slept
1037 	 * while doing page faults in uiomove() etc. We could
1038 	 * probably recheck again inside the locking protection
1039 	 * here, but there are probably other places that this
1040 	 * also happens.  We must rethink this.
1041 	 */
1042 	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1043 	    (flags & MSG_OOB) ? PRUS_OOB :
1044 	/*
1045 	 * If the user set MSG_EOF, the protocol
1046 	 * understands this flag and nothing left to
1047 	 * send then use PRU_SEND_EOF instead of PRU_SEND.
1048 	 */
1049 	    ((flags & MSG_EOF) &&
1050 	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1051 	     (resid <= 0)) ?
1052 		PRUS_EOF :
1053 		/* If there is more to send set PRUS_MORETOCOME */
1054 		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1055 		top, addr, control, td);
1056 	if (dontroute) {
1057 		SOCK_LOCK(so);
1058 		so->so_options &= ~SO_DONTROUTE;
1059 		SOCK_UNLOCK(so);
1060 	}
1061 	clen = 0;
1062 	control = NULL;
1063 	top = NULL;
1064 out:
1065 	if (top != NULL)
1066 		m_freem(top);
1067 	if (control != NULL)
1068 		m_freem(control);
1069 	return (error);
1070 }
1071 
1072 /*
1073  * Send on a socket.
1074  * If send must go all at once and message is larger than
1075  * send buffering, then hard error.
1076  * Lock against other senders.
1077  * If must go all at once and not enough room now, then
1078  * inform user that this would block and do nothing.
1079  * Otherwise, if nonblocking, send as much as possible.
1080  * The data to be sent is described by "uio" if nonzero,
1081  * otherwise by the mbuf chain "top" (which must be null
1082  * if uio is not).  Data provided in mbuf chain must be small
1083  * enough to send all at once.
1084  *
1085  * Returns nonzero on error, timeout or signal; callers
1086  * must check for short counts if EINTR/ERESTART are returned.
1087  * Data and control buffers are freed on return.
1088  */
1089 #define	snderr(errno)	{ error = (errno); goto release; }
1090 int
1091 sosend(so, addr, uio, top, control, flags, td)
1092 	struct socket *so;
1093 	struct sockaddr *addr;
1094 	struct uio *uio;
1095 	struct mbuf *top;
1096 	struct mbuf *control;
1097 	int flags;
1098 	struct thread *td;
1099 {
1100 	long space, resid;
1101 	int clen = 0, error, dontroute;
1102 	int atomic = sosendallatonce(so) || top;
1103 
1104 	if (uio != NULL)
1105 		resid = uio->uio_resid;
1106 	else
1107 		resid = top->m_pkthdr.len;
1108 	/*
1109 	 * In theory resid should be unsigned.
1110 	 * However, space must be signed, as it might be less than 0
1111 	 * if we over-committed, and we must use a signed comparison
1112 	 * of space and resid.  On the other hand, a negative resid
1113 	 * causes us to loop sending 0-length segments to the protocol.
1114 	 *
1115 	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1116 	 * type sockets since that's an error.
1117 	 */
1118 	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1119 		error = EINVAL;
1120 		goto out;
1121 	}
1122 
1123 	dontroute =
1124 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1125 	    (so->so_proto->pr_flags & PR_ATOMIC);
1126 	if (td != NULL)
1127 		td->td_proc->p_stats->p_ru.ru_msgsnd++;
1128 	if (control != NULL)
1129 		clen = control->m_len;
1130 
1131 	SOCKBUF_LOCK(&so->so_snd);
1132 restart:
1133 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1134 	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1135 	if (error)
1136 		goto out_locked;
1137 	do {
1138 		SOCKBUF_LOCK_ASSERT(&so->so_snd);
1139 		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
1140 			snderr(EPIPE);
1141 		if (so->so_error) {
1142 			error = so->so_error;
1143 			so->so_error = 0;
1144 			goto release;
1145 		}
1146 		if ((so->so_state & SS_ISCONNECTED) == 0) {
1147 			/*
1148 			 * `sendto' and `sendmsg' are allowed on a connection-
1149 			 * based socket if it supports implied connect.
1150 			 * Return ENOTCONN if not connected and no address is
1151 			 * supplied.
1152 			 */
1153 			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1154 			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1155 				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1156 				    !(resid == 0 && clen != 0))
1157 					snderr(ENOTCONN);
1158 			} else if (addr == NULL)
1159 			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
1160 				   ENOTCONN : EDESTADDRREQ);
1161 		}
1162 		space = sbspace(&so->so_snd);
1163 		if (flags & MSG_OOB)
1164 			space += 1024;
1165 		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1166 		    clen > so->so_snd.sb_hiwat)
1167 			snderr(EMSGSIZE);
1168 		if (space < resid + clen &&
1169 		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1170 			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
1171 				snderr(EWOULDBLOCK);
1172 			sbunlock(&so->so_snd);
1173 			error = sbwait(&so->so_snd);
1174 			if (error)
1175 				goto out_locked;
1176 			goto restart;
1177 		}
1178 		SOCKBUF_UNLOCK(&so->so_snd);
1179 		space -= clen;
1180 		do {
1181 			if (uio == NULL) {
1182 				resid = 0;
1183 				if (flags & MSG_EOR)
1184 					top->m_flags |= M_EOR;
1185 			} else {
1186 				error = sosend_copyin(uio, &top, atomic,
1187 				    &space, flags);
1188 				if (error != 0) {
1189 					SOCKBUF_LOCK(&so->so_snd);
1190 					goto release;
1191 				}
1192 				resid = uio->uio_resid;
1193 			}
1194 			if (dontroute) {
1195 				SOCK_LOCK(so);
1196 				so->so_options |= SO_DONTROUTE;
1197 				SOCK_UNLOCK(so);
1198 			}
1199 			/*
1200 			 * XXX all the SBS_CANTSENDMORE checks previously
1201 			 * done could be out of date.  We could have received
1202 			 * a reset packet in an interrupt or maybe we slept
1203 			 * while doing page faults in uiomove() etc. We could
1204 			 * probably recheck again inside the locking protection
1205 			 * here, but there are probably other places that this
1206 			 * also happens.  We must rethink this.
1207 			 */
1208 			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1209 			    (flags & MSG_OOB) ? PRUS_OOB :
1210 			/*
1211 			 * If the user set MSG_EOF, the protocol
1212 			 * understands this flag and nothing left to
1213 			 * send then use PRU_SEND_EOF instead of PRU_SEND.
1214 			 */
1215 			    ((flags & MSG_EOF) &&
1216 			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1217 			     (resid <= 0)) ?
1218 				PRUS_EOF :
1219 			/* If there is more to send set PRUS_MORETOCOME */
1220 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1221 			    top, addr, control, td);
1222 			if (dontroute) {
1223 				SOCK_LOCK(so);
1224 				so->so_options &= ~SO_DONTROUTE;
1225 				SOCK_UNLOCK(so);
1226 			}
1227 			clen = 0;
1228 			control = NULL;
1229 			top = NULL;
1230 			if (error) {
1231 				SOCKBUF_LOCK(&so->so_snd);
1232 				goto release;
1233 			}
1234 		} while (resid && space > 0);
1235 		SOCKBUF_LOCK(&so->so_snd);
1236 	} while (resid);
1237 
1238 release:
1239 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1240 	sbunlock(&so->so_snd);
1241 out_locked:
1242 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1243 	SOCKBUF_UNLOCK(&so->so_snd);
1244 out:
1245 	if (top != NULL)
1246 		m_freem(top);
1247 	if (control != NULL)
1248 		m_freem(control);
1249 	return (error);
1250 }
1251 #undef snderr
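
#if 0
/*
 * Illustrative sketch (not compiled): a kernel caller sending a buffer
 * with sosend() through a kernel-space uio, per the interface comment
 * above.  The wrapper is hypothetical; note that callers must check for
 * short counts if EINTR/ERESTART is returned.
 */
static int
example_sosend_buf(struct socket *so, void *buf, size_t len,
    struct thread *td)
{
	struct uio auio;
	struct iovec aiov;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	return (sosend(so, NULL, &auio, NULL, NULL, 0, td));
}
#endif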
1252 
1253 /*
1254  * The part of soreceive() that implements reading non-inline out-of-band
1255  * data from a socket.  For more complete comments, see soreceive(), from
1256  * which this code originated.
1257  *
1258  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1259  * unable to return an mbuf chain to the caller.
1260  */
1261 static int
1262 soreceive_rcvoob(so, uio, flags)
1263 	struct socket *so;
1264 	struct uio *uio;
1265 	int flags;
1266 {
1267 	struct protosw *pr = so->so_proto;
1268 	struct mbuf *m;
1269 	int error;
1270 
1271 	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1272 
1273 	m = m_get(M_TRYWAIT, MT_DATA);
1274 	if (m == NULL)
1275 		return (ENOBUFS);
1276 	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1277 	if (error)
1278 		goto bad;
1279 	do {
1280 #ifdef ZERO_COPY_SOCKETS
1281 		if (so_zero_copy_receive) {
1282 			int disposable;
1283 
1284 			if ((m->m_flags & M_EXT)
1285 			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1286 				disposable = 1;
1287 			else
1288 				disposable = 0;
1289 
1290 			error = uiomoveco(mtod(m, void *),
1291 					  min(uio->uio_resid, m->m_len),
1292 					  uio, disposable);
1293 		} else
1294 #endif /* ZERO_COPY_SOCKETS */
1295 		error = uiomove(mtod(m, void *),
1296 		    (int) min(uio->uio_resid, m->m_len), uio);
1297 		m = m_free(m);
1298 	} while (uio->uio_resid && error == 0 && m);
1299 bad:
1300 	if (m != NULL)
1301 		m_freem(m);
1302 	return (error);
1303 }
1304 
1305 /*
1306  * Following replacement or removal of the first mbuf on the first mbuf chain
1307  * of a socket buffer, push necessary state changes back into the socket
1308  * buffer so that other consumers see the values consistently.  'nextrecord'
1309  * is the callers locally stored value of the original value of
1310  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1311  * NOTE: 'nextrecord' may be NULL.
1312  */
1313 static __inline void
1314 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1315 {
1316 
1317 	SOCKBUF_LOCK_ASSERT(sb);
1318 	/*
1319 	 * First, update for the new value of nextrecord.  If necessary, make
1320 	 * it the first record.
1321 	 */
1322 	if (sb->sb_mb != NULL)
1323 		sb->sb_mb->m_nextpkt = nextrecord;
1324 	else
1325 		sb->sb_mb = nextrecord;
1326 
1327 	/*
1328 	 * Now update any dependent socket buffer fields to reflect the new
1329 	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1330 	 * addition of a second clause that takes care of the case where
1331 	 * sb_mb has been updated, but remains the last record.
1332 	 */
1333 	if (sb->sb_mb == NULL) {
1334 		sb->sb_mbtail = NULL;
1335 		sb->sb_lastrecord = NULL;
1336 	} else if (sb->sb_mb->m_nextpkt == NULL)
1337 		sb->sb_lastrecord = sb->sb_mb;
1338 }
1339 
1340 
1341 /*
1342  * Implement receive operations on a socket.
1343  * We depend on the way that records are added to the sockbuf
1344  * by sbappend*.  In particular, each record (mbufs linked through m_next)
1345  * must begin with an address if the protocol so specifies,
1346  * followed by an optional mbuf or mbufs containing ancillary data,
1347  * and then zero or more mbufs of data.
1348  * In order to avoid holding the socket buffer mutex across the potentially
1349  * blocking copy, we release it while doing the actual copy to user space.
1350  * Although the sockbuf is locked, new data may still be appended,
1351  * and thus we must maintain consistency of the sockbuf during that time.
1352  *
1353  * The caller may receive the data as a single mbuf chain by supplying
1354  * an mbuf **mp0 for use in returning the chain.  The uio is then used
1355  * only for the count in uio_resid.
1356  */
1357 int
1358 soreceive(so, psa, uio, mp0, controlp, flagsp)
1359 	struct socket *so;
1360 	struct sockaddr **psa;
1361 	struct uio *uio;
1362 	struct mbuf **mp0;
1363 	struct mbuf **controlp;
1364 	int *flagsp;
1365 {
1366 	struct mbuf *m, **mp;
1367 	int flags, len, error, offset;
1368 	struct protosw *pr = so->so_proto;
1369 	struct mbuf *nextrecord;
1370 	int moff, type = 0;
1371 	int orig_resid = uio->uio_resid;
1372 
1373 	mp = mp0;
1374 	if (psa != NULL)
1375 		*psa = NULL;
1376 	if (controlp != NULL)
1377 		*controlp = NULL;
1378 	if (flagsp != NULL)
1379 		flags = *flagsp &~ MSG_EOR;
1380 	else
1381 		flags = 0;
1382 	if (flags & MSG_OOB)
1383 		return (soreceive_rcvoob(so, uio, flags));
1384 	if (mp != NULL)
1385 		*mp = NULL;
1386 	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1387 	    && uio->uio_resid)
1388 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1389 
1390 	SOCKBUF_LOCK(&so->so_rcv);
1391 restart:
1392 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1393 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1394 	if (error)
1395 		goto out;
1396 
1397 	m = so->so_rcv.sb_mb;
1398 	/*
1399 	 * If we have less data than requested, block awaiting more
1400 	 * (subject to any timeout) if:
1401 	 *   1. the current count is less than the low water mark, or
1402 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1403 	 *	receive operation at once if we block (resid <= hiwat).
1404 	 *   3. MSG_DONTWAIT is not set
1405 	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1406 	 * we have to do the receive in sections, and thus risk returning
1407 	 * a short count if a timeout or signal occurs after we start.
1408 	 */
1409 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1410 	    so->so_rcv.sb_cc < uio->uio_resid) &&
1411 	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1412 	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1413 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1414 		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1415 		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1416 		    m, so->so_rcv.sb_cc));
1417 		if (so->so_error) {
1418 			if (m != NULL)
1419 				goto dontblock;
1420 			error = so->so_error;
1421 			if ((flags & MSG_PEEK) == 0)
1422 				so->so_error = 0;
1423 			goto release;
1424 		}
1425 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1426 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1427 			if (m)
1428 				goto dontblock;
1429 			else
1430 				goto release;
1431 		}
1432 		for (; m != NULL; m = m->m_next)
1433 		if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1434 				m = so->so_rcv.sb_mb;
1435 				goto dontblock;
1436 			}
1437 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1438 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1439 			error = ENOTCONN;
1440 			goto release;
1441 		}
1442 		if (uio->uio_resid == 0)
1443 			goto release;
1444 		if ((so->so_state & SS_NBIO) ||
1445 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1446 			error = EWOULDBLOCK;
1447 			goto release;
1448 		}
1449 		SBLASTRECORDCHK(&so->so_rcv);
1450 		SBLASTMBUFCHK(&so->so_rcv);
1451 		sbunlock(&so->so_rcv);
1452 		error = sbwait(&so->so_rcv);
1453 		if (error)
1454 			goto out;
1455 		goto restart;
1456 	}
1457 dontblock:
1458 	/*
1459 	 * From this point onward, we maintain 'nextrecord' as a cache of the
1460 	 * pointer to the next record in the socket buffer.  We must keep the
1461 	 * various socket buffer pointers and local stack versions of the
1462 	 * pointers in sync, pushing out modifications before dropping the
1463 	 * socket buffer mutex, and re-reading them when picking it up.
1464 	 *
1465 	 * Otherwise, we will race with the network stack appending new data
1466 	 * or records onto the socket buffer by using inconsistent/stale
1467 	 * versions of the field, possibly resulting in socket buffer
1468 	 * corruption.
1469 	 *
1470 	 * By holding the high-level sblock(), we prevent simultaneous
1471 	 * readers from pulling off the front of the socket buffer.
1472 	 */
1473 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1474 	if (uio->uio_td)
1475 		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1476 	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1477 	SBLASTRECORDCHK(&so->so_rcv);
1478 	SBLASTMBUFCHK(&so->so_rcv);
1479 	nextrecord = m->m_nextpkt;
1480 	if (pr->pr_flags & PR_ADDR) {
1481 		KASSERT(m->m_type == MT_SONAME,
1482 		    ("m->m_type == %d", m->m_type));
1483 		orig_resid = 0;
1484 		if (psa != NULL)
1485 			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1486 			    M_NOWAIT);
1487 		if (flags & MSG_PEEK) {
1488 			m = m->m_next;
1489 		} else {
1490 			sbfree(&so->so_rcv, m);
1491 			so->so_rcv.sb_mb = m_free(m);
1492 			m = so->so_rcv.sb_mb;
1493 			sockbuf_pushsync(&so->so_rcv, nextrecord);
1494 		}
1495 	}
1496 
1497 	/*
1498 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1499 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1500 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1501 	 * perform externalization (or freeing if controlp == NULL).
1502 	 */
1503 	if (m != NULL && m->m_type == MT_CONTROL) {
1504 		struct mbuf *cm = NULL, *cmn;
1505 		struct mbuf **cme = &cm;
1506 
1507 		do {
1508 			if (flags & MSG_PEEK) {
1509 				if (controlp != NULL) {
1510 					*controlp = m_copy(m, 0, m->m_len);
1511 					controlp = &(*controlp)->m_next;
1512 				}
1513 				m = m->m_next;
1514 			} else {
1515 				sbfree(&so->so_rcv, m);
1516 				so->so_rcv.sb_mb = m->m_next;
1517 				m->m_next = NULL;
1518 				*cme = m;
1519 				cme = &(*cme)->m_next;
1520 				m = so->so_rcv.sb_mb;
1521 			}
1522 		} while (m != NULL && m->m_type == MT_CONTROL);
1523 		if ((flags & MSG_PEEK) == 0)
1524 			sockbuf_pushsync(&so->so_rcv, nextrecord);
1525 		while (cm != NULL) {
1526 			cmn = cm->m_next;
1527 			cm->m_next = NULL;
1528 			if (pr->pr_domain->dom_externalize != NULL) {
1529 				SOCKBUF_UNLOCK(&so->so_rcv);
1530 				error = (*pr->pr_domain->dom_externalize)
1531 				    (cm, controlp);
1532 				SOCKBUF_LOCK(&so->so_rcv);
1533 			} else if (controlp != NULL)
1534 				*controlp = cm;
1535 			else
1536 				m_freem(cm);
1537 			if (controlp != NULL) {
1538 				orig_resid = 0;
1539 				while (*controlp != NULL)
1540 					controlp = &(*controlp)->m_next;
1541 			}
1542 			cm = cmn;
1543 		}
1544 		if (so->so_rcv.sb_mb)
1545 			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1546 		else
1547 			nextrecord = NULL;
1548 		orig_resid = 0;
1549 	}
1550 	if (m != NULL) {
1551 		if ((flags & MSG_PEEK) == 0) {
1552 			KASSERT(m->m_nextpkt == nextrecord,
1553 			    ("soreceive: post-control, nextrecord !sync"));
1554 			if (nextrecord == NULL) {
1555 				KASSERT(so->so_rcv.sb_mb == m,
1556 				    ("soreceive: post-control, sb_mb!=m"));
1557 				KASSERT(so->so_rcv.sb_lastrecord == m,
1558 				    ("soreceive: post-control, lastrecord!=m"));
1559 			}
1560 		}
1561 		type = m->m_type;
1562 		if (type == MT_OOBDATA)
1563 			flags |= MSG_OOB;
1564 	} else {
1565 		if ((flags & MSG_PEEK) == 0) {
1566 			KASSERT(so->so_rcv.sb_mb == nextrecord,
1567 			    ("soreceive: sb_mb != nextrecord"));
1568 			if (so->so_rcv.sb_mb == NULL) {
1569 				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1570 				    ("soreceive: sb_lastrecord != NULL"));
1571 			}
1572 		}
1573 	}
1574 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1575 	SBLASTRECORDCHK(&so->so_rcv);
1576 	SBLASTMBUFCHK(&so->so_rcv);
1577 
1578 	/*
1579 	 * Now continue to read any data mbufs off of the head of the socket
1580 	 * buffer until the read request is satisfied.  Note that 'type' is
1581 	 * used to store the type of any mbuf reads that have happened so far
1582 	 * such that soreceive() can stop reading if the type changes, which
1583 	 * causes soreceive() to return only one of regular data and inline
1584 	 * out-of-band data in a single socket receive operation.
1585 	 */
1586 	moff = 0;
1587 	offset = 0;
1588 	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1589 		/*
1590 		 * If the type of mbuf has changed since the last mbuf
1591 		 * examined ('type'), end the receive operation.
1592 		 */
1593 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1594 		if (m->m_type == MT_OOBDATA) {
1595 			if (type != MT_OOBDATA)
1596 				break;
1597 		} else if (type == MT_OOBDATA)
1598 			break;
1599 		else
1600 		    KASSERT(m->m_type == MT_DATA,
1601 			("m->m_type == %d", m->m_type));
1602 		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1603 		len = uio->uio_resid;
1604 		if (so->so_oobmark && len > so->so_oobmark - offset)
1605 			len = so->so_oobmark - offset;
1606 		if (len > m->m_len - moff)
1607 			len = m->m_len - moff;
1608 		/*
1609 		 * If mp is set, just pass back the mbufs.
1610 		 * Otherwise copy them out via the uio, then free.
1611 		 * Sockbuf must be consistent here (points to current mbuf,
1612 		 * it points to next record) when we drop priority;
1613 		 * we must note any additions to the sockbuf when we
1614 		 * block interrupts again.
1615 		 */
1616 		if (mp == NULL) {
1617 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1618 			SBLASTRECORDCHK(&so->so_rcv);
1619 			SBLASTMBUFCHK(&so->so_rcv);
1620 			SOCKBUF_UNLOCK(&so->so_rcv);
1621 #ifdef ZERO_COPY_SOCKETS
1622 			if (so_zero_copy_receive) {
1623 				int disposable;
1624 
1625 				if ((m->m_flags & M_EXT)
1626 				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1627 					disposable = 1;
1628 				else
1629 					disposable = 0;
1630 
1631 				error = uiomoveco(mtod(m, char *) + moff,
1632 						  (int)len, uio,
1633 						  disposable);
1634 			} else
1635 #endif /* ZERO_COPY_SOCKETS */
1636 			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1637 			SOCKBUF_LOCK(&so->so_rcv);
1638 			if (error)
1639 				goto release;
1640 		} else
1641 			uio->uio_resid -= len;
1642 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1643 		if (len == m->m_len - moff) {
1644 			if (m->m_flags & M_EOR)
1645 				flags |= MSG_EOR;
1646 			if (flags & MSG_PEEK) {
1647 				m = m->m_next;
1648 				moff = 0;
1649 			} else {
1650 				nextrecord = m->m_nextpkt;
1651 				sbfree(&so->so_rcv, m);
1652 				if (mp != NULL) {
1653 					*mp = m;
1654 					mp = &m->m_next;
1655 					so->so_rcv.sb_mb = m = m->m_next;
1656 					*mp = NULL;
1657 				} else {
1658 					so->so_rcv.sb_mb = m_free(m);
1659 					m = so->so_rcv.sb_mb;
1660 				}
1661 				sockbuf_pushsync(&so->so_rcv, nextrecord);
1662 				SBLASTRECORDCHK(&so->so_rcv);
1663 				SBLASTMBUFCHK(&so->so_rcv);
1664 			}
1665 		} else {
1666 			if (flags & MSG_PEEK)
1667 				moff += len;
1668 			else {
1669 				if (mp != NULL) {
1670 					int copy_flag;
1671 
1672 					if (flags & MSG_DONTWAIT)
1673 						copy_flag = M_DONTWAIT;
1674 					else
1675 						copy_flag = M_TRYWAIT;
1676 					if (copy_flag == M_TRYWAIT)
1677 						SOCKBUF_UNLOCK(&so->so_rcv);
1678 					*mp = m_copym(m, 0, len, copy_flag);
1679 					if (copy_flag == M_TRYWAIT)
1680 						SOCKBUF_LOCK(&so->so_rcv);
1681 					if (*mp == NULL) {
1682 						/*
1683 						 * m_copym() couldn't allocate an mbuf.
1684 						 * Adjust uio_resid back (it was adjusted
1685 						 * down by len bytes, which we didn't end
1686 						 * up "copying" over).
1687 						 */
1688 						uio->uio_resid += len;
1689 						break;
1690 					}
1691 				}
1692 				m->m_data += len;
1693 				m->m_len -= len;
1694 				so->so_rcv.sb_cc -= len;
1695 			}
1696 		}
1697 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1698 		if (so->so_oobmark) {
1699 			if ((flags & MSG_PEEK) == 0) {
1700 				so->so_oobmark -= len;
1701 				if (so->so_oobmark == 0) {
1702 					so->so_rcv.sb_state |= SBS_RCVATMARK;
1703 					break;
1704 				}
1705 			} else {
1706 				offset += len;
1707 				if (offset == so->so_oobmark)
1708 					break;
1709 			}
1710 		}
1711 		if (flags & MSG_EOR)
1712 			break;
1713 		/*
1714 		 * If the MSG_WAITALL flag is set (for non-atomic socket),
1715 		 * we must not quit until "uio->uio_resid == 0" or an error
1716 		 * termination.  If a signal/timeout occurs, return
1717 		 * with a short count but without error.
1718 		 * Keep sockbuf locked against other readers.
1719 		 */
1720 		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1721 		    !sosendallatonce(so) && nextrecord == NULL) {
1722 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1723 			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1724 				break;
1725 			/*
1726 			 * Notify the protocol that some data has been
1727 			 * drained before blocking.
1728 			 */
1729 			if (pr->pr_flags & PR_WANTRCVD) {
1730 				SOCKBUF_UNLOCK(&so->so_rcv);
1731 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1732 				SOCKBUF_LOCK(&so->so_rcv);
1733 			}
1734 			SBLASTRECORDCHK(&so->so_rcv);
1735 			SBLASTMBUFCHK(&so->so_rcv);
1736 			error = sbwait(&so->so_rcv);
1737 			if (error)
1738 				goto release;
1739 			m = so->so_rcv.sb_mb;
1740 			if (m != NULL)
1741 				nextrecord = m->m_nextpkt;
1742 		}
1743 	}
1744 
1745 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1746 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1747 		flags |= MSG_TRUNC;
1748 		if ((flags & MSG_PEEK) == 0)
1749 			(void) sbdroprecord_locked(&so->so_rcv);
1750 	}
1751 	if ((flags & MSG_PEEK) == 0) {
1752 		if (m == NULL) {
1753 			/*
1754 			 * First part is an inline SB_EMPTY_FIXUP().  Second
1755 			 * part makes sure sb_lastrecord is up-to-date if
1756 			 * there is still data in the socket buffer.
1757 			 */
1758 			so->so_rcv.sb_mb = nextrecord;
1759 			if (so->so_rcv.sb_mb == NULL) {
1760 				so->so_rcv.sb_mbtail = NULL;
1761 				so->so_rcv.sb_lastrecord = NULL;
1762 			} else if (nextrecord->m_nextpkt == NULL)
1763 				so->so_rcv.sb_lastrecord = nextrecord;
1764 		}
1765 		SBLASTRECORDCHK(&so->so_rcv);
1766 		SBLASTMBUFCHK(&so->so_rcv);
1767 		/*
1768 		 * If soreceive() is being done from the socket callback, we
1769 		 * need not generate an ACK to the peer to update the window,
1770 		 * since an ACK will be generated on return to TCP.
1771 		 */
1772 		if (!(flags & MSG_SOCALLBCK) &&
1773 		    (pr->pr_flags & PR_WANTRCVD)) {
1774 			SOCKBUF_UNLOCK(&so->so_rcv);
1775 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1776 			SOCKBUF_LOCK(&so->so_rcv);
1777 		}
1778 	}
1779 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1780 	if (orig_resid == uio->uio_resid && orig_resid &&
1781 	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1782 		sbunlock(&so->so_rcv);
1783 		goto restart;
1784 	}
1785 
1786 	if (flagsp != NULL)
1787 		*flagsp |= flags;
1788 release:
1789 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1790 	sbunlock(&so->so_rcv);
1791 out:
1792 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1793 	SOCKBUF_UNLOCK(&so->so_rcv);
1794 	return (error);
1795 }
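/*
 * Illustrative sketch (not part of the original file): how a kernel
 * consumer might drain up to "buflen" bytes from a socket with
 * soreceive().  The helper name and its arguments are assumptions for
 * the example; the uio setup follows the conventions used elsewhere in
 * the kernel.
 */
#if 0
static int
example_kernel_recv(struct socket *so, void *buf, size_t buflen,
    size_t *donep)
{
	struct uio auio;
	struct iovec aiov;
	int error, flags;

	aiov.iov_base = buf;
	aiov.iov_len = buflen;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;
	auio.uio_segflg = UIO_SYSSPACE;	/* destination is kernel memory */
	auio.uio_rw = UIO_READ;
	auio.uio_td = curthread;

	flags = 0;
	error = soreceive(so, NULL, &auio, NULL, NULL, &flags);
	if (error == 0 && donep != NULL)
		*donep = buflen - auio.uio_resid;
	return (error);
}
#endif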
1796 
1797 int
1798 soshutdown(so, how)
1799 	struct socket *so;
1800 	int how;
1801 {
1802 	struct protosw *pr = so->so_proto;
1803 
1804 	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1805 		return (EINVAL);
1806 
1807 	if (how != SHUT_WR)
1808 		sorflush(so);
1809 	if (how != SHUT_RD)
1810 		return ((*pr->pr_usrreqs->pru_shutdown)(so));
1811 	return (0);
1812 }
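/*
 * Illustrative sketch (assumption, not in the original file): a kernel
 * consumer performing a half-close.  SHUT_WR skips sorflush() and goes
 * straight to the protocol's pru_shutdown, so pending receive data is
 * preserved.
 */
#if 0
static int
example_half_close(struct socket *so)
{

	return (soshutdown(so, SHUT_WR));
}
#endif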
1813 
1814 void
1815 sorflush(so)
1816 	struct socket *so;
1817 {
1818 	struct sockbuf *sb = &so->so_rcv;
1819 	struct protosw *pr = so->so_proto;
1820 	struct sockbuf asb;
1821 
1822 	/*
1823 	 * XXXRW: This is quite ugly.  Previously, this code made a copy of
1824 	 * the socket buffer, then zero'd the original to clear the buffer
1825 	 * fields.  However, with mutexes in the socket buffer, this causes
1826 	 * problems.  We only clear the zeroable bits of the original;
1827 	 * however, we have to initialize and destroy the mutex in the copy
1828 	 * so that dom_dispose() and sbrelease() can lock it as needed.
1829 	 */
1830 	SOCKBUF_LOCK(sb);
1831 	sb->sb_flags |= SB_NOINTR;
1832 	(void) sblock(sb, M_WAITOK);
1833 	/*
1834 	 * socantrcvmore_locked() drops the socket buffer mutex so that it
1835 	 * can safely perform wakeups.  Re-acquire the mutex before
1836 	 * continuing.
1837 	 */
1838 	socantrcvmore_locked(so);
1839 	SOCKBUF_LOCK(sb);
1840 	sbunlock(sb);
1841 	/*
1842 	 * Invalidate/clear most of the sockbuf structure, but leave
1843 	 * selinfo and mutex data unchanged.
1844 	 */
1845 	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1846 	bcopy(&sb->sb_startzero, &asb.sb_startzero,
1847 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1848 	bzero(&sb->sb_startzero,
1849 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1850 	SOCKBUF_UNLOCK(sb);
1851 
1852 	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1853 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1854 		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
1855 	sbrelease(&asb, so);
1856 	SOCKBUF_LOCK_DESTROY(&asb);
1857 }
1858 
1859 /*
1860  * Perhaps this routine, and sooptcopyout(), below, ought to come in
1861  * an additional variant to handle the case where the option value needs
1862  * to be some kind of integer, but not a specific size.
1863  * In addition to their use here, these functions are also called by the
1864  * protocol-level pr_ctloutput() routines.
1865  */
1866 int
1867 sooptcopyin(sopt, buf, len, minlen)
1868 	struct	sockopt *sopt;
1869 	void	*buf;
1870 	size_t	len;
1871 	size_t	minlen;
1872 {
1873 	size_t	valsize;
1874 
1875 	/*
1876 	 * If the user gives us more than we wanted, we ignore it,
1877 	 * but if we don't get the minimum length the caller
1878 	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
1879 	 * is set to however much we actually retrieved.
1880 	 */
1881 	if ((valsize = sopt->sopt_valsize) < minlen)
1882 		return EINVAL;
1883 	if (valsize > len)
1884 		sopt->sopt_valsize = valsize = len;
1885 
1886 	if (sopt->sopt_td != NULL)
1887 		return (copyin(sopt->sopt_val, buf, valsize));
1888 
1889 	bcopy(sopt->sopt_val, buf, valsize);
1890 	return (0);
1891 }
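/*
 * Illustrative sketch (assumption): the typical way a protocol-level
 * pr_ctloutput() set path uses sooptcopyin() to fetch an integer-sized
 * option value; the helper name is invented for the example.
 */
#if 0
static int
example_ctloutput_set(struct sockopt *sopt)
{
	int error, optval;

	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error)
		return (error);
	/* optval now holds the caller's value; act on it here. */
	return (0);
}
#endif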
1892 
1893 /*
1894  * Kernel version of setsockopt(2).
1895  * XXX: optlen is size_t, not socklen_t
1896  */
1897 int
1898 so_setsockopt(struct socket *so, int level, int optname, void *optval,
1899     size_t optlen)
1900 {
1901 	struct sockopt sopt;
1902 
1903 	sopt.sopt_level = level;
1904 	sopt.sopt_name = optname;
1905 	sopt.sopt_dir = SOPT_SET;
1906 	sopt.sopt_val = optval;
1907 	sopt.sopt_valsize = optlen;
1908 	sopt.sopt_td = NULL;
1909 	return (sosetopt(so, &sopt));
1910 }
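/*
 * Illustrative sketch (assumption): enabling SO_REUSEADDR on a socket
 * created inside the kernel, e.g. by an NFS-style consumer; the helper
 * name is invented for the example.
 */
#if 0
static int
example_enable_reuseaddr(struct socket *so)
{
	int on = 1;

	return (so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &on,
	    sizeof(on)));
}
#endif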
1911 
1912 int
1913 sosetopt(so, sopt)
1914 	struct socket *so;
1915 	struct sockopt *sopt;
1916 {
1917 	int	error, optval;
1918 	struct	linger l;
1919 	struct	timeval tv;
1920 	u_long  val;
1921 #ifdef MAC
1922 	struct mac extmac;
1923 #endif
1924 
1925 	error = 0;
1926 	if (sopt->sopt_level != SOL_SOCKET) {
1927 		if (so->so_proto && so->so_proto->pr_ctloutput)
1928 			return ((*so->so_proto->pr_ctloutput)
1929 				  (so, sopt));
1930 		error = ENOPROTOOPT;
1931 	} else {
1932 		switch (sopt->sopt_name) {
1933 #ifdef INET
1934 		case SO_ACCEPTFILTER:
1935 			error = do_setopt_accept_filter(so, sopt);
1936 			if (error)
1937 				goto bad;
1938 			break;
1939 #endif
1940 		case SO_LINGER:
1941 			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
1942 			if (error)
1943 				goto bad;
1944 
1945 			SOCK_LOCK(so);
1946 			so->so_linger = l.l_linger;
1947 			if (l.l_onoff)
1948 				so->so_options |= SO_LINGER;
1949 			else
1950 				so->so_options &= ~SO_LINGER;
1951 			SOCK_UNLOCK(so);
1952 			break;
1953 
1954 		case SO_DEBUG:
1955 		case SO_KEEPALIVE:
1956 		case SO_DONTROUTE:
1957 		case SO_USELOOPBACK:
1958 		case SO_BROADCAST:
1959 		case SO_REUSEADDR:
1960 		case SO_REUSEPORT:
1961 		case SO_OOBINLINE:
1962 		case SO_TIMESTAMP:
1963 		case SO_BINTIME:
1964 		case SO_NOSIGPIPE:
1965 			error = sooptcopyin(sopt, &optval, sizeof optval,
1966 					    sizeof optval);
1967 			if (error)
1968 				goto bad;
1969 			SOCK_LOCK(so);
1970 			if (optval)
1971 				so->so_options |= sopt->sopt_name;
1972 			else
1973 				so->so_options &= ~sopt->sopt_name;
1974 			SOCK_UNLOCK(so);
1975 			break;
1976 
1977 		case SO_SNDBUF:
1978 		case SO_RCVBUF:
1979 		case SO_SNDLOWAT:
1980 		case SO_RCVLOWAT:
1981 			error = sooptcopyin(sopt, &optval, sizeof optval,
1982 					    sizeof optval);
1983 			if (error)
1984 				goto bad;
1985 
1986 			/*
1987 			 * Values < 1 make no sense for any of these
1988 			 * options, so disallow them.
1989 			 */
1990 			if (optval < 1) {
1991 				error = EINVAL;
1992 				goto bad;
1993 			}
1994 
1995 			switch (sopt->sopt_name) {
1996 			case SO_SNDBUF:
1997 			case SO_RCVBUF:
1998 				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
1999 				    &so->so_snd : &so->so_rcv, (u_long)optval,
2000 				    so, curthread) == 0) {
2001 					error = ENOBUFS;
2002 					goto bad;
2003 				}
2004 				break;
2005 
2006 			/*
2007 			 * Make sure the low-water is never greater than
2008 			 * the high-water.
2009 			 */
2010 			case SO_SNDLOWAT:
2011 				SOCKBUF_LOCK(&so->so_snd);
2012 				so->so_snd.sb_lowat =
2013 				    (optval > so->so_snd.sb_hiwat) ?
2014 				    so->so_snd.sb_hiwat : optval;
2015 				SOCKBUF_UNLOCK(&so->so_snd);
2016 				break;
2017 			case SO_RCVLOWAT:
2018 				SOCKBUF_LOCK(&so->so_rcv);
2019 				so->so_rcv.sb_lowat =
2020 				    (optval > so->so_rcv.sb_hiwat) ?
2021 				    so->so_rcv.sb_hiwat : optval;
2022 				SOCKBUF_UNLOCK(&so->so_rcv);
2023 				break;
2024 			}
2025 			break;
2026 
2027 		case SO_SNDTIMEO:
2028 		case SO_RCVTIMEO:
2029 #ifdef COMPAT_IA32
2030 			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2031 				struct timeval32 tv32;
2032 
2033 				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2034 				    sizeof tv32);
2035 				CP(tv32, tv, tv_sec);
2036 				CP(tv32, tv, tv_usec);
2037 			} else
2038 #endif
2039 				error = sooptcopyin(sopt, &tv, sizeof tv,
2040 				    sizeof tv);
2041 			if (error)
2042 				goto bad;
2043 
2044 			/* assert(hz > 0); */
2045 			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2046 			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2047 				error = EDOM;
2048 				goto bad;
2049 			}
2050 			/* assert(tick > 0); */
2051 			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2052 			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2053 			if (val > INT_MAX) {
2054 				error = EDOM;
2055 				goto bad;
2056 			}
2057 			if (val == 0 && tv.tv_usec != 0)
2058 				val = 1;
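			/*
			 * Worked example (illustrative): with hz = 1000 and
			 * tick = 1000 usec, a request of 2.5 seconds becomes
			 * val = 2 * 1000 + 500000 / 1000 = 2500 ticks, well
			 * under the INT_MAX limit checked above.
			 */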
2059 
2060 			switch (sopt->sopt_name) {
2061 			case SO_SNDTIMEO:
2062 				so->so_snd.sb_timeo = val;
2063 				break;
2064 			case SO_RCVTIMEO:
2065 				so->so_rcv.sb_timeo = val;
2066 				break;
2067 			}
2068 			break;
2069 
2070 		case SO_LABEL:
2071 #ifdef MAC
2072 			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2073 			    sizeof extmac);
2074 			if (error)
2075 				goto bad;
2076 			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2077 			    so, &extmac);
2078 #else
2079 			error = EOPNOTSUPP;
2080 #endif
2081 			break;
2082 
2083 		default:
2084 			error = ENOPROTOOPT;
2085 			break;
2086 		}
2087 		if (error == 0 && so->so_proto != NULL &&
2088 		    so->so_proto->pr_ctloutput != NULL) {
2089 			(void) ((*so->so_proto->pr_ctloutput)
2090 				  (so, sopt));
2091 		}
2092 	}
2093 bad:
2094 	return (error);
2095 }
2096 
2097 /* Helper routine for getsockopt */
2098 int
2099 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2100 {
2101 	int	error;
2102 	size_t	valsize;
2103 
2104 	error = 0;
2105 
2106 	/*
2107 	 * Documented get behavior is that we always return a value,
2108 	 * possibly truncated to fit in the user's buffer.
2109 	 * Traditional behavior is that we always tell the user
2110 	 * precisely how much we copied, rather than something useful
2111 	 * like the total amount we had available for her.
2112 	 * Note that this interface is not idempotent; the entire answer must
2113 	 * be generated ahead of time.
2114 	 */
2115 	valsize = min(len, sopt->sopt_valsize);
2116 	sopt->sopt_valsize = valsize;
2117 	if (sopt->sopt_val != NULL) {
2118 		if (sopt->sopt_td != NULL)
2119 			error = copyout(buf, sopt->sopt_val, valsize);
2120 		else
2121 			bcopy(buf, sopt->sopt_val, valsize);
2122 	}
2123 	return (error);
2124 }
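/*
 * Illustrative sketch (assumption): a protocol-level pr_ctloutput() get
 * path returning an integer value through sooptcopyout(); the helper
 * name and "value" parameter are invented for the example.
 */
#if 0
static int
example_ctloutput_get(struct sockopt *sopt, int value)
{

	return (sooptcopyout(sopt, &value, sizeof(value)));
}
#endif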
2125 
2126 int
2127 sogetopt(so, sopt)
2128 	struct socket *so;
2129 	struct sockopt *sopt;
2130 {
2131 	int	error, optval;
2132 	struct	linger l;
2133 	struct	timeval tv;
2134 #ifdef MAC
2135 	struct mac extmac;
2136 #endif
2137 
2138 	error = 0;
2139 	if (sopt->sopt_level != SOL_SOCKET) {
2140 		if (so->so_proto && so->so_proto->pr_ctloutput) {
2141 			return ((*so->so_proto->pr_ctloutput)
2142 				  (so, sopt));
2143 		} else
2144 			return (ENOPROTOOPT);
2145 	} else {
2146 		switch (sopt->sopt_name) {
2147 #ifdef INET
2148 		case SO_ACCEPTFILTER:
2149 			error = do_getopt_accept_filter(so, sopt);
2150 			break;
2151 #endif
2152 		case SO_LINGER:
2153 			SOCK_LOCK(so);
2154 			l.l_onoff = so->so_options & SO_LINGER;
2155 			l.l_linger = so->so_linger;
2156 			SOCK_UNLOCK(so);
2157 			error = sooptcopyout(sopt, &l, sizeof l);
2158 			break;
2159 
2160 		case SO_USELOOPBACK:
2161 		case SO_DONTROUTE:
2162 		case SO_DEBUG:
2163 		case SO_KEEPALIVE:
2164 		case SO_REUSEADDR:
2165 		case SO_REUSEPORT:
2166 		case SO_BROADCAST:
2167 		case SO_OOBINLINE:
2168 		case SO_ACCEPTCONN:
2169 		case SO_TIMESTAMP:
2170 		case SO_BINTIME:
2171 		case SO_NOSIGPIPE:
2172 			optval = so->so_options & sopt->sopt_name;
2173 integer:
2174 			error = sooptcopyout(sopt, &optval, sizeof optval);
2175 			break;
2176 
2177 		case SO_TYPE:
2178 			optval = so->so_type;
2179 			goto integer;
2180 
2181 		case SO_ERROR:
2182 			SOCK_LOCK(so);
2183 			optval = so->so_error;
2184 			so->so_error = 0;
2185 			SOCK_UNLOCK(so);
2186 			goto integer;
2187 
2188 		case SO_SNDBUF:
2189 			optval = so->so_snd.sb_hiwat;
2190 			goto integer;
2191 
2192 		case SO_RCVBUF:
2193 			optval = so->so_rcv.sb_hiwat;
2194 			goto integer;
2195 
2196 		case SO_SNDLOWAT:
2197 			optval = so->so_snd.sb_lowat;
2198 			goto integer;
2199 
2200 		case SO_RCVLOWAT:
2201 			optval = so->so_rcv.sb_lowat;
2202 			goto integer;
2203 
2204 		case SO_SNDTIMEO:
2205 		case SO_RCVTIMEO:
2206 			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2207 				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2208 
2209 			tv.tv_sec = optval / hz;
2210 			tv.tv_usec = (optval % hz) * tick;
2211 #ifdef COMPAT_IA32
2212 			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2213 				struct timeval32 tv32;
2214 
2215 				CP(tv, tv32, tv_sec);
2216 				CP(tv, tv32, tv_usec);
2217 				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2218 			} else
2219 #endif
2220 				error = sooptcopyout(sopt, &tv, sizeof tv);
2221 			break;
2222 
2223 		case SO_LABEL:
2224 #ifdef MAC
2225 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2226 			    sizeof(extmac));
2227 			if (error)
2228 				return (error);
2229 			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2230 			    so, &extmac);
2231 			if (error)
2232 				return (error);
2233 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2234 #else
2235 			error = EOPNOTSUPP;
2236 #endif
2237 			break;
2238 
2239 		case SO_PEERLABEL:
2240 #ifdef MAC
2241 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2242 			    sizeof(extmac));
2243 			if (error)
2244 				return (error);
2245 			error = mac_getsockopt_peerlabel(
2246 			    sopt->sopt_td->td_ucred, so, &extmac);
2247 			if (error)
2248 				return (error);
2249 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2250 #else
2251 			error = EOPNOTSUPP;
2252 #endif
2253 			break;
2254 
2255 		case SO_LISTENQLIMIT:
2256 			optval = so->so_qlimit;
2257 			goto integer;
2258 
2259 		case SO_LISTENQLEN:
2260 			optval = so->so_qlen;
2261 			goto integer;
2262 
2263 		case SO_LISTENINCQLEN:
2264 			optval = so->so_incqlen;
2265 			goto integer;
2266 
2267 		default:
2268 			error = ENOPROTOOPT;
2269 			break;
2270 		}
2271 		return (error);
2272 	}
2273 }
2274 
2275 /* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
2276 int
2277 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2278 {
2279 	struct mbuf *m, *m_prev;
2280 	int sopt_size = sopt->sopt_valsize;
2281 
2282 	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2283 	if (m == NULL)
2284 		return ENOBUFS;
2285 	if (sopt_size > MLEN) {
2286 		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2287 		if ((m->m_flags & M_EXT) == 0) {
2288 			m_free(m);
2289 			return ENOBUFS;
2290 		}
2291 		m->m_len = min(MCLBYTES, sopt_size);
2292 	} else {
2293 		m->m_len = min(MLEN, sopt_size);
2294 	}
2295 	sopt_size -= m->m_len;
2296 	*mp = m;
2297 	m_prev = m;
2298 
2299 	while (sopt_size) {
2300 		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2301 		if (m == NULL) {
2302 			m_freem(*mp);
2303 			return ENOBUFS;
2304 		}
2305 		if (sopt_size > MLEN) {
2306 			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2307 			    M_DONTWAIT);
2308 			if ((m->m_flags & M_EXT) == 0) {
2309 				m_freem(m);
2310 				m_freem(*mp);
2311 				return ENOBUFS;
2312 			}
2313 			m->m_len = min(MCLBYTES, sopt_size);
2314 		} else {
2315 			m->m_len = min(MLEN, sopt_size);
2316 		}
2317 		sopt_size -= m->m_len;
2318 		m_prev->m_next = m;
2319 		m_prev = m;
2320 	}
2321 	return (0);
2322 }
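/*
 * Worked example (illustrative, assuming MSIZE 256 and MCLBYTES 2048):
 * a 3000 byte option value makes soopt_getm() build a two-mbuf chain,
 * a 2048 byte cluster followed by a 952 byte cluster, since the 952
 * byte remainder still exceeds MLEN.
 */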
2323 
2324 /* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2325 int
2326 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2327 {
2328 	struct mbuf *m0 = m;
2329 
2330 	if (sopt->sopt_val == NULL)
2331 		return (0);
2332 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2333 		if (sopt->sopt_td != NULL) {
2334 			int error;
2335 
2336 			error = copyin(sopt->sopt_val, mtod(m, char *),
2337 				       m->m_len);
2338 			if (error != 0) {
2339 				m_freem(m0);
2340 				return (error);
2341 			}
2342 		} else
2343 			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2344 		sopt->sopt_valsize -= m->m_len;
2345 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2346 		m = m->m_next;
2347 	}
2348 	if (m != NULL) /* should have been allocated sufficiently at ip6_sooptmcopyin() */
2349 		panic("ip6_sooptmcopyin");
2350 	return (0);
2351 }
2352 
2353 /* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2354 int
2355 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2356 {
2357 	struct mbuf *m0 = m;
2358 	size_t valsize = 0;
2359 
2360 	if (sopt->sopt_val == NULL)
2361 		return (0);
2362 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2363 		if (sopt->sopt_td != NULL) {
2364 			int error;
2365 
2366 			error = copyout(mtod(m, char *), sopt->sopt_val,
2367 				       m->m_len);
2368 			if (error != 0) {
2369 				m_freem(m0);
2370 				return (error);
2371 			}
2372 		} else
2373 			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2374 		sopt->sopt_valsize -= m->m_len;
2375 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2376 		valsize += m->m_len;
2377 		m = m->m_next;
2378 	}
2379 	if (m != NULL) {
2380 		/* enough soopt buffer should have been given from user-land */
2381 		m_freem(m0);
2382 		return (EINVAL);
2383 	}
2384 	sopt->sopt_valsize = valsize;
2385 	return (0);
2386 }
2387 
2388 void
2389 sohasoutofband(so)
2390 	struct socket *so;
2391 {
2392 	if (so->so_sigio != NULL)
2393 		pgsigio(&so->so_sigio, SIGURG, 0);
2394 	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2395 }
2396 
2397 int
2398 sopoll(struct socket *so, int events, struct ucred *active_cred,
2399     struct thread *td)
2400 {
2401 	int revents = 0;
2402 
2403 	SOCKBUF_LOCK(&so->so_snd);
2404 	SOCKBUF_LOCK(&so->so_rcv);
2405 	if (events & (POLLIN | POLLRDNORM))
2406 		if (soreadable(so))
2407 			revents |= events & (POLLIN | POLLRDNORM);
2408 
2409 	if (events & POLLINIGNEOF)
2410 		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2411 		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2412 			revents |= POLLINIGNEOF;
2413 
2414 	if (events & (POLLOUT | POLLWRNORM))
2415 		if (sowriteable(so))
2416 			revents |= events & (POLLOUT | POLLWRNORM);
2417 
2418 	if (events & (POLLPRI | POLLRDBAND))
2419 		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2420 			revents |= events & (POLLPRI | POLLRDBAND);
2421 
2422 	if (revents == 0) {
2423 		if (events &
2424 		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2425 		     POLLRDBAND)) {
2426 			selrecord(td, &so->so_rcv.sb_sel);
2427 			so->so_rcv.sb_flags |= SB_SEL;
2428 		}
2429 
2430 		if (events & (POLLOUT | POLLWRNORM)) {
2431 			selrecord(td, &so->so_snd.sb_sel);
2432 			so->so_snd.sb_flags |= SB_SEL;
2433 		}
2434 	}
2435 
2436 	SOCKBUF_UNLOCK(&so->so_rcv);
2437 	SOCKBUF_UNLOCK(&so->so_snd);
2438 	return (revents);
2439 }
2440 
2441 int
2442 soo_kqfilter(struct file *fp, struct knote *kn)
2443 {
2444 	struct socket *so = kn->kn_fp->f_data;
2445 	struct sockbuf *sb;
2446 
2447 	switch (kn->kn_filter) {
2448 	case EVFILT_READ:
2449 		if (so->so_options & SO_ACCEPTCONN)
2450 			kn->kn_fop = &solisten_filtops;
2451 		else
2452 			kn->kn_fop = &soread_filtops;
2453 		sb = &so->so_rcv;
2454 		break;
2455 	case EVFILT_WRITE:
2456 		kn->kn_fop = &sowrite_filtops;
2457 		sb = &so->so_snd;
2458 		break;
2459 	default:
2460 		return (EINVAL);
2461 	}
2462 
2463 	SOCKBUF_LOCK(sb);
2464 	knlist_add(&sb->sb_sel.si_note, kn, 1);
2465 	sb->sb_flags |= SB_KNOTE;
2466 	SOCKBUF_UNLOCK(sb);
2467 	return (0);
2468 }
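/*
 * Illustrative sketch (assumption, user-level code, not part of this
 * file): the kevent(2) registration that ultimately reaches
 * soo_kqfilter() with EVFILT_READ on a socket descriptor.
 */
#if 0
#include <sys/event.h>
#include <err.h>

static void
example_watch_socket(int kq, int sockfd)
{
	struct kevent kev;

	EV_SET(&kev, sockfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");
}
#endif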
2469 
2470 static void
2471 filt_sordetach(struct knote *kn)
2472 {
2473 	struct socket *so = kn->kn_fp->f_data;
2474 
2475 	SOCKBUF_LOCK(&so->so_rcv);
2476 	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2477 	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2478 		so->so_rcv.sb_flags &= ~SB_KNOTE;
2479 	SOCKBUF_UNLOCK(&so->so_rcv);
2480 }
2481 
2482 /*ARGSUSED*/
2483 static int
2484 filt_soread(struct knote *kn, long hint)
2485 {
2486 	struct socket *so;
2487 
2488 	so = kn->kn_fp->f_data;
2489 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2490 
2491 	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2492 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2493 		kn->kn_flags |= EV_EOF;
2494 		kn->kn_fflags = so->so_error;
2495 		return (1);
2496 	} else if (so->so_error)	/* temporary UDP error */
2497 		return (1);
2498 	else if (kn->kn_sfflags & NOTE_LOWAT)
2499 		return (kn->kn_data >= kn->kn_sdata);
2500 	else
2501 		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2502 }
2503 
2504 static void
2505 filt_sowdetach(struct knote *kn)
2506 {
2507 	struct socket *so = kn->kn_fp->f_data;
2508 
2509 	SOCKBUF_LOCK(&so->so_snd);
2510 	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2511 	if (knlist_empty(&so->so_snd.sb_sel.si_note))
2512 		so->so_snd.sb_flags &= ~SB_KNOTE;
2513 	SOCKBUF_UNLOCK(&so->so_snd);
2514 }
2515 
2516 /*ARGSUSED*/
2517 static int
2518 filt_sowrite(struct knote *kn, long hint)
2519 {
2520 	struct socket *so;
2521 
2522 	so = kn->kn_fp->f_data;
2523 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
2524 	kn->kn_data = sbspace(&so->so_snd);
2525 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2526 		kn->kn_flags |= EV_EOF;
2527 		kn->kn_fflags = so->so_error;
2528 		return (1);
2529 	} else if (so->so_error)	/* temporary UDP error */
2530 		return (1);
2531 	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2532 	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
2533 		return (0);
2534 	else if (kn->kn_sfflags & NOTE_LOWAT)
2535 		return (kn->kn_data >= kn->kn_sdata);
2536 	else
2537 		return (kn->kn_data >= so->so_snd.sb_lowat);
2538 }
2539 
2540 /*ARGSUSED*/
2541 static int
2542 filt_solisten(struct knote *kn, long hint)
2543 {
2544 	struct socket *so = kn->kn_fp->f_data;
2545 
2546 	kn->kn_data = so->so_qlen;
2547 	return (!TAILQ_EMPTY(&so->so_comp));
2548 }
2549 
2550 int
2551 socheckuid(struct socket *so, uid_t uid)
2552 {
2553 
2554 	if (so == NULL)
2555 		return (EPERM);
2556 	if (so->so_cred->cr_uid != uid)
2557 		return (EPERM);
2558 	return (0);
2559 }
2560 
2561 static int
2562 somaxconn_sysctl(SYSCTL_HANDLER_ARGS)
2563 {
2564 	int error;
2565 	int val;
2566 
2567 	val = somaxconn;
2568 	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
2569 	if (error || !req->newptr)
2570 		return (error);
2571 
2572 	if (val < 1 || val > USHRT_MAX)
2573 		return (EINVAL);
2574 
2575 	somaxconn = val;
2576 	return (0);
2577 }
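/*
 * Illustrative usage (assumption, not part of this file): the handler
 * above is expected to back the kern.ipc.somaxconn sysctl, so the
 * listen queue ceiling can be raised from user space with:
 *
 *	sysctl kern.ipc.somaxconn=1024
 */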
2578