xref: /freebsd/sys/kern/uipc_socket.c (revision 3d11b6c8f01e1fca5936a11d6996448467851a94)
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 2004 The FreeBSD Foundation
5  * Copyright (c) 2004-2006 Robert N. M. Watson
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 4. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
32  */
33 
34 /*
35  * Comments on the socket life cycle:
36  *
37  * soalloc() sets up socket layer state for a socket, called only by
38  * socreate() and sonewconn().  Socket layer private.
39  *
40  * sodealloc() tears down socket layer state for a socket, called only by
41  * sofree() and sonewconn().  Socket layer private.
42  *
43  * pru_attach() associates protocol layer state with an allocated socket;
44  * called only once, may fail, aborting socket allocation.  This is called
45  * from socreate() and sonewconn().  Socket layer private.
46  *
47  * pru_detach() disassociates protocol layer state from an attached socket,
48  * and will be called exactly once for sockets in which pru_attach() has
49  * been successfully called.  If pru_attach() returned an error,
50  * pru_detach() will not be called.  Socket layer private.
51  *
52  * socreate() creates a socket and attaches protocol state.  This is a public
53  * interface that may be used by socket layer consumers to create new
54  * sockets.
55  *
56  * sonewconn() creates a socket and attaches protocol state.  This is a
57  * public interface that may be used by protocols to create new sockets when
58  * a new connection is received; such sockets are later made available for
59  * accept() on the listen socket.
60  *
61  * soclose() destroys a socket after possibly waiting for it to disconnect.
62  * This is a public interface that socket consumers should use to close and
63  * release a socket when done with it.
64  *
65  * soabort() destroys a socket without waiting for it to disconnect (used
66  * only for incoming connections that are already partially or fully
67  * connected).  This is used internally by the socket layer when clearing
68  * listen socket queues (due to overflow or close on the listen socket), but
69  * is also a public interface protocols may use to abort connections in
70  * their incomplete listen queues should they no longer be required.  Sockets
71  * placed in completed connection listen queues should not be aborted.
72  *
73  * sofree() will free a socket and its protocol state if all references on
74  * the socket have been released, and is the interface through which the
75  * socket layer attempts to free a socket when a reference is removed.  It
76  * is a socket layer private interface.
77  *
78  * NOTE: In addition to socreate() and soclose(), which provide a single
79  * socket reference to the consumer to be managed as required, there are two
80  * calls to explicitly manage socket references, soref() and sorele().
81  * Currently, these are generally required only when transitioning a socket
82  * from a listen queue to a file descriptor, in order to prevent garbage
83  * collection of the socket at an untimely moment.  For a number of reasons,
84  * these interfaces are not preferred, and should be avoided.
85  *
86  * XXXRW: The behavior of sockets after soclose() but before the last
87  * sorele() is poorly defined.  We can probably entirely eliminate them with
88  * a little work, since consumers are managing references anyway.
89  */
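
/*
 * Example (editorial sketch, not part of the original file): a kernel
 * consumer of the public interfaces above might create and later close
 * a TCP socket as follows, with "td" assumed to be the current thread:
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	...
 *	soclose(so);
 *
 * soclose() releases the single reference handed out by socreate().
 */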
90 
91 #include <sys/cdefs.h>
92 __FBSDID("$FreeBSD$");
93 
94 #include "opt_inet.h"
95 #include "opt_mac.h"
96 #include "opt_zero.h"
97 #include "opt_compat.h"
98 
99 #include <sys/param.h>
100 #include <sys/systm.h>
101 #include <sys/fcntl.h>
102 #include <sys/limits.h>
103 #include <sys/lock.h>
104 #include <sys/mac.h>
105 #include <sys/malloc.h>
106 #include <sys/mbuf.h>
107 #include <sys/mutex.h>
108 #include <sys/domain.h>
109 #include <sys/file.h>			/* for struct knote */
110 #include <sys/kernel.h>
111 #include <sys/event.h>
112 #include <sys/poll.h>
113 #include <sys/proc.h>
114 #include <sys/protosw.h>
115 #include <sys/socket.h>
116 #include <sys/socketvar.h>
117 #include <sys/resourcevar.h>
118 #include <sys/signalvar.h>
119 #include <sys/sysctl.h>
120 #include <sys/uio.h>
121 #include <sys/jail.h>
122 
123 #include <vm/uma.h>
124 
125 #ifdef COMPAT_IA32
126 #include <sys/mount.h>
127 #include <compat/freebsd32/freebsd32.h>
128 
129 extern struct sysentvec ia32_freebsd_sysvec;
130 #endif
131 
132 static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
133 		    int flags);
134 
135 static void	filt_sordetach(struct knote *kn);
136 static int	filt_soread(struct knote *kn, long hint);
137 static void	filt_sowdetach(struct knote *kn);
138 static int	filt_sowrite(struct knote *kn, long hint);
139 static int	filt_solisten(struct knote *kn, long hint);
140 
141 static struct filterops solisten_filtops =
142 	{ 1, NULL, filt_sordetach, filt_solisten };
143 static struct filterops soread_filtops =
144 	{ 1, NULL, filt_sordetach, filt_soread };
145 static struct filterops sowrite_filtops =
146 	{ 1, NULL, filt_sowdetach, filt_sowrite };
147 
148 uma_zone_t socket_zone;
149 so_gen_t	so_gencnt;	/* generation count for sockets */
150 
151 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
152 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
153 
154 SYSCTL_DECL(_kern_ipc);
155 
156 static int somaxconn = SOMAXCONN;
157 static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS);
158 /* XXX: we don't have SYSCTL_USHORT */
159 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
160     0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection "
161     "queue size");
162 static int numopensockets;
163 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
164     &numopensockets, 0, "Number of open sockets");
165 #ifdef ZERO_COPY_SOCKETS
166 /* These aren't static because they're used in other files. */
167 int so_zero_copy_send = 1;
168 int so_zero_copy_receive = 1;
169 SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
170     "Zero copy controls");
171 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
172     &so_zero_copy_receive, 0, "Enable zero copy receive");
173 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
174     &so_zero_copy_send, 0, "Enable zero copy send");
175 #endif /* ZERO_COPY_SOCKETS */
176 
177 /*
178  * accept_mtx locks down per-socket fields relating to accept queues.  See
179  * socketvar.h for an annotation of the protected fields of struct socket.
180  */
181 struct mtx accept_mtx;
182 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
183 
184 /*
185  * so_global_mtx protects the global so_gencnt counter, numopensockets,
186  * and the per-socket so_gencnt field.
187  */
188 static struct mtx so_global_mtx;
189 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_global", MTX_DEF);
190 
191 /*
192  * Socket operation routines.
193  * These routines are called by the routines in
194  * sys_socket.c or from a system process, and
195  * implement the semantics of socket operations by
196  * switching out to the protocol specific routines.
197  */
198 
199 /*
200  * Get a socket structure from our zone, and initialize it.
201  * Note that it would probably be better to allocate socket
202  * and PCB at the same time, but I'm not convinced that all
203  * the protocols can be easily modified to do this.
204  *
205  * soalloc() returns a socket with a ref count of 0.
206  */
207 struct socket *
208 soalloc(int mflags)
209 {
210 	struct socket *so;
211 
212 	so = uma_zalloc(socket_zone, mflags | M_ZERO);
213 	if (so != NULL) {
214 #ifdef MAC
215 		if (mac_init_socket(so, mflags) != 0) {
216 			uma_zfree(socket_zone, so);
217 			return (NULL);
218 		}
219 #endif
220 		SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
221 		SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
222 		TAILQ_INIT(&so->so_aiojobq);
223 		mtx_lock(&so_global_mtx);
224 		so->so_gencnt = ++so_gencnt;
225 		++numopensockets;
226 		mtx_unlock(&so_global_mtx);
227 	}
228 	return (so);
229 }
230 
231 /*
232  * socreate returns a socket with a ref count of 1.  The socket should be
233  * closed with soclose().
234  */
235 int
236 socreate(dom, aso, type, proto, cred, td)
237 	int dom;
238 	struct socket **aso;
239 	int type;
240 	int proto;
241 	struct ucred *cred;
242 	struct thread *td;
243 {
244 	struct protosw *prp;
245 	struct socket *so;
246 	int error;
247 
248 	if (proto)
249 		prp = pffindproto(dom, proto, type);
250 	else
251 		prp = pffindtype(dom, type);
252 
253 	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
254 	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
255 		return (EPROTONOSUPPORT);
256 
257 	if (jailed(cred) && jail_socket_unixiproute_only &&
258 	    prp->pr_domain->dom_family != PF_LOCAL &&
259 	    prp->pr_domain->dom_family != PF_INET &&
260 	    prp->pr_domain->dom_family != PF_ROUTE) {
261 		return (EPROTONOSUPPORT);
262 	}
263 
264 	if (prp->pr_type != type)
265 		return (EPROTOTYPE);
266 	so = soalloc(M_WAITOK);
267 	if (so == NULL)
268 		return (ENOBUFS);
269 
270 	TAILQ_INIT(&so->so_incomp);
271 	TAILQ_INIT(&so->so_comp);
272 	so->so_type = type;
273 	so->so_cred = crhold(cred);
274 	so->so_proto = prp;
275 #ifdef MAC
276 	mac_create_socket(cred, so);
277 #endif
278 	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
279 	    NULL, NULL, NULL);
280 	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
281 	    NULL, NULL, NULL);
282 	so->so_count = 1;
283 	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
284 	if (error) {
285 		ACCEPT_LOCK();
286 		SOCK_LOCK(so);
287 		so->so_state |= SS_NOFDREF;
288 		sorele(so);
289 		return (error);
290 	}
291 	*aso = so;
292 	return (0);
293 }
294 
295 int
296 sobind(so, nam, td)
297 	struct socket *so;
298 	struct sockaddr *nam;
299 	struct thread *td;
300 {
301 
302 	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
303 }
304 
305 void
306 sodealloc(struct socket *so)
307 {
308 
309 	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
310 	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
311 
312 	mtx_lock(&so_global_mtx);
313 	so->so_gencnt = ++so_gencnt;
314 	mtx_unlock(&so_global_mtx);
315 	if (so->so_rcv.sb_hiwat)
316 		(void)chgsbsize(so->so_cred->cr_uidinfo,
317 		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
318 	if (so->so_snd.sb_hiwat)
319 		(void)chgsbsize(so->so_cred->cr_uidinfo,
320 		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
321 #ifdef INET
322 	/* Remove accept filter if one is present. */
323 	if (so->so_accf != NULL)
324 		do_setopt_accept_filter(so, NULL);
325 #endif
326 #ifdef MAC
327 	mac_destroy_socket(so);
328 #endif
329 	crfree(so->so_cred);
330 	SOCKBUF_LOCK_DESTROY(&so->so_snd);
331 	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
332 	uma_zfree(socket_zone, so);
333 	mtx_lock(&so_global_mtx);
334 	--numopensockets;
335 	mtx_unlock(&so_global_mtx);
336 }
337 
338 /*
339  * solisten() transitions a socket from a non-listening state to a listening
340  * state, but can also be used to update the listen queue depth on an
341  * existing listen socket.  The protocol will call back into the sockets
342  * layer using solisten_proto_check() and solisten_proto() to check and set
343  * socket-layer listen state.  Call backs are used so that the protocol can
344  * acquire both protocol and socket layer locks in whatever order is required
345  * by the protocol.
346  *
347  * Protocol implementors are advised to hold the socket lock across the
348  * socket-layer test and set to avoid races at the socket layer.
349  */
350 int
351 solisten(so, backlog, td)
352 	struct socket *so;
353 	int backlog;
354 	struct thread *td;
355 {
356 
357 	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
358 }
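
/*
 * Example (editorial sketch): the usual server-side setup sequence,
 * assuming "sa" is a filled-in sockaddr and "td" the current thread:
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error == 0)
 *		error = sobind(so, sa, td);
 *	if (error == 0)
 *		error = solisten(so, 128, td);
 */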
359 
360 int
361 solisten_proto_check(so)
362 	struct socket *so;
363 {
364 
365 	SOCK_LOCK_ASSERT(so);
366 
367 	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
368 	    SS_ISDISCONNECTING))
369 		return (EINVAL);
370 	return (0);
371 }
372 
373 void
374 solisten_proto(so, backlog)
375 	struct socket *so;
376 	int backlog;
377 {
378 
379 	SOCK_LOCK_ASSERT(so);
380 
381 	if (backlog < 0 || backlog > somaxconn)
382 		backlog = somaxconn;
383 	so->so_qlimit = backlog;
384 	so->so_options |= SO_ACCEPTCONN;
385 }
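
/*
 * Editorial sketch of how a protocol's pru_listen method is expected to
 * use the two callbacks above; the locking shown is illustrative, as
 * real protocols interleave their own locks in the order described in
 * the comment before solisten():
 *
 *	SOCK_LOCK(so);
 *	error = solisten_proto_check(so);
 *	if (error == 0)
 *		solisten_proto(so, backlog);
 *	SOCK_UNLOCK(so);
 *	return (error);
 */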
386 
387 /*
388  * Attempt to free a socket.  This should really be sotryfree().
389  *
390  * sofree() will succeed if:
391  *
392  * - There are no outstanding file descriptor references or related consumers
393  *   (so_count == 0).
394  *
395  * - The socket has been closed by user space, if ever open (SS_NOFDREF).
396  *
397  * - The protocol does not have an outstanding strong reference on the socket
398  *   (SS_PROTOREF).
399  *
400  * Otherwise, it will quietly abort so that a future call to sofree(), when
401  * conditions are right, can succeed.
402  */
403 void
404 sofree(so)
405 	struct socket *so;
406 {
407 	struct socket *head;
408 
409 	ACCEPT_LOCK_ASSERT();
410 	SOCK_LOCK_ASSERT(so);
411 
412 	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
413 	    (so->so_state & SS_PROTOREF)) {
414 		SOCK_UNLOCK(so);
415 		ACCEPT_UNLOCK();
416 		return;
417 	}
418 
419 	head = so->so_head;
420 	if (head != NULL) {
421 		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
422 		    (so->so_qstate & SQ_INCOMP) != 0,
423 		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
424 		    "SQ_INCOMP"));
425 		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
426 		    (so->so_qstate & SQ_INCOMP) == 0,
427 		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
428 		/*
429 		 * accept(2) is responsible for draining the completed
430 		 * connection queue and freeing those sockets, so
431 		 * we just return here if this socket is currently
432 		 * on the completed connection queue.  Otherwise,
433 		 * accept(2) may hang after select(2) has indicated
434 		 * that a listening socket was ready.  If it's an
435 		 * incomplete connection, we remove it from the queue
436 		 * and free it; otherwise, it won't be released until
437 		 * the listening socket is closed.
438 		 */
439 		if ((so->so_qstate & SQ_COMP) != 0) {
440 			SOCK_UNLOCK(so);
441 			ACCEPT_UNLOCK();
442 			return;
443 		}
444 		TAILQ_REMOVE(&head->so_incomp, so, so_list);
445 		head->so_incqlen--;
446 		so->so_qstate &= ~SQ_INCOMP;
447 		so->so_head = NULL;
448 	}
449 	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
450 	    (so->so_qstate & SQ_INCOMP) == 0,
451 	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
452 	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
453 	SOCK_UNLOCK(so);
454 	ACCEPT_UNLOCK();
455 
456 	SOCKBUF_LOCK(&so->so_snd);
457 	so->so_snd.sb_flags |= SB_NOINTR;
458 	(void)sblock(&so->so_snd, M_WAITOK);
459 	/*
460 	 * socantsendmore_locked() drops the socket buffer mutex so that it
461 	 * can safely perform wakeups.  Re-acquire the mutex before
462 	 * continuing.
463 	 */
464 	socantsendmore_locked(so);
465 	SOCKBUF_LOCK(&so->so_snd);
466 	sbunlock(&so->so_snd);
467 	sbrelease_locked(&so->so_snd, so);
468 	SOCKBUF_UNLOCK(&so->so_snd);
469 	sorflush(so);
470 	knlist_destroy(&so->so_rcv.sb_sel.si_note);
471 	knlist_destroy(&so->so_snd.sb_sel.si_note);
472 	sodealloc(so);
473 }
474 
475 /*
476  * Close a socket on last file table reference removal.
477  * Initiate disconnect if connected.
478  * Free socket when disconnect complete.
479  *
480  * This function will sorele() the socket.  Note that soclose() may be
481  * called prior to the ref count reaching zero.  The actual socket
482  * structure will not be freed until the ref count reaches zero.
483  */
484 int
485 soclose(so)
486 	struct socket *so;
487 {
488 	int error = 0;
489 
490 	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
491 
492 	funsetown(&so->so_sigio);
493 	if (so->so_options & SO_ACCEPTCONN) {
494 		struct socket *sp;
495 		ACCEPT_LOCK();
496 		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
497 			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
498 			so->so_incqlen--;
499 			sp->so_qstate &= ~SQ_INCOMP;
500 			sp->so_head = NULL;
501 			ACCEPT_UNLOCK();
502 			soabort(sp);
503 			ACCEPT_LOCK();
504 		}
505 		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
506 			TAILQ_REMOVE(&so->so_comp, sp, so_list);
507 			so->so_qlen--;
508 			sp->so_qstate &= ~SQ_COMP;
509 			sp->so_head = NULL;
510 			ACCEPT_UNLOCK();
511 			soabort(sp);
512 			ACCEPT_LOCK();
513 		}
514 		ACCEPT_UNLOCK();
515 	}
516 	if (so->so_state & SS_ISCONNECTED) {
517 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
518 			error = sodisconnect(so);
519 			if (error)
520 				goto drop;
521 		}
522 		if (so->so_options & SO_LINGER) {
523 			if ((so->so_state & SS_ISDISCONNECTING) &&
524 			    (so->so_state & SS_NBIO))
525 				goto drop;
526 			while (so->so_state & SS_ISCONNECTED) {
527 				error = tsleep(&so->so_timeo,
528 				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
529 				if (error)
530 					break;
531 			}
532 		}
533 	}
534 
535 drop:
536 	(*so->so_proto->pr_usrreqs->pru_detach)(so);
537 	ACCEPT_LOCK();
538 	SOCK_LOCK(so);
539 	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
540 	so->so_state |= SS_NOFDREF;
541 	sorele(so);
542 	return (error);
543 }
544 
545 /*
546  * soabort() allows the socket code or protocol code to detach a socket that
547  * has been in an incomplete or completed listen queue, but has not yet been
548  * accepted.
549  *
550  * This interface is tricky, because it is called on an unreferenced socket,
551  * and must be called only by a thread that has actually removed the socket
552  * from the listen queue it was on, or races with other threads are risked.
553  *
554  * This interface will call into the protocol code, so must not be called
555  * with any socket locks held.  Protocols do call it while holding their own
556  * recursible protocol mutexes, but this is something that should be subject
557  * to review in the future.
558  *
559  * XXXRW: Why do we maintain a distinction between pru_abort() and
560  * pru_detach()?
561  */
562 void
563 soabort(so)
564 	struct socket *so;
565 {
566 
567 	/*
568 	 * In as much as is possible, assert that no references to this
569 	 * socket are held.  This is not quite the same as asserting that the
570 	 * current thread is responsible for arranging for no references, but
571 	 * is as close as we can get for now.
572 	 */
573 	KASSERT(so->so_count == 0, ("soabort: so_count"));
574 	KASSERT(!(so->so_state & SS_PROTOREF), ("soabort: SS_PROTOREF"));
575 	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
576 
577 	(*so->so_proto->pr_usrreqs->pru_abort)(so);
578 	ACCEPT_LOCK();
579 	SOCK_LOCK(so);
580 	sofree(so);
581 }
582 
583 int
584 soaccept(so, nam)
585 	struct socket *so;
586 	struct sockaddr **nam;
587 {
588 	int error;
589 
590 	SOCK_LOCK(so);
591 	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
592 	so->so_state &= ~SS_NOFDREF;
593 	SOCK_UNLOCK(so);
594 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
595 	return (error);
596 }
597 
598 int
599 soconnect(so, nam, td)
600 	struct socket *so;
601 	struct sockaddr *nam;
602 	struct thread *td;
603 {
604 	int error;
605 
606 	if (so->so_options & SO_ACCEPTCONN)
607 		return (EOPNOTSUPP);
608 	/*
609 	 * If protocol is connection-based, can only connect once.
610 	 * Otherwise, if connected, try to disconnect first.
611 	 * This allows user to disconnect by connecting to, e.g.,
612 	 * a null address.
613 	 */
614 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
615 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
616 	    (error = sodisconnect(so)))) {
617 		error = EISCONN;
618 	} else {
619 		/*
620 		 * Prevent accumulated error from previous connection
621 		 * from biting us.
622 		 */
623 		so->so_error = 0;
624 		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
625 	}
626 
627 	return (error);
628 }
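
/*
 * Editorial sketch: soconnect() only initiates a connection; callers
 * that must wait for completion typically sleep on so_timeo, roughly as
 * the connect(2) system call does:
 *
 *	error = soconnect(so, sa, td);
 *	if (error == 0) {
 *		SOCK_LOCK(so);
 *		while ((so->so_state & SS_ISCONNECTING) &&
 *		    so->so_error == 0)
 *			error = msleep(&so->so_timeo, SOCK_MTX(so),
 *			    PSOCK | PCATCH, "connec", 0);
 *		if (error == 0) {
 *			error = so->so_error;
 *			so->so_error = 0;
 *		}
 *		SOCK_UNLOCK(so);
 *	}
 */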
629 
630 int
631 soconnect2(so1, so2)
632 	struct socket *so1;
633 	struct socket *so2;
634 {
635 
636 	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
637 }
638 
639 int
640 sodisconnect(so)
641 	struct socket *so;
642 {
643 	int error;
644 
645 	if ((so->so_state & SS_ISCONNECTED) == 0)
646 		return (ENOTCONN);
647 	if (so->so_state & SS_ISDISCONNECTING)
648 		return (EALREADY);
649 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
650 	return (error);
651 }
652 
653 #ifdef ZERO_COPY_SOCKETS
654 struct so_zerocopy_stats{
655 	int size_ok;
656 	int align_ok;
657 	int found_ifp;
658 };
659 struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
660 #include <netinet/in.h>
661 #include <net/route.h>
662 #include <netinet/in_pcb.h>
663 #include <vm/vm.h>
664 #include <vm/vm_page.h>
665 #include <vm/vm_object.h>
666 #endif /*ZERO_COPY_SOCKETS*/
667 
668 /*
669  * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
670  * all of the data referenced by the uio.  If desired, it uses zero-copy.
671  * *space will be updated to reflect data copied in.
672  *
673  * NB: If atomic I/O is requested, the caller must already have checked that
674  * space can hold resid bytes.
675  *
676  * NB: In the event of an error, the caller may need to free the partial
677  * chain pointed to by *mpp.  The contents of both *uio and *space may be
678  * modified even in the case of an error.
679  */
680 static int
681 sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
682     int flags)
683 {
684 	struct mbuf *m, **mp, *top;
685 	long len, resid;
686 	int error;
687 #ifdef ZERO_COPY_SOCKETS
688 	int cow_send;
689 #endif
690 
691 	*retmp = top = NULL;
692 	mp = &top;
693 	len = 0;
694 	resid = uio->uio_resid;
695 	error = 0;
696 	do {
697 #ifdef ZERO_COPY_SOCKETS
698 		cow_send = 0;
699 #endif /* ZERO_COPY_SOCKETS */
700 		if (resid >= MINCLSIZE) {
701 #ifdef ZERO_COPY_SOCKETS
702 			if (top == NULL) {
703 				MGETHDR(m, M_TRYWAIT, MT_DATA);
704 				if (m == NULL) {
705 					error = ENOBUFS;
706 					goto out;
707 				}
708 				m->m_pkthdr.len = 0;
709 				m->m_pkthdr.rcvif = NULL;
710 			} else {
711 				MGET(m, M_TRYWAIT, MT_DATA);
712 				if (m == NULL) {
713 					error = ENOBUFS;
714 					goto out;
715 				}
716 			}
717 			if (so_zero_copy_send &&
718 			    resid>=PAGE_SIZE &&
719 			    *space>=PAGE_SIZE &&
720 			    uio->uio_iov->iov_len>=PAGE_SIZE) {
721 				so_zerocp_stats.size_ok++;
722 				so_zerocp_stats.align_ok++;
723 				cow_send = socow_setup(m, uio);
724 				len = cow_send;
725 			}
726 			if (!cow_send) {
727 				MCLGET(m, M_TRYWAIT);
728 				if ((m->m_flags & M_EXT) == 0) {
729 					m_free(m);
730 					m = NULL;
731 				} else {
732 					len = min(min(MCLBYTES, resid),
733 					    *space);
734 				}
735 			}
736 #else /* ZERO_COPY_SOCKETS */
737 			if (top == NULL) {
738 				m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
739 				m->m_pkthdr.len = 0;
740 				m->m_pkthdr.rcvif = NULL;
741 			} else
742 				m = m_getcl(M_TRYWAIT, MT_DATA, 0);
743 			len = min(min(MCLBYTES, resid), *space);
744 #endif /* ZERO_COPY_SOCKETS */
745 		} else {
746 			if (top == NULL) {
747 				m = m_gethdr(M_TRYWAIT, MT_DATA);
748 				m->m_pkthdr.len = 0;
749 				m->m_pkthdr.rcvif = NULL;
750 
751 				len = min(min(MHLEN, resid), *space);
752 				/*
753 				 * For datagram protocols, leave room
754 				 * for protocol headers in first mbuf.
755 				 */
756 				if (atomic && m && len < MHLEN)
757 					MH_ALIGN(m, len);
758 			} else {
759 				m = m_get(M_TRYWAIT, MT_DATA);
760 				len = min(min(MLEN, resid), *space);
761 			}
762 		}
763 		if (m == NULL) {
764 			error = ENOBUFS;
765 			goto out;
766 		}
767 
768 		*space -= len;
769 #ifdef ZERO_COPY_SOCKETS
770 		if (cow_send)
771 			error = 0;
772 		else
773 #endif /* ZERO_COPY_SOCKETS */
774 		error = uiomove(mtod(m, void *), (int)len, uio);
775 		resid = uio->uio_resid;
776 		m->m_len = len;
777 		*mp = m;
778 		top->m_pkthdr.len += len;
779 		if (error)
780 			goto out;
781 		mp = &m->m_next;
782 		if (resid <= 0) {
783 			if (flags & MSG_EOR)
784 				top->m_flags |= M_EOR;
785 			break;
786 		}
787 	} while (*space > 0 && atomic);
788 out:
789 	*retmp = top;
790 	return (error);
791 }
792 
793 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
794 
795 int
796 sosend_dgram(so, addr, uio, top, control, flags, td)
797 	struct socket *so;
798 	struct sockaddr *addr;
799 	struct uio *uio;
800 	struct mbuf *top;
801 	struct mbuf *control;
802 	int flags;
803 	struct thread *td;
804 {
805 	long space, resid;
806 	int clen = 0, error, dontroute;
807 	int atomic = sosendallatonce(so) || top;
808 
809 	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
810 	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
811 	    ("sosend_dgram: !PR_ATOMIC"));
812 
813 	if (uio != NULL)
814 		resid = uio->uio_resid;
815 	else
816 		resid = top->m_pkthdr.len;
817 	/*
818 	 * In theory resid should be unsigned.
819 	 * However, space must be signed, as it might be less than 0
820 	 * if we over-committed, and we must use a signed comparison
821 	 * of space and resid.  On the other hand, a negative resid
822 	 * causes us to loop sending 0-length segments to the protocol.
823 	 *
824 	 * (Unlike sosend(), no MSG_EOR/SOCK_STREAM check is needed here:
825 	 * this path handles only SOCK_DGRAM sockets.)
826 	 */
827 	if (resid < 0) {
828 		error = EINVAL;
829 		goto out;
830 	}
831 
832 	dontroute =
833 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
834 	if (td != NULL)
835 		td->td_proc->p_stats->p_ru.ru_msgsnd++;
836 	if (control != NULL)
837 		clen = control->m_len;
838 
839 	SOCKBUF_LOCK(&so->so_snd);
840 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
841 		SOCKBUF_UNLOCK(&so->so_snd);
842 		error = EPIPE;
843 		goto out;
844 	}
845 	if (so->so_error) {
846 		error = so->so_error;
847 		so->so_error = 0;
848 		SOCKBUF_UNLOCK(&so->so_snd);
849 		goto out;
850 	}
851 	if ((so->so_state & SS_ISCONNECTED) == 0) {
852 		/*
853 		 * `sendto' and `sendmsg' are allowed on a connection-
854 		 * based socket if it supports implied connect.
855 		 * Return ENOTCONN if not connected and no address is
856 		 * supplied.
857 		 */
858 		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
859 		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
860 			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
861 			    !(resid == 0 && clen != 0)) {
862 				SOCKBUF_UNLOCK(&so->so_snd);
863 				error = ENOTCONN;
864 				goto out;
865 			}
866 		} else if (addr == NULL) {
867 			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
868 				error = ENOTCONN;
869 			else
870 				error = EDESTADDRREQ;
871 			SOCKBUF_UNLOCK(&so->so_snd);
872 			goto out;
873 		}
874 	}
875 
876 	/*
877 	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signedness here may be
878 	 * a problem and need fixing.
879 	 */
880 	space = sbspace(&so->so_snd);
881 	if (flags & MSG_OOB)
882 		space += 1024;
883 	space -= clen;
884 	SOCKBUF_UNLOCK(&so->so_snd);
885 	if (resid > space) {
886 		error = EMSGSIZE;
887 		goto out;
888 	}
889 	if (uio == NULL) {
890 		resid = 0;
891 		if (flags & MSG_EOR)
892 			top->m_flags |= M_EOR;
893 	} else {
894 		error = sosend_copyin(uio, &top, atomic, &space, flags);
895 		if (error)
896 			goto out;
897 		resid = uio->uio_resid;
898 	}
899 	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
900 	/*
901 	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
902 	 * than with.
903 	 */
904 	if (dontroute) {
905 		SOCK_LOCK(so);
906 		so->so_options |= SO_DONTROUTE;
907 		SOCK_UNLOCK(so);
908 	}
909 	/*
910 	 * XXX all the SBS_CANTSENDMORE checks previously
911 	 * done could be out of date.  We could have received
912 	 * a reset packet in an interrupt or maybe we slept
913 	 * while doing page faults in uiomove() etc. We could
914 	 * probably recheck again inside the locking protection
915 	 * here, but there are probably other places that this
916 	 * also happens.  We must rethink this.
917 	 */
918 	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
919 	    (flags & MSG_OOB) ? PRUS_OOB :
920 	/*
921 	 * If the user set MSG_EOF, the protocol
922 	 * understands this flag, and there is nothing left to
923 	 * send, then use PRUS_EOF instead of PRUS_SEND.
924 	 */
925 	    ((flags & MSG_EOF) &&
926 	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
927 	     (resid <= 0)) ?
928 		PRUS_EOF :
929 		/* If there is more to send set PRUS_MORETOCOME */
930 		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
931 		top, addr, control, td);
932 	if (dontroute) {
933 		SOCK_LOCK(so);
934 		so->so_options &= ~SO_DONTROUTE;
935 		SOCK_UNLOCK(so);
936 	}
937 	clen = 0;
938 	control = NULL;
939 	top = NULL;
940 out:
941 	if (top != NULL)
942 		m_freem(top);
943 	if (control != NULL)
944 		m_freem(control);
945 	return (error);
946 }
947 
948 /*
949  * Send on a socket.
950  * If send must go all at once and message is larger than
951  * send buffering, then hard error.
952  * Lock against other senders.
953  * If must go all at once and not enough room now, then
954  * inform user that this would block and do nothing.
955  * Otherwise, if nonblocking, send as much as possible.
956  * The data to be sent is described by "uio" if nonzero,
957  * otherwise by the mbuf chain "top" (which must be null
958  * if uio is not).  Data provided in mbuf chain must be small
959  * enough to send all at once.
960  *
961  * Returns nonzero on error, timeout or signal; callers
962  * must check for short counts if EINTR/ERESTART are returned.
963  * Data and control buffers are freed on return.
964  */
965 #define	snderr(errno)	{ error = (errno); goto release; }
966 int
967 sosend(so, addr, uio, top, control, flags, td)
968 	struct socket *so;
969 	struct sockaddr *addr;
970 	struct uio *uio;
971 	struct mbuf *top;
972 	struct mbuf *control;
973 	int flags;
974 	struct thread *td;
975 {
976 	long space, resid;
977 	int clen = 0, error, dontroute;
978 	int atomic = sosendallatonce(so) || top;
979 
980 	if (uio != NULL)
981 		resid = uio->uio_resid;
982 	else
983 		resid = top->m_pkthdr.len;
984 	/*
985 	 * In theory resid should be unsigned.
986 	 * However, space must be signed, as it might be less than 0
987 	 * if we over-committed, and we must use a signed comparison
988 	 * of space and resid.  On the other hand, a negative resid
989 	 * causes us to loop sending 0-length segments to the protocol.
990 	 *
991 	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
992 	 * type sockets since that's an error.
993 	 */
994 	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
995 		error = EINVAL;
996 		goto out;
997 	}
998 
999 	dontroute =
1000 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1001 	    (so->so_proto->pr_flags & PR_ATOMIC);
1002 	if (td != NULL)
1003 		td->td_proc->p_stats->p_ru.ru_msgsnd++;
1004 	if (control != NULL)
1005 		clen = control->m_len;
1006 
1007 	SOCKBUF_LOCK(&so->so_snd);
1008 restart:
1009 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1010 	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1011 	if (error)
1012 		goto out_locked;
1013 	do {
1014 		SOCKBUF_LOCK_ASSERT(&so->so_snd);
1015 		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
1016 			snderr(EPIPE);
1017 		if (so->so_error) {
1018 			error = so->so_error;
1019 			so->so_error = 0;
1020 			goto release;
1021 		}
1022 		if ((so->so_state & SS_ISCONNECTED) == 0) {
1023 			/*
1024 			 * `sendto' and `sendmsg' are allowed on a connection-
1025 			 * based socket if it supports implied connect.
1026 			 * Return ENOTCONN if not connected and no address is
1027 			 * supplied.
1028 			 */
1029 			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1030 			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1031 				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1032 				    !(resid == 0 && clen != 0))
1033 					snderr(ENOTCONN);
1034 			} else if (addr == NULL)
1035 			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
1036 				   ENOTCONN : EDESTADDRREQ);
1037 		}
1038 		space = sbspace(&so->so_snd);
1039 		if (flags & MSG_OOB)
1040 			space += 1024;
1041 		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1042 		    clen > so->so_snd.sb_hiwat)
1043 			snderr(EMSGSIZE);
1044 		if (space < resid + clen &&
1045 		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1046 			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
1047 				snderr(EWOULDBLOCK);
1048 			sbunlock(&so->so_snd);
1049 			error = sbwait(&so->so_snd);
1050 			if (error)
1051 				goto out_locked;
1052 			goto restart;
1053 		}
1054 		SOCKBUF_UNLOCK(&so->so_snd);
1055 		space -= clen;
1056 		do {
1057 			if (uio == NULL) {
1058 				resid = 0;
1059 				if (flags & MSG_EOR)
1060 					top->m_flags |= M_EOR;
1061 			} else {
1062 				error = sosend_copyin(uio, &top, atomic,
1063 				    &space, flags);
1064 				if (error != 0) {
1065 					SOCKBUF_LOCK(&so->so_snd);
1066 					goto release;
1067 				}
1068 				resid = uio->uio_resid;
1069 			}
1070 			if (dontroute) {
1071 				SOCK_LOCK(so);
1072 				so->so_options |= SO_DONTROUTE;
1073 				SOCK_UNLOCK(so);
1074 			}
1075 			/*
1076 			 * XXX all the SBS_CANTSENDMORE checks previously
1077 			 * done could be out of date.  We could have received
1078 			 * a reset packet in an interrupt or maybe we slept
1079 			 * while doing page faults in uiomove() etc. We could
1080 			 * probably recheck again inside the locking protection
1081 			 * here, but there are probably other places that this
1082 			 * also happens.  We must rethink this.
1083 			 */
1084 			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1085 			    (flags & MSG_OOB) ? PRUS_OOB :
1086 			/*
1087 			 * If the user set MSG_EOF, the protocol
1088 			 * understands this flag, and there is nothing left to
1089 			 * send, then use PRUS_EOF instead of PRUS_SEND.
1090 			 */
1091 			    ((flags & MSG_EOF) &&
1092 			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1093 			     (resid <= 0)) ?
1094 				PRUS_EOF :
1095 			/* If there is more to send set PRUS_MORETOCOME */
1096 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1097 			    top, addr, control, td);
1098 			if (dontroute) {
1099 				SOCK_LOCK(so);
1100 				so->so_options &= ~SO_DONTROUTE;
1101 				SOCK_UNLOCK(so);
1102 			}
1103 			clen = 0;
1104 			control = NULL;
1105 			top = NULL;
1106 			if (error) {
1107 				SOCKBUF_LOCK(&so->so_snd);
1108 				goto release;
1109 			}
1110 		} while (resid && space > 0);
1111 		SOCKBUF_LOCK(&so->so_snd);
1112 	} while (resid);
1113 
1114 release:
1115 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1116 	sbunlock(&so->so_snd);
1117 out_locked:
1118 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
1119 	SOCKBUF_UNLOCK(&so->so_snd);
1120 out:
1121 	if (top != NULL)
1122 		m_freem(top);
1123 	if (control != NULL)
1124 		m_freem(control);
1125 	return (error);
1126 }
1127 #undef snderr
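
/*
 * Editorial sketch: kernel callers describe the data for sosend() with
 * a struct uio; "buf" and "len" are assumed to name kernel memory:
 *
 *	struct iovec iov;
 *	struct uio auio;
 *
 *	iov.iov_base = buf;
 *	iov.iov_len = len;
 *	auio.uio_iov = &iov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = len;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_WRITE;
 *	auio.uio_td = td;
 *	error = sosend(so, NULL, &auio, NULL, NULL, 0, td);
 */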
1128 
1129 /*
1130  * The part of soreceive() that implements reading non-inline out-of-band
1131  * data from a socket.  For more complete comments, see soreceive(), from
1132  * which this code originated.
1133  *
1134  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1135  * unable to return an mbuf chain to the caller.
1136  */
1137 static int
1138 soreceive_rcvoob(so, uio, flags)
1139 	struct socket *so;
1140 	struct uio *uio;
1141 	int flags;
1142 {
1143 	struct protosw *pr = so->so_proto;
1144 	struct mbuf *m;
1145 	int error;
1146 
1147 	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1148 
1149 	m = m_get(M_TRYWAIT, MT_DATA);
1150 	if (m == NULL)
1151 		return (ENOBUFS);
1152 	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1153 	if (error)
1154 		goto bad;
1155 	do {
1156 #ifdef ZERO_COPY_SOCKETS
1157 		if (so_zero_copy_receive) {
1158 			int disposable;
1159 
1160 			if ((m->m_flags & M_EXT)
1161 			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1162 				disposable = 1;
1163 			else
1164 				disposable = 0;
1165 
1166 			error = uiomoveco(mtod(m, void *),
1167 					  min(uio->uio_resid, m->m_len),
1168 					  uio, disposable);
1169 		} else
1170 #endif /* ZERO_COPY_SOCKETS */
1171 		error = uiomove(mtod(m, void *),
1172 		    (int) min(uio->uio_resid, m->m_len), uio);
1173 		m = m_free(m);
1174 	} while (uio->uio_resid && error == 0 && m);
1175 bad:
1176 	if (m != NULL)
1177 		m_freem(m);
1178 	return (error);
1179 }
1180 
1181 /*
1182  * Following replacement or removal of the first mbuf on the first mbuf chain
1183  * of a socket buffer, push necessary state changes back into the socket
1184  * buffer so that other consumers see the values consistently.  'nextrecord'
1185  * is the caller's locally stored copy of the original value of
1186  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1187  * NOTE: 'nextrecord' may be NULL.
1188  */
1189 static __inline void
1190 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1191 {
1192 
1193 	SOCKBUF_LOCK_ASSERT(sb);
1194 	/*
1195 	 * First, update for the new value of nextrecord.  If necessary, make
1196 	 * it the first record.
1197 	 */
1198 	if (sb->sb_mb != NULL)
1199 		sb->sb_mb->m_nextpkt = nextrecord;
1200 	else
1201 		sb->sb_mb = nextrecord;
1202 
1203 	/*
1204 	 * Now update any dependent socket buffer fields to reflect the new
1205 	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1206 	 * addition of a second clause that takes care of the case where
1207 	 * sb_mb has been updated, but remains the last record.
1208 	 */
1209 	if (sb->sb_mb == NULL) {
1210 		sb->sb_mbtail = NULL;
1211 		sb->sb_lastrecord = NULL;
1212 	} else if (sb->sb_mb->m_nextpkt == NULL)
1213 		sb->sb_lastrecord = sb->sb_mb;
1214 }
1215 
1216 
1217 /*
1218  * Implement receive operations on a socket.
1219  * We depend on the way that records are added to the sockbuf
1220  * by sbappend*.  In particular, each record (mbufs linked through m_next)
1221  * must begin with an address if the protocol so specifies,
1222  * followed by an optional mbuf or mbufs containing ancillary data,
1223  * and then zero or more mbufs of data.
1224  * In order to avoid holding the socket buffer lock for the entire time
1225  * here, we release it while doing the actual copy to user space.
1226  * Although the sockbuf is locked, new data may still be appended,
1227  * and thus we must maintain consistency of the sockbuf during that time.
1228  *
1229  * The caller may receive the data as a single mbuf chain by supplying
1230  * an mbuf **mp0 for use in returning the chain.  The uio is then used
1231  * only for the count in uio_resid.
1232  */
1233 int
1234 soreceive(so, psa, uio, mp0, controlp, flagsp)
1235 	struct socket *so;
1236 	struct sockaddr **psa;
1237 	struct uio *uio;
1238 	struct mbuf **mp0;
1239 	struct mbuf **controlp;
1240 	int *flagsp;
1241 {
1242 	struct mbuf *m, **mp;
1243 	int flags, len, error, offset;
1244 	struct protosw *pr = so->so_proto;
1245 	struct mbuf *nextrecord;
1246 	int moff, type = 0;
1247 	int orig_resid = uio->uio_resid;
1248 
1249 	mp = mp0;
1250 	if (psa != NULL)
1251 		*psa = NULL;
1252 	if (controlp != NULL)
1253 		*controlp = NULL;
1254 	if (flagsp != NULL)
1255 		flags = *flagsp &~ MSG_EOR;
1256 	else
1257 		flags = 0;
1258 	if (flags & MSG_OOB)
1259 		return (soreceive_rcvoob(so, uio, flags));
1260 	if (mp != NULL)
1261 		*mp = NULL;
1262 	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1263 	    && uio->uio_resid)
1264 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1265 
1266 	SOCKBUF_LOCK(&so->so_rcv);
1267 restart:
1268 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1269 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1270 	if (error)
1271 		goto out;
1272 
1273 	m = so->so_rcv.sb_mb;
1274 	/*
1275 	 * If we have less data than requested, block awaiting more
1276 	 * (subject to any timeout) if:
1277 	 *   1. the current count is less than the low water mark, or
1278 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1279 	 *	receive operation at once if we block (resid <= hiwat), and
1280 	 *   3. MSG_DONTWAIT is not set.
1281 	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1282 	 * we have to do the receive in sections, and thus risk returning
1283 	 * a short count if a timeout or signal occurs after we start.
1284 	 */
1285 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1286 	    so->so_rcv.sb_cc < uio->uio_resid) &&
1287 	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1288 	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1289 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1290 		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1291 		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1292 		    m, so->so_rcv.sb_cc));
1293 		if (so->so_error) {
1294 			if (m != NULL)
1295 				goto dontblock;
1296 			error = so->so_error;
1297 			if ((flags & MSG_PEEK) == 0)
1298 				so->so_error = 0;
1299 			goto release;
1300 		}
1301 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1302 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1303 			if (m)
1304 				goto dontblock;
1305 			else
1306 				goto release;
1307 		}
1308 		for (; m != NULL; m = m->m_next)
1309 			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1310 				m = so->so_rcv.sb_mb;
1311 				goto dontblock;
1312 			}
1313 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1314 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1315 			error = ENOTCONN;
1316 			goto release;
1317 		}
1318 		if (uio->uio_resid == 0)
1319 			goto release;
1320 		if ((so->so_state & SS_NBIO) ||
1321 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1322 			error = EWOULDBLOCK;
1323 			goto release;
1324 		}
1325 		SBLASTRECORDCHK(&so->so_rcv);
1326 		SBLASTMBUFCHK(&so->so_rcv);
1327 		sbunlock(&so->so_rcv);
1328 		error = sbwait(&so->so_rcv);
1329 		if (error)
1330 			goto out;
1331 		goto restart;
1332 	}
1333 dontblock:
1334 	/*
1335 	 * From this point onward, we maintain 'nextrecord' as a cache of the
1336 	 * pointer to the next record in the socket buffer.  We must keep the
1337 	 * various socket buffer pointers and local stack versions of the
1338 	 * pointers in sync, pushing out modifications before dropping the
1339 	 * socket buffer mutex, and re-reading them when picking it up.
1340 	 *
1341 	 * Otherwise, we will race with the network stack appending new data
1342 	 * or records onto the socket buffer by using inconsistent/stale
1343 	 * versions of the field, possibly resulting in socket buffer
1344 	 * corruption.
1345 	 *
1346 	 * By holding the high-level sblock(), we prevent simultaneous
1347 	 * readers from pulling off the front of the socket buffer.
1348 	 */
1349 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1350 	if (uio->uio_td)
1351 		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1352 	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1353 	SBLASTRECORDCHK(&so->so_rcv);
1354 	SBLASTMBUFCHK(&so->so_rcv);
1355 	nextrecord = m->m_nextpkt;
1356 	if (pr->pr_flags & PR_ADDR) {
1357 		KASSERT(m->m_type == MT_SONAME,
1358 		    ("m->m_type == %d", m->m_type));
1359 		orig_resid = 0;
1360 		if (psa != NULL)
1361 			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1362 			    M_NOWAIT);
1363 		if (flags & MSG_PEEK) {
1364 			m = m->m_next;
1365 		} else {
1366 			sbfree(&so->so_rcv, m);
1367 			so->so_rcv.sb_mb = m_free(m);
1368 			m = so->so_rcv.sb_mb;
1369 			sockbuf_pushsync(&so->so_rcv, nextrecord);
1370 		}
1371 	}
1372 
1373 	/*
1374 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1375 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1376 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1377 	 * perform externalization (or freeing if controlp == NULL).
1378 	 */
1379 	if (m != NULL && m->m_type == MT_CONTROL) {
1380 		struct mbuf *cm = NULL, *cmn;
1381 		struct mbuf **cme = &cm;
1382 
1383 		do {
1384 			if (flags & MSG_PEEK) {
1385 				if (controlp != NULL) {
1386 					*controlp = m_copy(m, 0, m->m_len);
1387 					controlp = &(*controlp)->m_next;
1388 				}
1389 				m = m->m_next;
1390 			} else {
1391 				sbfree(&so->so_rcv, m);
1392 				so->so_rcv.sb_mb = m->m_next;
1393 				m->m_next = NULL;
1394 				*cme = m;
1395 				cme = &(*cme)->m_next;
1396 				m = so->so_rcv.sb_mb;
1397 			}
1398 		} while (m != NULL && m->m_type == MT_CONTROL);
1399 		if ((flags & MSG_PEEK) == 0)
1400 			sockbuf_pushsync(&so->so_rcv, nextrecord);
1401 		while (cm != NULL) {
1402 			cmn = cm->m_next;
1403 			cm->m_next = NULL;
1404 			if (pr->pr_domain->dom_externalize != NULL) {
1405 				SOCKBUF_UNLOCK(&so->so_rcv);
1406 				error = (*pr->pr_domain->dom_externalize)
1407 				    (cm, controlp);
1408 				SOCKBUF_LOCK(&so->so_rcv);
1409 			} else if (controlp != NULL)
1410 				*controlp = cm;
1411 			else
1412 				m_freem(cm);
1413 			if (controlp != NULL) {
1414 				orig_resid = 0;
1415 				while (*controlp != NULL)
1416 					controlp = &(*controlp)->m_next;
1417 			}
1418 			cm = cmn;
1419 		}
1420 		if (so->so_rcv.sb_mb)
1421 			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1422 		else
1423 			nextrecord = NULL;
1424 		orig_resid = 0;
1425 	}
1426 	if (m != NULL) {
1427 		if ((flags & MSG_PEEK) == 0) {
1428 			KASSERT(m->m_nextpkt == nextrecord,
1429 			    ("soreceive: post-control, nextrecord !sync"));
1430 			if (nextrecord == NULL) {
1431 				KASSERT(so->so_rcv.sb_mb == m,
1432 				    ("soreceive: post-control, sb_mb!=m"));
1433 				KASSERT(so->so_rcv.sb_lastrecord == m,
1434 				    ("soreceive: post-control, lastrecord!=m"));
1435 			}
1436 		}
1437 		type = m->m_type;
1438 		if (type == MT_OOBDATA)
1439 			flags |= MSG_OOB;
1440 	} else {
1441 		if ((flags & MSG_PEEK) == 0) {
1442 			KASSERT(so->so_rcv.sb_mb == nextrecord,
1443 			    ("soreceive: sb_mb != nextrecord"));
1444 			if (so->so_rcv.sb_mb == NULL) {
1445 				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1446 				    ("soreceive: sb_lastercord != NULL"));
1447 			}
1448 		}
1449 	}
1450 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1451 	SBLASTRECORDCHK(&so->so_rcv);
1452 	SBLASTMBUFCHK(&so->so_rcv);
1453 
1454 	/*
1455 	 * Now continue to read any data mbufs off of the head of the socket
1456 	 * buffer until the read request is satisfied.  Note that 'type' is
1457 	 * used to store the type of any mbuf reads that have happened so far
1458 	 * such that soreceive() can stop reading if the type changes, which
1459 	 * causes soreceive() to return only one of regular data and inline
1460 	 * out-of-band data in a single socket receive operation.
1461 	 */
1462 	moff = 0;
1463 	offset = 0;
1464 	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1465 		/*
1466 		 * If the type of mbuf has changed since the last mbuf
1467 		 * examined ('type'), end the receive operation.
1468 	 	 */
1469 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1470 		if (m->m_type == MT_OOBDATA) {
1471 			if (type != MT_OOBDATA)
1472 				break;
1473 		} else if (type == MT_OOBDATA)
1474 			break;
1475 		else
1476 		    KASSERT(m->m_type == MT_DATA,
1477 			("m->m_type == %d", m->m_type));
1478 		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1479 		len = uio->uio_resid;
1480 		if (so->so_oobmark && len > so->so_oobmark - offset)
1481 			len = so->so_oobmark - offset;
1482 		if (len > m->m_len - moff)
1483 			len = m->m_len - moff;
1484 		/*
1485 		 * If mp is set, just pass back the mbufs.
1486 		 * Otherwise copy them out via the uio, then free.
1487 		 * The sockbuf must be consistent here (sb_mb points to the
1488 		 * current mbuf, m_nextpkt to the next record) when we drop
1489 		 * the lock; we must note any additions to the sockbuf when
1490 		 * we reacquire it.
1491 		 */
1492 		if (mp == NULL) {
1493 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1494 			SBLASTRECORDCHK(&so->so_rcv);
1495 			SBLASTMBUFCHK(&so->so_rcv);
1496 			SOCKBUF_UNLOCK(&so->so_rcv);
1497 #ifdef ZERO_COPY_SOCKETS
1498 			if (so_zero_copy_receive) {
1499 				int disposable;
1500 
1501 				if ((m->m_flags & M_EXT)
1502 				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1503 					disposable = 1;
1504 				else
1505 					disposable = 0;
1506 
1507 				error = uiomoveco(mtod(m, char *) + moff,
1508 						  (int)len, uio,
1509 						  disposable);
1510 			} else
1511 #endif /* ZERO_COPY_SOCKETS */
1512 			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1513 			SOCKBUF_LOCK(&so->so_rcv);
1514 			if (error)
1515 				goto release;
1516 		} else
1517 			uio->uio_resid -= len;
1518 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1519 		if (len == m->m_len - moff) {
1520 			if (m->m_flags & M_EOR)
1521 				flags |= MSG_EOR;
1522 			if (flags & MSG_PEEK) {
1523 				m = m->m_next;
1524 				moff = 0;
1525 			} else {
1526 				nextrecord = m->m_nextpkt;
1527 				sbfree(&so->so_rcv, m);
1528 				if (mp != NULL) {
1529 					*mp = m;
1530 					mp = &m->m_next;
1531 					so->so_rcv.sb_mb = m = m->m_next;
1532 					*mp = NULL;
1533 				} else {
1534 					so->so_rcv.sb_mb = m_free(m);
1535 					m = so->so_rcv.sb_mb;
1536 				}
1537 				sockbuf_pushsync(&so->so_rcv, nextrecord);
1538 				SBLASTRECORDCHK(&so->so_rcv);
1539 				SBLASTMBUFCHK(&so->so_rcv);
1540 			}
1541 		} else {
1542 			if (flags & MSG_PEEK)
1543 				moff += len;
1544 			else {
1545 				if (mp != NULL) {
1546 					int copy_flag;
1547 
1548 					if (flags & MSG_DONTWAIT)
1549 						copy_flag = M_DONTWAIT;
1550 					else
1551 						copy_flag = M_TRYWAIT;
1552 					if (copy_flag == M_TRYWAIT)
1553 						SOCKBUF_UNLOCK(&so->so_rcv);
1554 					*mp = m_copym(m, 0, len, copy_flag);
1555 					if (copy_flag == M_TRYWAIT)
1556 						SOCKBUF_LOCK(&so->so_rcv);
1557 					if (*mp == NULL) {
1558 						/*
1559 						 * m_copym() couldn't allocate an mbuf.
1560 						 * Adjust uio_resid back (it was adjusted
1561 						 * down by len bytes, which we didn't end
1562 						 * up "copying" over).
1563 						 */
1564 						uio->uio_resid += len;
1565 						break;
1566 					}
1567 				}
1568 				m->m_data += len;
1569 				m->m_len -= len;
1570 				so->so_rcv.sb_cc -= len;
1571 			}
1572 		}
1573 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1574 		if (so->so_oobmark) {
1575 			if ((flags & MSG_PEEK) == 0) {
1576 				so->so_oobmark -= len;
1577 				if (so->so_oobmark == 0) {
1578 					so->so_rcv.sb_state |= SBS_RCVATMARK;
1579 					break;
1580 				}
1581 			} else {
1582 				offset += len;
1583 				if (offset == so->so_oobmark)
1584 					break;
1585 			}
1586 		}
1587 		if (flags & MSG_EOR)
1588 			break;
1589 		/*
1590 		 * If the MSG_WAITALL flag is set (for non-atomic socket),
1591 		 * we must not quit until "uio->uio_resid == 0" or an error
1592 		 * termination.  If a signal/timeout occurs, return
1593 		 * with a short count but without error.
1594 		 * Keep sockbuf locked against other readers.
1595 		 */
1596 		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1597 		    !sosendallatonce(so) && nextrecord == NULL) {
1598 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1599 			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1600 				break;
1601 			/*
1602 			 * Notify the protocol that some data has been
1603 			 * drained before blocking.
1604 			 */
1605 			if (pr->pr_flags & PR_WANTRCVD) {
1606 				SOCKBUF_UNLOCK(&so->so_rcv);
1607 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1608 				SOCKBUF_LOCK(&so->so_rcv);
1609 			}
1610 			SBLASTRECORDCHK(&so->so_rcv);
1611 			SBLASTMBUFCHK(&so->so_rcv);
1612 			error = sbwait(&so->so_rcv);
1613 			if (error)
1614 				goto release;
1615 			m = so->so_rcv.sb_mb;
1616 			if (m != NULL)
1617 				nextrecord = m->m_nextpkt;
1618 		}
1619 	}
1620 
1621 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1622 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1623 		flags |= MSG_TRUNC;
1624 		if ((flags & MSG_PEEK) == 0)
1625 			(void) sbdroprecord_locked(&so->so_rcv);
1626 	}
1627 	if ((flags & MSG_PEEK) == 0) {
1628 		if (m == NULL) {
1629 			/*
1630 			 * First part is an inline SB_EMPTY_FIXUP().  Second
1631 			 * part makes sure sb_lastrecord is up-to-date if
1632 			 * there is still data in the socket buffer.
1633 			 */
1634 			so->so_rcv.sb_mb = nextrecord;
1635 			if (so->so_rcv.sb_mb == NULL) {
1636 				so->so_rcv.sb_mbtail = NULL;
1637 				so->so_rcv.sb_lastrecord = NULL;
1638 			} else if (nextrecord->m_nextpkt == NULL)
1639 				so->so_rcv.sb_lastrecord = nextrecord;
1640 		}
1641 		SBLASTRECORDCHK(&so->so_rcv);
1642 		SBLASTMBUFCHK(&so->so_rcv);
1643 		/*
1644 		 * If soreceive() is being done from the socket callback, then
1645 		 * don't need to generate ACK to peer to update window, since
1646 		 * ACK will be generated on return to TCP.
1647 		 */
1648 		if (!(flags & MSG_SOCALLBCK) &&
1649 		    (pr->pr_flags & PR_WANTRCVD)) {
1650 			SOCKBUF_UNLOCK(&so->so_rcv);
1651 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1652 			SOCKBUF_LOCK(&so->so_rcv);
1653 		}
1654 	}
1655 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1656 	if (orig_resid == uio->uio_resid && orig_resid &&
1657 	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1658 		sbunlock(&so->so_rcv);
1659 		goto restart;
1660 	}
1661 
1662 	if (flagsp != NULL)
1663 		*flagsp |= flags;
1664 release:
1665 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1666 	sbunlock(&so->so_rcv);
1667 out:
1668 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1669 	SOCKBUF_UNLOCK(&so->so_rcv);
1670 	return (error);
1671 }
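
/*
 * Editorial sketch: the mirror image of the sosend() example earlier;
 * MSG_WAITALL asks soreceive() to block until the full "len" bytes
 * arrive, subject to the caveats in the comments above:
 *
 *	struct iovec iov;
 *	struct uio auio;
 *	int flags = MSG_WAITALL;
 *
 *	iov.iov_base = buf;
 *	iov.iov_len = len;
 *	auio.uio_iov = &iov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = len;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_READ;
 *	auio.uio_td = td;
 *	error = soreceive(so, NULL, &auio, NULL, NULL, &flags);
 */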
1672 
1673 int
1674 soshutdown(so, how)
1675 	struct socket *so;
1676 	int how;
1677 {
1678 	struct protosw *pr = so->so_proto;
1679 
1680 	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1681 		return (EINVAL);
1682 
1683 	if (how != SHUT_WR)
1684 		sorflush(so);
1685 	if (how != SHUT_RD)
1686 		return ((*pr->pr_usrreqs->pru_shutdown)(so));
1687 	return (0);
1688 }
1689 
1690 void
1691 sorflush(so)
1692 	struct socket *so;
1693 {
1694 	struct sockbuf *sb = &so->so_rcv;
1695 	struct protosw *pr = so->so_proto;
1696 	struct sockbuf asb;
1697 
1698 	/*
1699 	 * XXXRW: This is quite ugly.  Previously, this code made a copy of
1700 	 * the socket buffer, then zero'd the original to clear the buffer
1701 	 * fields.  However, with mutexes in the socket buffer, this causes
1702 	 * problems.  We only clear the zeroable bits of the original;
1703 	 * however, we have to initialize and destroy the mutex in the copy
1704 	 * so that dom_dispose() and sbrelease() can lock t as needed.
1705 	 */
1706 	SOCKBUF_LOCK(sb);
1707 	sb->sb_flags |= SB_NOINTR;
1708 	(void) sblock(sb, M_WAITOK);
1709 	/*
1710 	 * socantrcvmore_locked() drops the socket buffer mutex so that it
1711 	 * can safely perform wakeups.  Re-acquire the mutex before
1712 	 * continuing.
1713 	 */
1714 	socantrcvmore_locked(so);
1715 	SOCKBUF_LOCK(sb);
1716 	sbunlock(sb);
1717 	/*
1718 	 * Invalidate/clear most of the sockbuf structure, but leave
1719 	 * selinfo and mutex data unchanged.
1720 	 */
1721 	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1722 	bcopy(&sb->sb_startzero, &asb.sb_startzero,
1723 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1724 	bzero(&sb->sb_startzero,
1725 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1726 	SOCKBUF_UNLOCK(sb);
1727 
1728 	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1729 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1730 		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
1731 	sbrelease(&asb, so);
1732 	SOCKBUF_LOCK_DESTROY(&asb);
1733 }
1734 
1735 /*
1736  * Perhaps this routine, and sooptcopyout(), below, ought to come in
1737  * an additional variant to handle the case where the option value needs
1738  * to be some kind of integer, but not a specific size.
1739  * In addition to their use here, these functions are also called by the
1740  * protocol-level pr_ctloutput() routines.
1741  */
1742 int
1743 sooptcopyin(sopt, buf, len, minlen)
1744 	struct	sockopt *sopt;
1745 	void	*buf;
1746 	size_t	len;
1747 	size_t	minlen;
1748 {
1749 	size_t	valsize;
1750 
1751 	/*
1752 	 * If the user gives us more than we wanted, we ignore it,
1753 	 * but if we don't get the minimum length the caller
1754 	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
1755 	 * is set to however much we actually retrieved.
1756 	 */
1757 	if ((valsize = sopt->sopt_valsize) < minlen)
1758 		return (EINVAL);
1759 	if (valsize > len)
1760 		sopt->sopt_valsize = valsize = len;
1761 
1762 	if (sopt->sopt_td != NULL)
1763 		return (copyin(sopt->sopt_val, buf, valsize));
1764 
1765 	bcopy(sopt->sopt_val, buf, valsize);
1766 	return (0);
1767 }
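
/*
 * Hypothetical sketch of the common calling pattern from a protocol's
 * pr_ctloutput() routine, fetching a fixed-size integer option value:
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error != 0)
 *		return (error);
 *	(validate and apply optval)
 */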
1768 
1769 /*
1770  * Kernel version of setsockopt(2).
1771  * XXX: optlen is size_t, not socklen_t
1772  */
1773 int
1774 so_setsockopt(struct socket *so, int level, int optname, void *optval,
1775     size_t optlen)
1776 {
1777 	struct sockopt sopt;
1778 
1779 	sopt.sopt_level = level;
1780 	sopt.sopt_name = optname;
1781 	sopt.sopt_dir = SOPT_SET;
1782 	sopt.sopt_val = optval;
1783 	sopt.sopt_valsize = optlen;
1784 	sopt.sopt_td = NULL;
1785 	return (sosetopt(so, &sopt));
1786 }
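
/*
 * Hypothetical usage sketch, assuming a valid, attached socket 'so':
 * enabling SO_KEEPALIVE from kernel context via the helper above:
 *
 *	int one = 1;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one,
 *	    sizeof(one));
 */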
1787 
1788 int
1789 sosetopt(so, sopt)
1790 	struct socket *so;
1791 	struct sockopt *sopt;
1792 {
1793 	int	error, optval;
1794 	struct	linger l;
1795 	struct	timeval tv;
1796 	u_long  val;
1797 #ifdef MAC
1798 	struct mac extmac;
1799 #endif
1800 
1801 	error = 0;
1802 	if (sopt->sopt_level != SOL_SOCKET) {
1803 		if (so->so_proto && so->so_proto->pr_ctloutput)
1804 			return ((*so->so_proto->pr_ctloutput)
1805 				  (so, sopt));
1806 		error = ENOPROTOOPT;
1807 	} else {
1808 		switch (sopt->sopt_name) {
1809 #ifdef INET
1810 		case SO_ACCEPTFILTER:
1811 			error = do_setopt_accept_filter(so, sopt);
1812 			if (error)
1813 				goto bad;
1814 			break;
1815 #endif
1816 		case SO_LINGER:
1817 			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
1818 			if (error)
1819 				goto bad;
1820 
1821 			SOCK_LOCK(so);
1822 			so->so_linger = l.l_linger;
1823 			if (l.l_onoff)
1824 				so->so_options |= SO_LINGER;
1825 			else
1826 				so->so_options &= ~SO_LINGER;
1827 			SOCK_UNLOCK(so);
1828 			break;
1829 
1830 		case SO_DEBUG:
1831 		case SO_KEEPALIVE:
1832 		case SO_DONTROUTE:
1833 		case SO_USELOOPBACK:
1834 		case SO_BROADCAST:
1835 		case SO_REUSEADDR:
1836 		case SO_REUSEPORT:
1837 		case SO_OOBINLINE:
1838 		case SO_TIMESTAMP:
1839 		case SO_BINTIME:
1840 		case SO_NOSIGPIPE:
1841 			error = sooptcopyin(sopt, &optval, sizeof optval,
1842 					    sizeof optval);
1843 			if (error)
1844 				goto bad;
1845 			SOCK_LOCK(so);
1846 			if (optval)
1847 				so->so_options |= sopt->sopt_name;
1848 			else
1849 				so->so_options &= ~sopt->sopt_name;
1850 			SOCK_UNLOCK(so);
1851 			break;
1852 
1853 		case SO_SNDBUF:
1854 		case SO_RCVBUF:
1855 		case SO_SNDLOWAT:
1856 		case SO_RCVLOWAT:
1857 			error = sooptcopyin(sopt, &optval, sizeof optval,
1858 					    sizeof optval);
1859 			if (error)
1860 				goto bad;
1861 
1862 			/*
1863 			 * Values < 1 make no sense for any of these
1864 			 * options, so disallow them.
1865 			 */
1866 			if (optval < 1) {
1867 				error = EINVAL;
1868 				goto bad;
1869 			}
1870 
1871 			switch (sopt->sopt_name) {
1872 			case SO_SNDBUF:
1873 			case SO_RCVBUF:
1874 				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
1875 				    &so->so_snd : &so->so_rcv, (u_long)optval,
1876 				    so, curthread) == 0) {
1877 					error = ENOBUFS;
1878 					goto bad;
1879 				}
1880 				break;
1881 
1882 			/*
1883 			 * Make sure the low-water is never greater than
1884 			 * the high-water.
1885 			 */
1886 			case SO_SNDLOWAT:
1887 				SOCKBUF_LOCK(&so->so_snd);
1888 				so->so_snd.sb_lowat =
1889 				    (optval > so->so_snd.sb_hiwat) ?
1890 				    so->so_snd.sb_hiwat : optval;
1891 				SOCKBUF_UNLOCK(&so->so_snd);
1892 				break;
1893 			case SO_RCVLOWAT:
1894 				SOCKBUF_LOCK(&so->so_rcv);
1895 				so->so_rcv.sb_lowat =
1896 				    (optval > so->so_rcv.sb_hiwat) ?
1897 				    so->so_rcv.sb_hiwat : optval;
1898 				SOCKBUF_UNLOCK(&so->so_rcv);
1899 				break;
1900 			}
1901 			break;
1902 
1903 		case SO_SNDTIMEO:
1904 		case SO_RCVTIMEO:
1905 #ifdef COMPAT_IA32
1906 			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
1907 				struct timeval32 tv32;
1908 
1909 				error = sooptcopyin(sopt, &tv32, sizeof tv32,
1910 				    sizeof tv32);
1911 				CP(tv32, tv, tv_sec);
1912 				CP(tv32, tv, tv_usec);
1913 			} else
1914 #endif
1915 				error = sooptcopyin(sopt, &tv, sizeof tv,
1916 				    sizeof tv);
1917 			if (error)
1918 				goto bad;
1919 
1920 			/* assert(hz > 0); */
1921 			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
1922 			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
1923 				error = EDOM;
1924 				goto bad;
1925 			}
1926 			/* assert(tick > 0); */
1927 			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
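			/*
			 * Worked example with hypothetical values: if hz =
			 * 1000 (so tick = 1000 usec), tv = { 2, 500000 }
			 * yields val = 2 * 1000 + 500000 / 1000 = 2500
			 * ticks, i.e., 2.5 seconds.
			 */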
1928 			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
1929 			if (val > INT_MAX) {
1930 				error = EDOM;
1931 				goto bad;
1932 			}
1933 			if (val == 0 && tv.tv_usec != 0)
1934 				val = 1;
1935 
1936 			switch (sopt->sopt_name) {
1937 			case SO_SNDTIMEO:
1938 				so->so_snd.sb_timeo = val;
1939 				break;
1940 			case SO_RCVTIMEO:
1941 				so->so_rcv.sb_timeo = val;
1942 				break;
1943 			}
1944 			break;
1945 
1946 		case SO_LABEL:
1947 #ifdef MAC
1948 			error = sooptcopyin(sopt, &extmac, sizeof extmac,
1949 			    sizeof extmac);
1950 			if (error)
1951 				goto bad;
1952 			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
1953 			    so, &extmac);
1954 #else
1955 			error = EOPNOTSUPP;
1956 #endif
1957 			break;
1958 
1959 		default:
1960 			error = ENOPROTOOPT;
1961 			break;
1962 		}
1963 		if (error == 0 && so->so_proto != NULL &&
1964 		    so->so_proto->pr_ctloutput != NULL) {
1965 			(void) ((*so->so_proto->pr_ctloutput)
1966 				  (so, sopt));
1967 		}
1968 	}
1969 bad:
1970 	return (error);
1971 }
1972 
1973 /* Helper routine for getsockopt */
1974 int
1975 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
1976 {
1977 	int	error;
1978 	size_t	valsize;
1979 
1980 	error = 0;
1981 
1982 	/*
1983 	 * Documented get behavior is that we always return a value,
1984 	 * possibly truncated to fit in the user's buffer.
1985 	 * Traditional behavior is that we always tell the user
1986 	 * precisely how much we copied, rather than something useful
1987 	 * like the total amount we had available for her.
1988 	 * Note that this interface is not idempotent; the entire answer must
1989 	 * be generated ahead of time.
1990 	 */
1991 	valsize = min(len, sopt->sopt_valsize);
1992 	sopt->sopt_valsize = valsize;
1993 	if (sopt->sopt_val != NULL) {
1994 		if (sopt->sopt_td != NULL)
1995 			error = copyout(buf, sopt->sopt_val, valsize);
1996 		else
1997 			bcopy(buf, sopt->sopt_val, valsize);
1998 	}
1999 	return (error);
2000 }
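
/*
 * Hypothetical sketch of the matching get-side pattern from a protocol's
 * pr_ctloutput() routine, returning a fixed-size integer value:
 *
 *	int optval;
 *
 *	optval = (current value of the option);
 *	error = sooptcopyout(sopt, &optval, sizeof(optval));
 */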
2001 
2002 int
2003 sogetopt(so, sopt)
2004 	struct socket *so;
2005 	struct sockopt *sopt;
2006 {
2007 	int	error, optval;
2008 	struct	linger l;
2009 	struct	timeval tv;
2010 #ifdef MAC
2011 	struct mac extmac;
2012 #endif
2013 
2014 	error = 0;
2015 	if (sopt->sopt_level != SOL_SOCKET) {
2016 		if (so->so_proto && so->so_proto->pr_ctloutput) {
2017 			return ((*so->so_proto->pr_ctloutput)
2018 				  (so, sopt));
2019 		} else
2020 			return (ENOPROTOOPT);
2021 	} else {
2022 		switch (sopt->sopt_name) {
2023 #ifdef INET
2024 		case SO_ACCEPTFILTER:
2025 			error = do_getopt_accept_filter(so, sopt);
2026 			break;
2027 #endif
2028 		case SO_LINGER:
2029 			SOCK_LOCK(so);
2030 			l.l_onoff = so->so_options & SO_LINGER;
2031 			l.l_linger = so->so_linger;
2032 			SOCK_UNLOCK(so);
2033 			error = sooptcopyout(sopt, &l, sizeof l);
2034 			break;
2035 
2036 		case SO_USELOOPBACK:
2037 		case SO_DONTROUTE:
2038 		case SO_DEBUG:
2039 		case SO_KEEPALIVE:
2040 		case SO_REUSEADDR:
2041 		case SO_REUSEPORT:
2042 		case SO_BROADCAST:
2043 		case SO_OOBINLINE:
2044 		case SO_ACCEPTCONN:
2045 		case SO_TIMESTAMP:
2046 		case SO_BINTIME:
2047 		case SO_NOSIGPIPE:
2048 			optval = so->so_options & sopt->sopt_name;
2049 integer:
2050 			error = sooptcopyout(sopt, &optval, sizeof optval);
2051 			break;
2052 
2053 		case SO_TYPE:
2054 			optval = so->so_type;
2055 			goto integer;
2056 
2057 		case SO_ERROR:
2058 			optval = so->so_error;
2059 			so->so_error = 0;
2060 			goto integer;
2061 
2062 		case SO_SNDBUF:
2063 			optval = so->so_snd.sb_hiwat;
2064 			goto integer;
2065 
2066 		case SO_RCVBUF:
2067 			optval = so->so_rcv.sb_hiwat;
2068 			goto integer;
2069 
2070 		case SO_SNDLOWAT:
2071 			optval = so->so_snd.sb_lowat;
2072 			goto integer;
2073 
2074 		case SO_RCVLOWAT:
2075 			optval = so->so_rcv.sb_lowat;
2076 			goto integer;
2077 
2078 		case SO_SNDTIMEO:
2079 		case SO_RCVTIMEO:
2080 			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2081 				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2082 
2083 			tv.tv_sec = optval / hz;
2084 			tv.tv_usec = (optval % hz) * tick;
2085 #ifdef COMPAT_IA32
2086 			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2087 				struct timeval32 tv32;
2088 
2089 				CP(tv, tv32, tv_sec);
2090 				CP(tv, tv32, tv_usec);
2091 				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2092 			} else
2093 #endif
2094 				error = sooptcopyout(sopt, &tv, sizeof tv);
2095 			break;
2096 
2097 		case SO_LABEL:
2098 #ifdef MAC
2099 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2100 			    sizeof(extmac));
2101 			if (error)
2102 				return (error);
2103 			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2104 			    so, &extmac);
2105 			if (error)
2106 				return (error);
2107 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2108 #else
2109 			error = EOPNOTSUPP;
2110 #endif
2111 			break;
2112 
2113 		case SO_PEERLABEL:
2114 #ifdef MAC
2115 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2116 			    sizeof(extmac));
2117 			if (error)
2118 				return (error);
2119 			error = mac_getsockopt_peerlabel(
2120 			    sopt->sopt_td->td_ucred, so, &extmac);
2121 			if (error)
2122 				return (error);
2123 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2124 #else
2125 			error = EOPNOTSUPP;
2126 #endif
2127 			break;
2128 
2129 		case SO_LISTENQLIMIT:
2130 			optval = so->so_qlimit;
2131 			goto integer;
2132 
2133 		case SO_LISTENQLEN:
2134 			optval = so->so_qlen;
2135 			goto integer;
2136 
2137 		case SO_LISTENINCQLEN:
2138 			optval = so->so_incqlen;
2139 			goto integer;
2140 
2141 		default:
2142 			error = ENOPROTOOPT;
2143 			break;
2144 		}
2145 		return (error);
2146 	}
2147 }
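
/*
 * There is no so_getsockopt() helper in this file; a hypothetical sketch
 * of a kernel-side query, mirroring so_setsockopt() above:
 *
 *	struct sockopt sopt;
 *	int optval;
 *
 *	sopt.sopt_dir = SOPT_GET;
 *	sopt.sopt_level = SOL_SOCKET;
 *	sopt.sopt_name = SO_ERROR;
 *	sopt.sopt_val = &optval;
 *	sopt.sopt_valsize = sizeof(optval);
 *	sopt.sopt_td = NULL;
 *	error = sogetopt(so, &sopt);
 */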
2148 
2149 /* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
2150 int
2151 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2152 {
2153 	struct mbuf *m, *m_prev;
2154 	int sopt_size = sopt->sopt_valsize;
2155 
2156 	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2157 	if (m == NULL)
2158 		return (ENOBUFS);
2159 	if (sopt_size > MLEN) {
2160 		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2161 		if ((m->m_flags & M_EXT) == 0) {
2162 			m_free(m);
2163 			return (ENOBUFS);
2164 		}
2165 		m->m_len = min(MCLBYTES, sopt_size);
2166 	} else {
2167 		m->m_len = min(MLEN, sopt_size);
2168 	}
2169 	sopt_size -= m->m_len;
2170 	*mp = m;
2171 	m_prev = m;
2172 
2173 	while (sopt_size) {
2174 		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2175 		if (m == NULL) {
2176 			m_freem(*mp);
2177 			return (ENOBUFS);
2178 		}
2179 		if (sopt_size > MLEN) {
2180 			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2181 			    M_DONTWAIT);
2182 			if ((m->m_flags & M_EXT) == 0) {
2183 				m_freem(m);
2184 				m_freem(*mp);
2185 				return (ENOBUFS);
2186 			}
2187 			m->m_len = min(MCLBYTES, sopt_size);
2188 		} else {
2189 			m->m_len = min(MLEN, sopt_size);
2190 		}
2191 		sopt_size -= m->m_len;
2192 		m_prev->m_next = m;
2193 		m_prev = m;
2194 	}
2195 	return (0);
2196 }
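
/*
 * Hypothetical sketch of the intended pairing: size an mbuf chain to the
 * option value with soopt_getm(), then fill it with soopt_mcopyin(),
 * which frees the chain itself on a copyin failure:
 *
 *	struct mbuf *m;
 *
 *	error = soopt_getm(sopt, &m);
 *	if (error != 0)
 *		return (error);
 *	error = soopt_mcopyin(sopt, m);
 *	if (error != 0)
 *		return (error);
 */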
2197 
2198 /* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2199 int
2200 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2201 {
2202 	struct mbuf *m0 = m;
2203 
2204 	if (sopt->sopt_val == NULL)
2205 		return (0);
2206 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2207 		if (sopt->sopt_td != NULL) {
2208 			int error;
2209 
2210 			error = copyin(sopt->sopt_val, mtod(m, char *),
2211 				       m->m_len);
2212 			if (error != 0) {
2213 				m_freem(m0);
2214 				return (error);
2215 			}
2216 		} else
2217 			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2218 		sopt->sopt_valsize -= m->m_len;
2219 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2220 		m = m->m_next;
2221 	}
2222 	if (m != NULL) /* the chain should have been sized by soopt_getm() */
2223 		panic("ip6_sooptmcopyin");
2224 	return (0);
2225 }
2226 
2227 /* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2228 int
2229 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2230 {
2231 	struct mbuf *m0 = m;
2232 	size_t valsize = 0;
2233 
2234 	if (sopt->sopt_val == NULL)
2235 		return (0);
2236 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2237 		if (sopt->sopt_td != NULL) {
2238 			int error;
2239 
2240 			error = copyout(mtod(m, char *), sopt->sopt_val,
2241 				       m->m_len);
2242 			if (error != 0) {
2243 				m_freem(m0);
2244 				return (error);
2245 			}
2246 		} else
2247 			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2248 		sopt->sopt_valsize -= m->m_len;
2249 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2250 		valsize += m->m_len;
2251 		m = m->m_next;
2252 	}
2253 	if (m != NULL) {
2254 		/* The user-supplied buffer is too small to hold the chain. */
2255 		m_freem(m0);
2256 		return (EINVAL);
2257 	}
2258 	sopt->sopt_valsize = valsize;
2259 	return (0);
2260 }
2261 
2262 void
2263 sohasoutofband(so)
2264 	struct socket *so;
2265 {
2266 	if (so->so_sigio != NULL)
2267 		pgsigio(&so->so_sigio, SIGURG, 0);
2268 	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2269 }
2270 
2271 int
2272 sopoll(struct socket *so, int events, struct ucred *active_cred,
2273     struct thread *td)
2274 {
2275 	int revents = 0;
2276 
2277 	SOCKBUF_LOCK(&so->so_snd);
2278 	SOCKBUF_LOCK(&so->so_rcv);
2279 	if (events & (POLLIN | POLLRDNORM))
2280 		if (soreadable(so))
2281 			revents |= events & (POLLIN | POLLRDNORM);
2282 
2283 	if (events & POLLINIGNEOF)
2284 		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2285 		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
2286 			revents |= POLLINIGNEOF;
2287 
2288 	if (events & (POLLOUT | POLLWRNORM))
2289 		if (sowriteable(so))
2290 			revents |= events & (POLLOUT | POLLWRNORM);
2291 
2292 	if (events & (POLLPRI | POLLRDBAND))
2293 		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2294 			revents |= events & (POLLPRI | POLLRDBAND);
2295 
2296 	if (revents == 0) {
2297 		if (events &
2298 		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
2299 		     POLLRDBAND)) {
2300 			selrecord(td, &so->so_rcv.sb_sel);
2301 			so->so_rcv.sb_flags |= SB_SEL;
2302 		}
2303 
2304 		if (events & (POLLOUT | POLLWRNORM)) {
2305 			selrecord(td, &so->so_snd.sb_sel);
2306 			so->so_snd.sb_flags |= SB_SEL;
2307 		}
2308 	}
2309 
2310 	SOCKBUF_UNLOCK(&so->so_rcv);
2311 	SOCKBUF_UNLOCK(&so->so_snd);
2312 	return (revents);
2313 }
2314 
2315 int
2316 soo_kqfilter(struct file *fp, struct knote *kn)
2317 {
2318 	struct socket *so = kn->kn_fp->f_data;
2319 	struct sockbuf *sb;
2320 
2321 	switch (kn->kn_filter) {
2322 	case EVFILT_READ:
2323 		if (so->so_options & SO_ACCEPTCONN)
2324 			kn->kn_fop = &solisten_filtops;
2325 		else
2326 			kn->kn_fop = &soread_filtops;
2327 		sb = &so->so_rcv;
2328 		break;
2329 	case EVFILT_WRITE:
2330 		kn->kn_fop = &sowrite_filtops;
2331 		sb = &so->so_snd;
2332 		break;
2333 	default:
2334 		return (EINVAL);
2335 	}
2336 
2337 	SOCKBUF_LOCK(sb);
2338 	knlist_add(&sb->sb_sel.si_note, kn, 1);
2339 	sb->sb_flags |= SB_KNOTE;
2340 	SOCKBUF_UNLOCK(sb);
2341 	return (0);
2342 }
2343 
2344 static void
2345 filt_sordetach(struct knote *kn)
2346 {
2347 	struct socket *so = kn->kn_fp->f_data;
2348 
2349 	SOCKBUF_LOCK(&so->so_rcv);
2350 	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2351 	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2352 		so->so_rcv.sb_flags &= ~SB_KNOTE;
2353 	SOCKBUF_UNLOCK(&so->so_rcv);
2354 }
2355 
2356 /*ARGSUSED*/
2357 static int
2358 filt_soread(struct knote *kn, long hint)
2359 {
2360 	struct socket *so;
2361 
2362 	so = kn->kn_fp->f_data;
2363 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2364 
2365 	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2366 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2367 		kn->kn_flags |= EV_EOF;
2368 		kn->kn_fflags = so->so_error;
2369 		return (1);
2370 	} else if (so->so_error)	/* temporary udp error */
2371 		return (1);
2372 	else if (kn->kn_sfflags & NOTE_LOWAT)
2373 		return (kn->kn_data >= kn->kn_sdata);
2374 	else
2375 		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2376 }
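
/*
 * The NOTE_LOWAT branch above lets a consumer override sb_lowat on a
 * per-knote basis.  Hypothetical userland sketch: deliver the event only
 * once at least 128 bytes are readable:
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, NULL);
 *	(void)kevent(kq, &kev, 1, NULL, 0, NULL);
 */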
2377 
2378 static void
2379 filt_sowdetach(struct knote *kn)
2380 {
2381 	struct socket *so = kn->kn_fp->f_data;
2382 
2383 	SOCKBUF_LOCK(&so->so_snd);
2384 	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2385 	if (knlist_empty(&so->so_snd.sb_sel.si_note))
2386 		so->so_snd.sb_flags &= ~SB_KNOTE;
2387 	SOCKBUF_UNLOCK(&so->so_snd);
2388 }
2389 
2390 /*ARGSUSED*/
2391 static int
2392 filt_sowrite(struct knote *kn, long hint)
2393 {
2394 	struct socket *so;
2395 
2396 	so = kn->kn_fp->f_data;
2397 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
2398 	kn->kn_data = sbspace(&so->so_snd);
2399 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2400 		kn->kn_flags |= EV_EOF;
2401 		kn->kn_fflags = so->so_error;
2402 		return (1);
2403 	} else if (so->so_error)	/* temporary udp error */
2404 		return (1);
2405 	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2406 	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
2407 		return (0);
2408 	else if (kn->kn_sfflags & NOTE_LOWAT)
2409 		return (kn->kn_data >= kn->kn_sdata);
2410 	else
2411 		return (kn->kn_data >= so->so_snd.sb_lowat);
2412 }
2413 
2414 /*ARGSUSED*/
2415 static int
2416 filt_solisten(struct knote *kn, long hint)
2417 {
2418 	struct socket *so = kn->kn_fp->f_data;
2419 
2420 	kn->kn_data = so->so_qlen;
2421 	return (!TAILQ_EMPTY(&so->so_comp));
2422 }
2423 
2424 int
2425 socheckuid(struct socket *so, uid_t uid)
2426 {
2427 
2428 	if (so == NULL)
2429 		return (EPERM);
2430 	if (so->so_cred->cr_uid != uid)
2431 		return (EPERM);
2432 	return (0);
2433 }
2434 
2435 static int
2436 somaxconn_sysctl(SYSCTL_HANDLER_ARGS)
2437 {
2438 	int error;
2439 	int val;
2440 
2441 	val = somaxconn;
2442 	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
2443 	if (error || !req->newptr)
2444 		return (error);
2445 
2446 	if (val < 1 || val > USHRT_MAX)
2447 		return (EINVAL);
2448 
2449 	somaxconn = val;
2450 	return (0);
2451 }
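
/*
 * A handler like the one above is normally wired up with SYSCTL_PROC();
 * a hedged sketch of such a declaration (the real one may differ):
 *
 *	SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
 *	    CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(int), somaxconn_sysctl,
 *	    "I", "Maximum pending socket connection queue size");
 */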
2452