xref: /freebsd/sys/kern/uipc_socket.c (revision 6e8394b8baa7d5d9153ab90de6824bcd19b3b4e1)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
34  *	$Id: uipc_socket.c,v 1.59 1999/06/04 02:27:02 peter Exp $
35  */
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/fcntl.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/domain.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/poll.h>
46 #include <sys/proc.h>
47 #include <sys/protosw.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/resourcevar.h>
51 #include <sys/signalvar.h>
52 #include <sys/sysctl.h>
53 #include <sys/uio.h>
54 #include <vm/vm_zone.h>
55 
56 #include <machine/limits.h>
57 
struct	vm_zone *socket_zone;	/* backing zone for struct socket allocations */
so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

/* Upper bound applied to listen(2) backlogs; tunable via kern.ipc.somaxconn. */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
69 
70 /*
71  * Socket operation routines.
72  * These routines are called by the routines in
73  * sys_socket.c or from a system process, and
74  * implement the semantics of socket operations by
75  * switching out to the protocol specific routines.
76  */
77 
78 /*
79  * Get a socket structure from our zone, and initialize it.
80  * We don't implement `waitok' yet (see comments in uipc_domain.c).
81  * Note that it would probably be better to allocate socket
82  * and PCB at the same time, but I'm not convinced that all
83  * the protocols can be easily modified to do this.
84  */
85 struct socket *
86 soalloc(waitok)
87 	int waitok;
88 {
89 	struct socket *so;
90 
91 	so = zalloci(socket_zone);
92 	if (so) {
93 		/* XXX race condition for reentrant kernel */
94 		bzero(so, sizeof *so);
95 		so->so_gencnt = ++so_gencnt;
96 		so->so_zone = socket_zone;
97 	}
98 	return so;
99 }
100 
/*
 * Create a new socket of the given type in domain `dom', using protocol
 * `proto' (or the domain's default protocol for `type' when proto is 0).
 * On success the attached socket is returned through `aso'.  Returns
 * EPROTONOSUPPORT or EPROTOTYPE for bad dom/proto/type combinations,
 * ENOBUFS if allocation fails, or the protocol attach routine's error.
 */
int
socreate(dom, aso, type, proto, p)
	int dom;
	struct socket **aso;
	register int type;
	int proto;
	struct proc *p;
{
	register struct protosw *prp;
	register struct socket *so;
	register int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(p != 0);
	if (so == 0)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	if (p) {
		/* Share (and reference) the creating process's credentials. */
		so->so_cred = p->p_cred;
		so->so_cred->p_refcnt++;
	} else so->so_cred = NULL;
	so->so_proto = prp;
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error) {
		/*
		 * Attach failed; mark the socket as having no file
		 * descriptor reference so sofree() will release it.
		 */
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return (error);
	}
	*aso = so;
	return (0);
}
142 
143 int
144 sobind(so, nam, p)
145 	struct socket *so;
146 	struct sockaddr *nam;
147 	struct proc *p;
148 {
149 	int s = splnet();
150 	int error;
151 
152 	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
153 	splx(s);
154 	return (error);
155 }
156 
157 void
158 sodealloc(so)
159 	struct socket *so;
160 {
161 	so->so_gencnt = ++so_gencnt;
162 	if (so->so_cred && --so->so_cred->p_refcnt == 0) {
163 		crfree(so->so_cred->pc_ucred);
164 		FREE(so->so_cred, M_SUBPROC);
165 	}
166 	zfreei(so->so_zone, so);
167 }
168 
169 int
170 solisten(so, backlog, p)
171 	register struct socket *so;
172 	int backlog;
173 	struct proc *p;
174 {
175 	int s, error;
176 
177 	s = splnet();
178 	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
179 	if (error) {
180 		splx(s);
181 		return (error);
182 	}
183 	if (so->so_comp.tqh_first == NULL)
184 		so->so_options |= SO_ACCEPTCONN;
185 	if (backlog < 0 || backlog > somaxconn)
186 		backlog = somaxconn;
187 	so->so_qlimit = backlog;
188 	splx(s);
189 	return (0);
190 }
191 
/*
 * Release a socket if it is no longer referenced: does nothing unless
 * the socket has no protocol control block and has lost its file
 * descriptor reference.  A socket still queued on a listening socket
 * is dequeued first (incomplete queue) or deliberately left alone
 * (completed queue, see below).
 */
void
sofree(so)
	register struct socket *so;
{
	struct socket *head = so->so_head;

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (head != NULL) {
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			return;
		} else {
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~SS_INCOMP;
		so->so_head = NULL;
	}
	/* Discard buffered data and free the socket itself. */
	sbrelease(&so->so_snd);
	sorflush(so);
	sodealloc(so);
}
223 
224 /*
225  * Close a socket on last file table reference removal.
226  * Initiate disconnect if connected.
227  * Free socket when disconnect complete.
228  */
int
soclose(so)
	register struct socket *so;
{
	int s = splnet();		/* conservative */
	int error = 0;

	/* Drop any SIGIO registration before tearing anything down. */
	funsetown(so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp, *sonext;

		/* Abort every connection still queued on this listener. */
		for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			(void) soabort(sp);
		}
		for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			/* Dequeue from so_comp since sofree() won't do it */
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_state &= ~SS_COMP;
			sp->so_head = NULL;
			(void) soabort(sp);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking sockets do not wait out the linger. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/* Sleep until disconnected, signalled, or timed out. */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		/* Preserve the earlier error, if any. */
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
288 
289 /*
290  * Must be called at splnet...
291  */
292 int
293 soabort(so)
294 	struct socket *so;
295 {
296 
297 	return (*so->so_proto->pr_usrreqs->pru_abort)(so);
298 }
299 
300 int
301 soaccept(so, nam)
302 	register struct socket *so;
303 	struct sockaddr **nam;
304 {
305 	int s = splnet();
306 	int error;
307 
308 	if ((so->so_state & SS_NOFDREF) == 0)
309 		panic("soaccept: !NOFDREF");
310 	so->so_state &= ~SS_NOFDREF;
311  	if ((so->so_state & SS_ISDISCONNECTED) == 0)
312 		error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
313 	else {
314 		if (nam)
315 			*nam = 0;
316 		error = 0;
317 	}
318 	splx(s);
319 	return (error);
320 }
321 
322 int
323 soconnect(so, nam, p)
324 	register struct socket *so;
325 	struct sockaddr *nam;
326 	struct proc *p;
327 {
328 	int s;
329 	int error;
330 
331 	if (so->so_options & SO_ACCEPTCONN)
332 		return (EOPNOTSUPP);
333 	s = splnet();
334 	/*
335 	 * If protocol is connection-based, can only connect once.
336 	 * Otherwise, if connected, try to disconnect first.
337 	 * This allows user to disconnect by connecting to, e.g.,
338 	 * a null address.
339 	 */
340 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
341 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
342 	    (error = sodisconnect(so))))
343 		error = EISCONN;
344 	else
345 		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
346 	splx(s);
347 	return (error);
348 }
349 
350 int
351 soconnect2(so1, so2)
352 	register struct socket *so1;
353 	struct socket *so2;
354 {
355 	int s = splnet();
356 	int error;
357 
358 	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
359 	splx(s);
360 	return (error);
361 }
362 
363 int
364 sodisconnect(so)
365 	register struct socket *so;
366 {
367 	int s = splnet();
368 	int error;
369 
370 	if ((so->so_state & SS_ISCONNECTED) == 0) {
371 		error = ENOTCONN;
372 		goto bad;
373 	}
374 	if (so->so_state & SS_ISDISCONNECTING) {
375 		error = EALREADY;
376 		goto bad;
377 	}
378 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
379 bad:
380 	splx(s);
381 	return (error);
382 }
383 
/* Sleep disposition for sblock(): don't wait when MSG_DONTWAIT is set. */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
385 /*
386  * Send on a socket.
387  * If send must go all at once and message is larger than
388  * send buffering, then hard error.
389  * Lock against other senders.
390  * If must go all at once and not enough room now, then
391  * inform user that this would block and do nothing.
392  * Otherwise, if nonblocking, send as much as possible.
393  * The data to be sent is described by "uio" if nonzero,
394  * otherwise by the mbuf chain "top" (which must be null
395  * if uio is not).  Data provided in mbuf chain must be small
396  * enough to send all at once.
397  *
398  * Returns nonzero on error, timeout or signal; callers
399  * must check for short counts if EINTR/ERESTART are returned.
400  * Data and control buffers are freed on return.
401  */
int
sosend(so, addr, uio, top, control, flags, p)
	register struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct proc *p;
{
	struct mbuf **mp;
	register struct mbuf *m;
	register long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (p)
		p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
/* Error exit from inside the splnet() region below. */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				   ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		/* Wait for room unless the caller asked not to block. */
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
			/* Copy user data into a freshly allocated mbuf chain. */
			if (top == 0) {
				MGETHDR(m, M_WAIT, MT_DATA);
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_WAIT, MT_DATA);
				mlen = MLEN;
			}
			if (resid >= MINCLSIZE) {
				MCLGET(m, M_WAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
				len = min(min(mlen, resid), space);
			} else {
nopages:
				len = min(min(mlen, resid), space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			space -= len;
			error = uiomove(mtod(m, caddr_t), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    s = splnet();				/* XXX */
		    /*
		     * XXX all the SS_CANTSENDMORE checks previously
		     * done could be out of date.  We could have received
		     * a reset packet in an interrupt or maybe we slept
		     * while doing page faults in uiomove() etc. We could
		     * probably recheck again inside the splnet() protection
		     * here, but there are probably other places that this
		     * also happens.  We must rethink this.
		     */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME */
			(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			top, addr, control, p);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    /* The chain and control were handed to the protocol. */
		    clen = 0;
		    control = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	/* Free any mbufs not handed off to the protocol. */
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
592 
593 /*
594  * Implement receive operations on a socket.
595  * We depend on the way that records are added to the sockbuf
596  * by sbappend*.  In particular, each record (mbufs linked through m_next)
597  * must begin with an address if the protocol so specifies,
598  * followed by an optional mbuf or mbufs containing ancillary data,
599  * and then zero or more mbufs of data.
600  * In order to avoid blocking network interrupts for the entire time here,
601  * we splx() while doing the actual copy to user space.
602  * Although the sockbuf is locked, new data may still be appended,
603  * and thus we must maintain consistency of the sockbuf during that time.
604  *
605  * The caller may receive the data as a single mbuf chain by supplying
606  * an mbuf **mp0 for use in returning the chain.  The uio is then used
607  * only for the count in uio_resid.
608  */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	register struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	register struct mbuf *m, **mp;
	register int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa)
		*psa = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		/* Out-of-band data is fetched directly from the protocol. */
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

restart:
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);
	s = splnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1"));
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		/* A record boundary or OOB data lets us return early. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		/* The record starts with the sender's address. */
		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
		orig_resid = 0;
		if (psa)
			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
					    mp0 == 0);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
		}
	}
	/* Next come any control (ancillary data) mbufs. */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
				   error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	moff = 0;
	offset = 0;
	/* Main loop: copy data mbufs out to the uio, or hand them to *mp0. */
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
		    KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
			("receive 3"));
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		/* Don't read past the out-of-band mark. */
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splnet();
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* Consumed the whole mbuf; advance to the next. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			/* Only part of this mbuf was consumed. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			error = sbwait(&so->so_rcv);
			if (error) {
				/* Short count without error (see above). */
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			m = so->so_rcv.sb_mb;
			if (m)
				nextrecord = m->m_nextpkt;
		}
	}

	/* For atomic protocols, drop any unread remainder of the record. */
	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}
	/* Nothing was transferred and nothing terminal happened: retry. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
894 
895 int
896 soshutdown(so, how)
897 	register struct socket *so;
898 	register int how;
899 {
900 	register struct protosw *pr = so->so_proto;
901 
902 	how++;
903 	if (how & FREAD)
904 		sorflush(so);
905 	if (how & FWRITE)
906 		return ((*pr->pr_usrreqs->pru_shutdown)(so));
907 	return (0);
908 }
909 
/*
 * Flush and discard everything queued on a socket's receive buffer,
 * disposing of any in-flight access rights (e.g. passed descriptors)
 * via the domain's dom_dispose hook.
 */
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	/* Don't let the sblock() sleep be interrupted by signals. */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	/*
	 * Snapshot the sockbuf and zero the original, so that data
	 * arriving from here on is accounted against a clean buffer;
	 * the snapshot (and its mbufs) is released below.
	 */
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
931 
932 /*
933  * Perhaps this routine, and sooptcopyout(), below, ought to come in
934  * an additional variant to handle the case where the option value needs
935  * to be some kind of integer, but not a specific size.
936  * In addition to their use here, these functions are also called by the
937  * protocol-level pr_ctloutput() routines.
938  */
939 int
940 sooptcopyin(sopt, buf, len, minlen)
941 	struct	sockopt *sopt;
942 	void	*buf;
943 	size_t	len;
944 	size_t	minlen;
945 {
946 	size_t	valsize;
947 
948 	/*
949 	 * If the user gives us more than we wanted, we ignore it,
950 	 * but if we don't get the minimum length the caller
951 	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
952 	 * is set to however much we actually retrieved.
953 	 */
954 	if ((valsize = sopt->sopt_valsize) < minlen)
955 		return EINVAL;
956 	if (valsize > len)
957 		sopt->sopt_valsize = valsize = len;
958 
959 	if (sopt->sopt_p != 0)
960 		return (copyin(sopt->sopt_val, buf, valsize));
961 
962 	bcopy(sopt->sopt_val, buf, valsize);
963 	return 0;
964 }
965 
/*
 * Handle setsockopt(2).  SOL_SOCKET-level options are processed here;
 * everything else is passed to the protocol's pr_ctloutput routine.
 */
int
sosetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	u_long  val;

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		/* Non-socket-level options go straight to the protocol. */
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;

		/* Simple boolean options mirrored directly in so_options. */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
					      &so->so_snd : &so->so_rcv,
					      (u_long) optval) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* Reject values the tick-based timeout can't hold. */
			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Give the protocol a chance to act on the new value too. */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		}
	}
bad:
	return (error);
}
1104 
1105 /* Helper routine for getsockopt */
1106 int
1107 sooptcopyout(sopt, buf, len)
1108 	struct	sockopt *sopt;
1109 	void	*buf;
1110 	size_t	len;
1111 {
1112 	int	error;
1113 	size_t	valsize;
1114 
1115 	error = 0;
1116 
1117 	/*
1118 	 * Documented get behavior is that we always return a value,
1119 	 * possibly truncated to fit in the user's buffer.
1120 	 * Traditional behavior is that we always tell the user
1121 	 * precisely how much we copied, rather than something useful
1122 	 * like the total amount we had available for her.
1123 	 * Note that this interface is not idempotent; the entire answer must
1124 	 * generated ahead of time.
1125 	 */
1126 	valsize = min(len, sopt->sopt_valsize);
1127 	sopt->sopt_valsize = valsize;
1128 	if (sopt->sopt_val != 0) {
1129 		if (sopt->sopt_p != 0)
1130 			error = copyout(buf, sopt->sopt_val, valsize);
1131 		else
1132 			bcopy(buf, sopt->sopt_val, valsize);
1133 	}
1134 	return error;
1135 }
1136 
/*
 * Handle getsockopt(2).  SOL_SOCKET-level options are answered here;
 * everything else is passed to the protocol's pr_ctloutput routine.
 */
int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
		case SO_LINGER:
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		/* Boolean options are reported straight from so_options. */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			optval = so->so_options & sopt->sopt_name;
integer:
			/* Common exit for all integer-valued options. */
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			/* Reading the pending error clears it. */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			/* Convert ticks back into a timeval. */
			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}
1217 
1218 void
1219 sohasoutofband(so)
1220 	register struct socket *so;
1221 {
1222 	if (so->so_sigio != NULL)
1223 		pgsigio(so->so_sigio, SIGURG, 0);
1224 	selwakeup(&so->so_rcv.sb_sel);
1225 }
1226 
1227 int
1228 sopoll(struct socket *so, int events, struct ucred *cred, struct proc *p)
1229 {
1230 	int revents = 0;
1231 	int s = splnet();
1232 
1233 	if (events & (POLLIN | POLLRDNORM))
1234 		if (soreadable(so))
1235 			revents |= events & (POLLIN | POLLRDNORM);
1236 
1237 	if (events & (POLLOUT | POLLWRNORM))
1238 		if (sowriteable(so))
1239 			revents |= events & (POLLOUT | POLLWRNORM);
1240 
1241 	if (events & (POLLPRI | POLLRDBAND))
1242 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
1243 			revents |= events & (POLLPRI | POLLRDBAND);
1244 
1245 	if (revents == 0) {
1246 		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
1247 			selrecord(p, &so->so_rcv.sb_sel);
1248 			so->so_rcv.sb_flags |= SB_SEL;
1249 		}
1250 
1251 		if (events & (POLLOUT | POLLWRNORM)) {
1252 			selrecord(p, &so->so_snd.sb_sel);
1253 			so->so_snd.sb_flags |= SB_SEL;
1254 		}
1255 	}
1256 
1257 	splx(s);
1258 	return (revents);
1259 }
1260