xref: /freebsd/sys/kern/uipc_socket.c (revision ce834215a70ff69e7e222827437116eee2f9ac6f)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
34  *	$Id: uipc_socket.c,v 1.26 1997/04/27 20:00:44 wollman Exp $
35  */
36 
37 #include <sys/param.h>
38 #include <sys/queue.h>
39 #include <sys/systm.h>
40 #include <sys/proc.h>
41 #include <sys/fcntl.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/domain.h>
45 #include <sys/kernel.h>
46 #include <sys/protosw.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/resourcevar.h>
50 #include <sys/signalvar.h>
51 #include <sys/sysctl.h>
52 
/*
 * Upper bound on listen(2) backlogs; solisten() clamps requests to this.
 * Tunable at run time via the kern.ipc.somaxconn sysctl.
 */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
	   0, "");
56 
57 /*
58  * Socket operation routines.
59  * These routines are called by the routines in
60  * sys_socket.c or from a system process, and
61  * implement the semantics of socket operations by
62  * switching out to the protocol specific routines.
63  */
/*ARGSUSED*/
int
socreate(dom, aso, type, proto, p)
	int dom;
	struct socket **aso;
	register int type;
	int proto;
	struct proc *p;
{
	register struct protosw *prp;
	register struct socket *so;
	register int error;

	/*
	 * Locate the protocol switch entry: an explicit protocol number
	 * takes precedence, otherwise match on socket type alone.
	 */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	/* Allocate and zero the socket; M_WAIT means the MALLOC cannot fail. */
	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
	bzero((caddr_t)so, sizeof(*so));
	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_proto = prp;
	/* Let the protocol allocate its control block and initialize. */
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error) {
		/*
		 * Attach failed: mark the socket as having no file
		 * descriptor reference so sofree() will release it.
		 */
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return (error);
	}
	*aso = so;
	return (0);
}
100 
101 int
102 sobind(so, nam, p)
103 	struct socket *so;
104 	struct mbuf *nam;
105 	struct proc *p;
106 {
107 	int s = splnet();
108 	int error;
109 
110 	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
111 	splx(s);
112 	return (error);
113 }
114 
115 int
116 solisten(so, backlog, p)
117 	register struct socket *so;
118 	int backlog;
119 	struct proc *p;
120 {
121 	int s = splnet(), error;
122 
123 	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
124 	if (error) {
125 		splx(s);
126 		return (error);
127 	}
128 	if (so->so_comp.tqh_first == NULL)
129 		so->so_options |= SO_ACCEPTCONN;
130 	if (backlog < 0 || backlog > somaxconn)
131 		backlog = somaxconn;
132 	so->so_qlimit = backlog;
133 	splx(s);
134 	return (0);
135 }
136 
/*
 * Release a socket that has no protocol control block and no file
 * descriptor reference.  If the socket is still queued on a listening
 * parent's incomplete or complete connection queue, unlink it first.
 */
void
sofree(so)
	register struct socket *so;
{
	struct socket *head = so->so_head;

	/* Still referenced by a pcb or an open file: not ours to free yet. */
	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (head != NULL) {
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			TAILQ_REMOVE(&head->so_comp, so, so_list);
		} else {
			/* A child of a listener must be on one of the queues. */
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~(SS_INCOMP|SS_COMP);
		so->so_head = NULL;
	}
	/* Drop buffered send data, flush the receive side, free the socket. */
	sbrelease(&so->so_snd);
	sorflush(so);
	FREE(so, M_SOCKET);
}
162 
163 /*
164  * Close a socket on last file table reference removal.
165  * Initiate disconnect if connected.
166  * Free socket when disconnect complete.
167  */
168 int
169 soclose(so)
170 	register struct socket *so;
171 {
172 	int s = splnet();		/* conservative */
173 	int error = 0;
174 
175 	if (so->so_options & SO_ACCEPTCONN) {
176 		struct socket *sp, *sonext;
177 
178 		for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) {
179 			sonext = sp->so_list.tqe_next;
180 			(void) soabort(sp);
181 		}
182 		for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) {
183 			sonext = sp->so_list.tqe_next;
184 			(void) soabort(sp);
185 		}
186 	}
187 	if (so->so_pcb == 0)
188 		goto discard;
189 	if (so->so_state & SS_ISCONNECTED) {
190 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
191 			error = sodisconnect(so);
192 			if (error)
193 				goto drop;
194 		}
195 		if (so->so_options & SO_LINGER) {
196 			if ((so->so_state & SS_ISDISCONNECTING) &&
197 			    (so->so_state & SS_NBIO))
198 				goto drop;
199 			while (so->so_state & SS_ISCONNECTED) {
200 				error = tsleep((caddr_t)&so->so_timeo,
201 				    PSOCK | PCATCH, "soclos", so->so_linger);
202 				if (error)
203 					break;
204 			}
205 		}
206 	}
207 drop:
208 	if (so->so_pcb) {
209 		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
210 		if (error == 0)
211 			error = error2;
212 	}
213 discard:
214 	if (so->so_state & SS_NOFDREF)
215 		panic("soclose: NOFDREF");
216 	so->so_state |= SS_NOFDREF;
217 	sofree(so);
218 	splx(s);
219 	return (error);
220 }
221 
222 /*
223  * Must be called at splnet...
224  */
225 int
226 soabort(so)
227 	struct socket *so;
228 {
229 
230 	return (*so->so_proto->pr_usrreqs->pru_abort)(so);
231 }
232 
233 int
234 soaccept(so, nam)
235 	register struct socket *so;
236 	struct mbuf *nam;
237 {
238 	int s = splnet();
239 	int error;
240 
241 	if ((so->so_state & SS_NOFDREF) == 0)
242 		panic("soaccept: !NOFDREF");
243 	so->so_state &= ~SS_NOFDREF;
244 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
245 	splx(s);
246 	return (error);
247 }
248 
int
soconnect(so, nam, p)
	register struct socket *so;
	struct mbuf *nam;
	struct proc *p;
{
	int s;
	int error;

	/* A listening socket cannot initiate a connection. */
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 *
	 * Note the embedded assignment: sodisconnect()'s result is
	 * only consulted (and only overwritten with EISCONN) when the
	 * disconnect attempt fails.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
	splx(s);
	return (error);
}
276 
277 int
278 soconnect2(so1, so2)
279 	register struct socket *so1;
280 	struct socket *so2;
281 {
282 	int s = splnet();
283 	int error;
284 
285 	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
286 	splx(s);
287 	return (error);
288 }
289 
290 int
291 sodisconnect(so)
292 	register struct socket *so;
293 {
294 	int s = splnet();
295 	int error;
296 
297 	if ((so->so_state & SS_ISCONNECTED) == 0) {
298 		error = ENOTCONN;
299 		goto bad;
300 	}
301 	if (so->so_state & SS_ISDISCONNECTING) {
302 		error = EALREADY;
303 		goto bad;
304 	}
305 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
306 bad:
307 	splx(s);
308 	return (error);
309 }
310 
311 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
312 /*
313  * Send on a socket.
314  * If send must go all at once and message is larger than
315  * send buffering, then hard error.
316  * Lock against other senders.
317  * If must go all at once and not enough room now, then
318  * inform user that this would block and do nothing.
319  * Otherwise, if nonblocking, send as much as possible.
320  * The data to be sent is described by "uio" if nonzero,
321  * otherwise by the mbuf chain "top" (which must be null
322  * if uio is not).  Data provided in mbuf chain must be small
323  * enough to send all at once.
324  *
325  * Returns nonzero on error, timeout or signal; callers
326  * must check for short counts if EINTR/ERESTART are returned.
327  * Data and control buffers are freed on return.
328  */
int
sosend(so, addr, uio, top, control, flags)
	register struct socket *so;
	struct mbuf *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
{
	struct proc *p = curproc;		/* XXX */
	struct mbuf **mp;
	register struct mbuf *m;
	register long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0)
		return (EINVAL);
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
/* Error exit from inside the splnet window: release spl, then clean up. */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	/* Serialize senders on this socket's send buffer. */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error)
			snderr(so->so_error);
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				   ENOTCONN : EDESTADDRREQ);
		}
		/* Check for room; MSG_OOB gets a little extra slack. */
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			/* Not enough room: fail (non-blocking) or wait. */
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		/*
		 * Build an mbuf chain from the uio (or use the caller's
		 * prepackaged "top"), then hand it to the protocol.
		 */
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
			if (top == 0) {
				MGETHDR(m, M_WAIT, MT_DATA);
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_WAIT, MT_DATA);
				mlen = MLEN;
			}
			if (resid >= MINCLSIZE) {
				/* Worth a cluster; fall back if none free. */
				MCLGET(m, M_WAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
				len = min(min(mlen, resid), space);
			} else {
nopages:
				len = min(min(mlen, resid), space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			space -= len;
			error = uiomove(mtod(m, caddr_t), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    s = splnet();				/* XXX */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF : 0,
			top, addr, control, p);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    /* Ownership of top/control passed to the protocol. */
		    clen = 0;
		    control = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
497 
498 /*
499  * Implement receive operations on a socket.
500  * We depend on the way that records are added to the sockbuf
501  * by sbappend*.  In particular, each record (mbufs linked through m_next)
502  * must begin with an address if the protocol so specifies,
503  * followed by an optional mbuf or mbufs containing ancillary data,
504  * and then zero or more mbufs of data.
505  * In order to avoid blocking network interrupts for the entire time here,
506  * we splx() while doing the actual copy to user space.
507  * Although the sockbuf is locked, new data may still be appended,
508  * and thus we must maintain consistency of the sockbuf during that time.
509  *
510  * The caller may receive the data as a single mbuf chain by supplying
511  * an mbuf **mp0 for use in returning the chain.  The uio is then used
512  * only for the count in uio_resid.
513  */
int
soreceive(so, paddr, uio, mp0, controlp, flagsp)
	register struct socket *so;
	struct mbuf **paddr;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	register struct mbuf *m, **mp;
	register int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (paddr)
		*paddr = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		/*
		 * Out-of-band data is fetched directly from the protocol
		 * into a scratch mbuf and copied out; the normal receive
		 * path below is bypassed entirely.
		 */
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

restart:
	/* Lock the receive buffer against other readers. */
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);
	s = splnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		/* A record terminator or OOB mark lets us return early. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		/* Unlock, sleep until data arrives, and start over. */
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	/* Data is available: charge the process and peel off the record. */
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		/* First mbuf of the record carries the sender's address. */
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
	}
	/* Next come any control (ancillary data) mbufs. */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				/* Convert passed file descriptors in place. */
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
				   error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	moff = 0;
	offset = 0;
	/* Main loop: copy data mbufs out via the uio (or hand back via mp). */
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		/* Never read across the out-of-band mark in one pass. */
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splnet();
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* Consumed the whole mbuf. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			/* Partial mbuf: advance within it. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			error = sbwait(&so->so_rcv);
			if (error) {
				/* Short count, no error (see above). */
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			m = so->so_rcv.sb_mb;
			if (m)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		/* Atomic protocols discard any unread record remainder. */
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		/* Tell the protocol how much room we freed up. */
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}
	/* Nothing consumed and no EOF/EOR seen: try again from the top. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
812 
813 int
814 soshutdown(so, how)
815 	register struct socket *so;
816 	register int how;
817 {
818 	register struct protosw *pr = so->so_proto;
819 
820 	how++;
821 	if (how & FREAD)
822 		sorflush(so);
823 	if (how & FWRITE)
824 		return ((*pr->pr_usrreqs->pru_shutdown)(so));
825 	return (0);
826 }
827 
/*
 * Flush a socket's receive buffer: mark it unable to receive more,
 * detach all queued data, and dispose of it outside the interrupt
 * protection window.
 */
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	/* Take an uninterruptible lock on the buffer. */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	/*
	 * Snapshot the buffer and zero the live copy while at splimp,
	 * so interrupt-level appends see an empty buffer; the snapshot
	 * is torn down below without blocking interrupts.
	 */
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	/* Let the domain dispose of any in-flight access rights first. */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
849 
int
sosetopt(so, level, optname, m0, p)
	register struct socket *so;
	int level, optname;
	struct mbuf *m0;
	struct proc *p;
{
	int error = 0;
	register struct mbuf *m = m0;

	/* Non-socket-level options go straight to the protocol. */
	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0, p));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */
			/*
			 * The boolean code below reads the first int of the
			 * option value, which for SO_LINGER is l_onoff, so
			 * the fall-through also sets/clears the option bit.
			 */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			int optval;

			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			optval = *mtod(m, int *);
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) optval) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof (*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			/* sb_timeo is a short tick count; reject overflow. */
			if (tv->tv_sec > SHRT_MAX / hz - hz) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Give the protocol a chance to act on the option too. */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0, p));
			m = NULL;	/* freed by protocol */
		}
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}
991 
int
sogetopt(so, level, optname, mp, p)
	register struct socket *so;
	int level, optname;
	struct mbuf **mp;
	struct proc *p;
{
	register struct mbuf *m;

	/* Non-socket-level options are answered by the protocol. */
	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_GETOPT, so, level, optname, mp, p));
		} else
			return (ENOPROTOOPT);
	} else {
		/* Allocate the reply mbuf; default payload is one int. */
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the error clears it. */
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			/* Convert the tick count back to a timeval. */
			int val = (optname == SO_SNDTIMEO ?
			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}
1078 
1079 void
1080 sohasoutofband(so)
1081 	register struct socket *so;
1082 {
1083 	struct proc *p;
1084 
1085 	if (so->so_pgid < 0)
1086 		gsignal(-so->so_pgid, SIGURG);
1087 	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1088 		psignal(p, SIGURG);
1089 	selwakeup(&so->so_rcv.sb_sel);
1090 }
1091 
1092 int
1093 soselect(struct socket *so, int which, struct proc *p)
1094 {
1095 	int s = splnet();
1096 	switch (which) {
1097 
1098 	case FREAD:
1099 		if (soreadable(so)) {
1100 			splx(s);
1101 			return (1);
1102 		}
1103 		selrecord(p, &so->so_rcv.sb_sel);
1104 		so->so_rcv.sb_flags |= SB_SEL;
1105 		break;
1106 
1107 	case FWRITE:
1108 		if (sowriteable(so)) {
1109 			splx(s);
1110 			return (1);
1111 		}
1112 		selrecord(p, &so->so_snd.sb_sel);
1113 		so->so_snd.sb_flags |= SB_SEL;
1114 		break;
1115 
1116 	case 0:
1117 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
1118 			splx(s);
1119 			return (1);
1120 		}
1121 		selrecord(p, &so->so_rcv.sb_sel);
1122 		so->so_rcv.sb_flags |= SB_SEL;
1123 		break;
1124 	}
1125 	splx(s);
1126 	return (0);
1127 }
1128