xref: /freebsd/sys/kern/uipc_socket.c (revision 0ea3482342b4d7d6e71f3007ce4dafe445c639fd)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
34  * $Id: uipc_socket.c,v 1.11 1995/08/25 20:27:46 bde Exp $
35  */
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/proc.h>
40 #include <sys/file.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/domain.h>
44 #include <sys/kernel.h>
45 #include <sys/protosw.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/resourcevar.h>
49 #include <sys/signalvar.h>
50 #include <sys/sysctl.h>
51 
/* Maximum listen(2) backlog; exported as a read-write sysctl. */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern, KERN_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, 0, "");
54 
55 /*
56  * Socket operation routines.
57  * These routines are called by the routines in
58  * sys_socket.c or from a system process, and
59  * implement the semantics of socket operations by
60  * switching out to the protocol specific routines.
61  */
62 /*ARGSUSED*/
/*
 * Create a socket of the given type in the given domain, optionally for
 * a specific protocol, and attach it to its protocol.  On success the
 * new socket is returned through *aso.
 */
int
socreate(dom, aso, type, proto)
	int dom;
	struct socket **aso;
	register int type;
	int proto;
{
	struct proc *p = curproc;		/* XXX */
	register struct protosw *prp;
	register struct socket *so;
	register int error;

	/* A nonzero protocol selects an exact entry; otherwise match by type. */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreq == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
	bzero((caddr_t)so, sizeof(*so));
	so->so_type = type;
	/* Sockets created by the superuser carry the privileged mark. */
	if (p->p_ucred->cr_uid == 0)
		so->so_state = SS_PRIV;
	so->so_proto = prp;
	/*
	 * Attach the protocol; the protocol number is passed through the
	 * "nam" mbuf argument slot (historical pr_usrreq interface).
	 */
	error =
	    (*prp->pr_usrreq)(so, PRU_ATTACH,
		(struct mbuf *)0, (struct mbuf *)proto, (struct mbuf *)0);
	if (error) {
		/* Mark no file reference so sofree() will release the socket. */
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return (error);
	}
	*aso = so;
	return (0);
}
100 
101 int
102 sobind(so, nam)
103 	struct socket *so;
104 	struct mbuf *nam;
105 {
106 	int s = splnet();
107 	int error;
108 
109 	error =
110 	    (*so->so_proto->pr_usrreq)(so, PRU_BIND,
111 		(struct mbuf *)0, nam, (struct mbuf *)0);
112 	splx(s);
113 	return (error);
114 }
115 
116 int
117 solisten(so, backlog)
118 	register struct socket *so;
119 	int backlog;
120 {
121 	int s = splnet(), error;
122 
123 	error =
124 	    (*so->so_proto->pr_usrreq)(so, PRU_LISTEN,
125 		(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
126 	if (error) {
127 		splx(s);
128 		return (error);
129 	}
130 	if (so->so_q == 0)
131 		so->so_options |= SO_ACCEPTCONN;
132 	if (backlog < 0 || backlog > somaxconn)
133 		backlog = somaxconn;
134 	so->so_qlimit = backlog;
135 	splx(s);
136 	return (0);
137 }
138 
/*
 * Release a socket once it holds neither a protocol control block nor
 * a file descriptor reference.  Called whenever one of those references
 * is dropped; a no-op until both are gone.
 */
void
sofree(so)
	register struct socket *so;
{

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	/* If queued on a listening socket, unlink from whichever queue holds it. */
	if (so->so_head) {
		if (!soqremque(so, 0) && !soqremque(so, 1))
			panic("sofree dq");
		so->so_head = 0;
	}
	/* Discard pending send data, then flush and free the receive side. */
	sbrelease(&so->so_snd);
	sorflush(so);
	FREE(so, M_SOCKET);
}
155 
156 /*
157  * Close a socket on last file table reference removal.
158  * Initiate disconnect if connected.
159  * Free socket when disconnect complete.
160  */
int
soclose(so)
	register struct socket *so;
{
	int s = splnet();		/* conservative */
	int error = 0;

	/* A listening socket first aborts everything still queued on it. */
	if (so->so_options & SO_ACCEPTCONN) {
		while (so->so_q0)
			(void) soabort(so->so_q0);
		while (so->so_q)
			(void) soabort(so->so_q);
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking lingering close: don't wait at all. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/* Wait (up to so_linger ticks) for disconnect to finish. */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, netcls, so->so_linger);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		/* Always detach; keep any earlier error in preference. */
		int error2 =
		    (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
			(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
210 
211 /*
212  * Must be called at splnet...
213  */
214 int
215 soabort(so)
216 	struct socket *so;
217 {
218 
219 	return (
220 	    (*so->so_proto->pr_usrreq)(so, PRU_ABORT,
221 		(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0));
222 }
223 
224 int
225 soaccept(so, nam)
226 	register struct socket *so;
227 	struct mbuf *nam;
228 {
229 	int s = splnet();
230 	int error;
231 
232 	if ((so->so_state & SS_NOFDREF) == 0)
233 		panic("soaccept: !NOFDREF");
234 	so->so_state &= ~SS_NOFDREF;
235 	error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
236 	    (struct mbuf *)0, nam, (struct mbuf *)0);
237 	splx(s);
238 	return (error);
239 }
240 
int
soconnect(so, nam)
	register struct socket *so;
	struct mbuf *nam;
{
	int s;
	int error;

	/* A listening socket cannot initiate a connection. */
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		/*
		 * NOTE(review): when sodisconnect() fails its error is
		 * overwritten with EISCONN here — apparently deliberate,
		 * but worth confirming against callers.
		 */
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    (struct mbuf *)0, nam, (struct mbuf *)0);
	splx(s);
	return (error);
}
268 
269 int
270 soconnect2(so1, so2)
271 	register struct socket *so1;
272 	struct socket *so2;
273 {
274 	int s = splnet();
275 	int error;
276 
277 	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
278 	    (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0);
279 	splx(s);
280 	return (error);
281 }
282 
283 int
284 sodisconnect(so)
285 	register struct socket *so;
286 {
287 	int s = splnet();
288 	int error;
289 
290 	if ((so->so_state & SS_ISCONNECTED) == 0) {
291 		error = ENOTCONN;
292 		goto bad;
293 	}
294 	if (so->so_state & SS_ISDISCONNECTING) {
295 		error = EALREADY;
296 		goto bad;
297 	}
298 	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
299 	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
300 bad:
301 	splx(s);
302 	return (error);
303 }
304 
305 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
306 /*
307  * Send on a socket.
308  * If send must go all at once and message is larger than
309  * send buffering, then hard error.
310  * Lock against other senders.
311  * If must go all at once and not enough room now, then
312  * inform user that this would block and do nothing.
313  * Otherwise, if nonblocking, send as much as possible.
314  * The data to be sent is described by "uio" if nonzero,
315  * otherwise by the mbuf chain "top" (which must be null
316  * if uio is not).  Data provided in mbuf chain must be small
317  * enough to send all at once.
318  *
319  * Returns nonzero on error, timeout or signal; callers
320  * must check for short counts if EINTR/ERESTART are returned.
321  * Data and control buffers are freed on return.
322  */
int
sosend(so, addr, uio, top, control, flags)
	register struct socket *so;
	struct mbuf *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
{
	struct proc *p = curproc;		/* XXX */
	struct mbuf **mp;
	register struct mbuf *m;
	register long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0)
		return (EINVAL);
	/* Route-bypass only for atomic protocols, and only per-message. */
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
/* Bail out of the locked region with the given errno. */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error)
			snderr(so->so_error);
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				   ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		/* OOB data may dip slightly into reserve space. */
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		/* Not enough room now: fail if non-blocking, else wait. */
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
			/* Build the chain: first mbuf is a packet header. */
			if (top == 0) {
				MGETHDR(m, M_WAIT, MT_DATA);
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_WAIT, MT_DATA);
				mlen = MLEN;
			}
			/* Use a cluster when enough data remains to justify it. */
			if (resid >= MINCLSIZE) {
				MCLGET(m, M_WAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
				len = min(min(mlen, resid), space);
			} else {
nopages:
				len = min(min(mlen, resid), space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			space -= len;
			error = uiomove(mtod(m, caddr_t), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    s = splnet();				/* XXX */
		    error = (*so->so_proto->pr_usrreq)(so,
			(flags & MSG_OOB) ? PRU_SENDOOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRU_SEND_EOF : PRU_SEND,
			top, addr, control);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    /* The protocol now owns top and control. */
		    clen = 0;
		    control = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
491 
492 /*
493  * Implement receive operations on a socket.
494  * We depend on the way that records are added to the sockbuf
495  * by sbappend*.  In particular, each record (mbufs linked through m_next)
496  * must begin with an address if the protocol so specifies,
497  * followed by an optional mbuf or mbufs containing ancillary data,
498  * and then zero or more mbufs of data.
499  * In order to avoid blocking network interrupts for the entire time here,
500  * we splx() while doing the actual copy to user space.
501  * Although the sockbuf is locked, new data may still be appended,
502  * and thus we must maintain consistency of the sockbuf during that time.
503  *
504  * The caller may receive the data as a single mbuf chain by supplying
505  * an mbuf **mp0 for use in returning the chain.  The uio is then used
506  * only for the count in uio_resid.
507  */
int
soreceive(so, paddr, uio, mp0, controlp, flagsp)
	register struct socket *so;
	struct mbuf **paddr;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	register struct mbuf *m, **mp;
	register int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (paddr)
		*paddr = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	/* Out-of-band data is fetched from the protocol, not the sockbuf. */
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB,
		    m, (struct mbuf *)(flags & MSG_PEEK), (struct mbuf *)0);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	/* Confirm a pending connection before the first real read. */
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0);

restart:
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);
	s = splnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		/* Deliver queued data before reporting a pending error. */
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		/* A record boundary or OOB mark must be delivered promptly. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
	nextrecord = m->m_nextpkt;
	/* Records for address-bearing protocols start with an MT_SONAME mbuf. */
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
	}
	/* Then any ancillary (control) mbufs, externalizing passed rights. */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
				   error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	moff = 0;
	offset = 0;
	/* Main copy loop: one mbuf (or part of one) per iteration. */
	while (m && uio->uio_resid > 0 && error == 0) {
		/* Never mix OOB and regular data in one call. */
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		/* Stop short of the out-of-band mark. */
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splnet();
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* Consumed the whole mbuf. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			/* Partial mbuf: advance within it. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/* Short count without error by design (see above). */
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			m = so->so_rcv.sb_mb;
			if (m)
				nextrecord = m->m_nextpkt;
		}
	}

	/* For atomic protocols, discard any unread remainder of the record. */
	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		/* Tell the protocol we consumed data (e.g. to open a window). */
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
			    (struct mbuf *)flags, (struct mbuf *)0);
	}
	/* Nothing transferred at all: go back and wait for data. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
807 
808 int
809 soshutdown(so, how)
810 	register struct socket *so;
811 	register int how;
812 {
813 	register struct protosw *pr = so->so_proto;
814 
815 	how++;
816 	if (how & FREAD)
817 		sorflush(so);
818 	if (how & FWRITE)
819 		return ((*pr->pr_usrreq)(so, PRU_SHUTDOWN,
820 		    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0));
821 	return (0);
822 }
823 
/*
 * Flush and tear down a socket's receive buffer, disposing of any
 * in-transit access rights held in it.
 */
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	/* Take the buffer lock uninterruptibly. */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	/*
	 * Snapshot the sockbuf and zero the original, so the socket
	 * presents an empty buffer while the snapshot is disposed of
	 * below at normal priority.
	 */
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	/* Let the domain reclaim any passed descriptors before freeing. */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
845 
/*
 * Set a socket option.  Socket-level (SOL_SOCKET) options are handled
 * here; anything else is forwarded to the protocol's ctloutput.  The
 * option mbuf m0 is consumed on all paths.
 */
int
sosetopt(so, level, optname, m0)
	register struct socket *so;
	int level, optname;
	struct mbuf *m0;
{
	int error = 0;
	register struct mbuf *m = m0;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */

		/* Boolean options: each is a single bit in so_options. */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) *mtod(m, int *)) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			case SO_SNDLOWAT:
				so->so_snd.sb_lowat = *mtod(m, int *);
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat = *mtod(m, int *);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof (*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			/* Timeouts are kept in ticks in a short; reject overflow. */
			if (tv->tv_sec > SHRT_MAX / hz - hz) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Also let the protocol see successful socket-level options. */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}
962 
/*
 * Get a socket option.  Socket-level options are answered here from
 * the socket itself; others are forwarded to the protocol.  On success
 * a freshly allocated mbuf holding the value is returned via *mp.
 */
int
sogetopt(so, level, optname, mp)
	register struct socket *so;
	int level, optname;
	struct mbuf **mp;
{
	register struct mbuf *m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_GETOPT, so, level, optname, mp));
		} else
			return (ENOPROTOOPT);
	} else {
		/* Most answers are a single int; cases override m_len as needed. */
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		/* Boolean options: report the bit from so_options. */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the pending error clears it. */
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			/* Convert the tick count back to a timeval. */
			int val = (optname == SO_SNDTIMEO ?
			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}
1047 
1048 void
1049 sohasoutofband(so)
1050 	register struct socket *so;
1051 {
1052 	struct proc *p;
1053 
1054 	if (so->so_pgid < 0)
1055 		gsignal(-so->so_pgid, SIGURG);
1056 	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1057 		psignal(p, SIGURG);
1058 	selwakeup(&so->so_rcv.sb_sel);
1059 }
1060