xref: /freebsd/sys/kern/uipc_socket.c (revision df7f5d4de4592a8948a25ce01e5bddfbb7ce39dc)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
34  *	$Id: uipc_socket.c,v 1.23 1997/02/22 09:39:28 peter Exp $
35  */
36 
37 #include <sys/param.h>
38 #include <sys/queue.h>
39 #include <sys/systm.h>
40 #include <sys/proc.h>
41 #include <sys/file.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/domain.h>
45 #include <sys/kernel.h>
46 #include <sys/protosw.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/resourcevar.h>
50 #include <sys/signalvar.h>
51 #include <sys/sysctl.h>
52 
/*
 * Upper bound on the listen(2) backlog accepted by solisten();
 * tunable at run time through the kern.ipc.somaxconn sysctl.
 */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
	   0, "");
56 
57 /*
58  * Socket operation routines.
59  * These routines are called by the routines in
60  * sys_socket.c or from a system process, and
61  * implement the semantics of socket operations by
62  * switching out to the protocol specific routines.
63  */
64 /*ARGSUSED*/
65 int
66 socreate(dom, aso, type, proto, p)
67 	int dom;
68 	struct socket **aso;
69 	register int type;
70 	int proto;
71 	struct proc *p;
72 {
73 	register struct protosw *prp;
74 	register struct socket *so;
75 	register int error;
76 
77 	if (proto)
78 		prp = pffindproto(dom, proto, type);
79 	else
80 		prp = pffindtype(dom, type);
81 	if (prp == 0 || prp->pr_usrreqs == 0)
82 		return (EPROTONOSUPPORT);
83 	if (prp->pr_type != type)
84 		return (EPROTOTYPE);
85 	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
86 	bzero((caddr_t)so, sizeof(*so));
87 	TAILQ_INIT(&so->so_incomp);
88 	TAILQ_INIT(&so->so_comp);
89 	so->so_type = type;
90 	if (p->p_ucred->cr_uid == 0)
91 		so->so_state = SS_PRIV;
92 	so->so_proto = prp;
93 	error = (*prp->pr_usrreqs->pru_attach)(so, proto);
94 	if (error) {
95 		so->so_state |= SS_NOFDREF;
96 		sofree(so);
97 		return (error);
98 	}
99 	*aso = so;
100 	return (0);
101 }
102 
103 int
104 sobind(so, nam)
105 	struct socket *so;
106 	struct mbuf *nam;
107 {
108 	int s = splnet();
109 	int error;
110 
111 	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam);
112 	splx(s);
113 	return (error);
114 }
115 
116 int
117 solisten(so, backlog)
118 	register struct socket *so;
119 	int backlog;
120 {
121 	int s = splnet(), error;
122 
123 	error = (*so->so_proto->pr_usrreqs->pru_listen)(so);
124 	if (error) {
125 		splx(s);
126 		return (error);
127 	}
128 	if (so->so_comp.tqh_first == NULL)
129 		so->so_options |= SO_ACCEPTCONN;
130 	if (backlog < 0 || backlog > somaxconn)
131 		backlog = somaxconn;
132 	so->so_qlimit = backlog;
133 	splx(s);
134 	return (0);
135 }
136 
137 void
138 sofree(so)
139 	register struct socket *so;
140 {
141 	struct socket *head = so->so_head;
142 
143 	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
144 		return;
145 	if (head != NULL) {
146 		if (so->so_state & SS_INCOMP) {
147 			TAILQ_REMOVE(&head->so_incomp, so, so_list);
148 			head->so_incqlen--;
149 		} else if (so->so_state & SS_COMP) {
150 			TAILQ_REMOVE(&head->so_comp, so, so_list);
151 		} else {
152 			panic("sofree: not queued");
153 		}
154 		head->so_qlen--;
155 		so->so_state &= ~(SS_INCOMP|SS_COMP);
156 		so->so_head = NULL;
157 	}
158 	sbrelease(&so->so_snd);
159 	sorflush(so);
160 	FREE(so, M_SOCKET);
161 }
162 
163 /*
164  * Close a socket on last file table reference removal.
165  * Initiate disconnect if connected.
166  * Free socket when disconnect complete.
167  */
int
soclose(so)
	register struct socket *so;
{
	int s = splnet();		/* conservative */
	int error = 0;

	/* A listening socket first aborts every queued connection. */
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp, *sonext;

		/* soabort() may free sp, so fetch the next link first. */
		for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			(void) soabort(sp);
		}
		for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			(void) soabort(sp);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking linger: do not wait at all. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/* Sleep up to so_linger ticks for the disconnect. */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		/* Detach the protocol; preserve the first error seen. */
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
221 
222 /*
223  * Must be called at splnet...
224  */
225 int
226 soabort(so)
227 	struct socket *so;
228 {
229 
230 	return (*so->so_proto->pr_usrreqs->pru_abort)(so);
231 }
232 
233 int
234 soaccept(so, nam)
235 	register struct socket *so;
236 	struct mbuf *nam;
237 {
238 	int s = splnet();
239 	int error;
240 
241 	if ((so->so_state & SS_NOFDREF) == 0)
242 		panic("soaccept: !NOFDREF");
243 	so->so_state &= ~SS_NOFDREF;
244 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
245 	splx(s);
246 	return (error);
247 }
248 
249 int
250 soconnect(so, nam)
251 	register struct socket *so;
252 	struct mbuf *nam;
253 {
254 	int s;
255 	int error;
256 
257 	if (so->so_options & SO_ACCEPTCONN)
258 		return (EOPNOTSUPP);
259 	s = splnet();
260 	/*
261 	 * If protocol is connection-based, can only connect once.
262 	 * Otherwise, if connected, try to disconnect first.
263 	 * This allows user to disconnect by connecting to, e.g.,
264 	 * a null address.
265 	 */
266 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
267 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
268 	    (error = sodisconnect(so))))
269 		error = EISCONN;
270 	else
271 		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam);
272 	splx(s);
273 	return (error);
274 }
275 
276 int
277 soconnect2(so1, so2)
278 	register struct socket *so1;
279 	struct socket *so2;
280 {
281 	int s = splnet();
282 	int error;
283 
284 	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
285 	splx(s);
286 	return (error);
287 }
288 
289 int
290 sodisconnect(so)
291 	register struct socket *so;
292 {
293 	int s = splnet();
294 	int error;
295 
296 	if ((so->so_state & SS_ISCONNECTED) == 0) {
297 		error = ENOTCONN;
298 		goto bad;
299 	}
300 	if (so->so_state & SS_ISDISCONNECTING) {
301 		error = EALREADY;
302 		goto bad;
303 	}
304 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
305 bad:
306 	splx(s);
307 	return (error);
308 }
309 
/* Pick the sockbuf-lock wait mode from the send/recv flags. */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(so, addr, uio, top, control, flags)
	register struct socket *so;
	struct mbuf *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
{
	struct proc *p = curproc;		/* XXX */
	struct mbuf **mp;
	register struct mbuf *m;
	register long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	/* atomic: the record must reach the protocol in one piece. */
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0)
		return (EINVAL);
	/* Per-send route bypass applies only to atomic protocols. */
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
/* Leave the splnet section with the given errno. */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	/* Serialize against other senders; may sleep unless MSG_DONTWAIT. */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error)
			snderr(so->so_error);
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				   ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		/* A record that can never fit is a hard error. */
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			/* Drop the sender lock while waiting for room. */
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
			/* Copy user data into a freshly built mbuf chain. */
			if (top == 0) {
				MGETHDR(m, M_WAIT, MT_DATA);
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_WAIT, MT_DATA);
				mlen = MLEN;
			}
			if (resid >= MINCLSIZE) {
				/* Worth a cluster; fall back if none came. */
				MCLGET(m, M_WAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
				len = min(min(mlen, resid), space);
			} else {
nopages:
				len = min(min(mlen, resid), space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			space -= len;
			error = uiomove(mtod(m, caddr_t), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    s = splnet();				/* XXX */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF : 0,
			top, addr, control);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    /* The protocol now owns top and control; don't refree. */
		    clen = 0;
		    control = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
496 
497 /*
498  * Implement receive operations on a socket.
499  * We depend on the way that records are added to the sockbuf
500  * by sbappend*.  In particular, each record (mbufs linked through m_next)
501  * must begin with an address if the protocol so specifies,
502  * followed by an optional mbuf or mbufs containing ancillary data,
503  * and then zero or more mbufs of data.
504  * In order to avoid blocking network interrupts for the entire time here,
505  * we splx() while doing the actual copy to user space.
506  * Although the sockbuf is locked, new data may still be appended,
507  * and thus we must maintain consistency of the sockbuf during that time.
508  *
509  * The caller may receive the data as a single mbuf chain by supplying
510  * an mbuf **mp0 for use in returning the chain.  The uio is then used
511  * only for the count in uio_resid.
512  */
513 int
514 soreceive(so, paddr, uio, mp0, controlp, flagsp)
515 	register struct socket *so;
516 	struct mbuf **paddr;
517 	struct uio *uio;
518 	struct mbuf **mp0;
519 	struct mbuf **controlp;
520 	int *flagsp;
521 {
522 	register struct mbuf *m, **mp;
523 	register int flags, len, error, s, offset;
524 	struct protosw *pr = so->so_proto;
525 	struct mbuf *nextrecord;
526 	int moff, type = 0;
527 	int orig_resid = uio->uio_resid;
528 
529 	mp = mp0;
530 	if (paddr)
531 		*paddr = 0;
532 	if (controlp)
533 		*controlp = 0;
534 	if (flagsp)
535 		flags = *flagsp &~ MSG_EOR;
536 	else
537 		flags = 0;
538 	if (flags & MSG_OOB) {
539 		m = m_get(M_WAIT, MT_DATA);
540 		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
541 		if (error)
542 			goto bad;
543 		do {
544 			error = uiomove(mtod(m, caddr_t),
545 			    (int) min(uio->uio_resid, m->m_len), uio);
546 			m = m_free(m);
547 		} while (uio->uio_resid && error == 0 && m);
548 bad:
549 		if (m)
550 			m_freem(m);
551 		return (error);
552 	}
553 	if (mp)
554 		*mp = (struct mbuf *)0;
555 	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
556 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
557 
558 restart:
559 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
560 	if (error)
561 		return (error);
562 	s = splnet();
563 
564 	m = so->so_rcv.sb_mb;
565 	/*
566 	 * If we have less data than requested, block awaiting more
567 	 * (subject to any timeout) if:
568 	 *   1. the current count is less than the low water mark, or
569 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
570 	 *	receive operation at once if we block (resid <= hiwat).
571 	 *   3. MSG_DONTWAIT is not set
572 	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
573 	 * we have to do the receive in sections, and thus risk returning
574 	 * a short count if a timeout or signal occurs after we start.
575 	 */
576 	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
577 	    so->so_rcv.sb_cc < uio->uio_resid) &&
578 	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
579 	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
580 	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
581 #ifdef DIAGNOSTIC
582 		if (m == 0 && so->so_rcv.sb_cc)
583 			panic("receive 1");
584 #endif
585 		if (so->so_error) {
586 			if (m)
587 				goto dontblock;
588 			error = so->so_error;
589 			if ((flags & MSG_PEEK) == 0)
590 				so->so_error = 0;
591 			goto release;
592 		}
593 		if (so->so_state & SS_CANTRCVMORE) {
594 			if (m)
595 				goto dontblock;
596 			else
597 				goto release;
598 		}
599 		for (; m; m = m->m_next)
600 			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
601 				m = so->so_rcv.sb_mb;
602 				goto dontblock;
603 			}
604 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
605 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
606 			error = ENOTCONN;
607 			goto release;
608 		}
609 		if (uio->uio_resid == 0)
610 			goto release;
611 		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
612 			error = EWOULDBLOCK;
613 			goto release;
614 		}
615 		sbunlock(&so->so_rcv);
616 		error = sbwait(&so->so_rcv);
617 		splx(s);
618 		if (error)
619 			return (error);
620 		goto restart;
621 	}
622 dontblock:
623 	if (uio->uio_procp)
624 		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
625 	nextrecord = m->m_nextpkt;
626 	if (pr->pr_flags & PR_ADDR) {
627 #ifdef DIAGNOSTIC
628 		if (m->m_type != MT_SONAME)
629 			panic("receive 1a");
630 #endif
631 		orig_resid = 0;
632 		if (flags & MSG_PEEK) {
633 			if (paddr)
634 				*paddr = m_copy(m, 0, m->m_len);
635 			m = m->m_next;
636 		} else {
637 			sbfree(&so->so_rcv, m);
638 			if (paddr) {
639 				*paddr = m;
640 				so->so_rcv.sb_mb = m->m_next;
641 				m->m_next = 0;
642 				m = so->so_rcv.sb_mb;
643 			} else {
644 				MFREE(m, so->so_rcv.sb_mb);
645 				m = so->so_rcv.sb_mb;
646 			}
647 		}
648 	}
649 	while (m && m->m_type == MT_CONTROL && error == 0) {
650 		if (flags & MSG_PEEK) {
651 			if (controlp)
652 				*controlp = m_copy(m, 0, m->m_len);
653 			m = m->m_next;
654 		} else {
655 			sbfree(&so->so_rcv, m);
656 			if (controlp) {
657 				if (pr->pr_domain->dom_externalize &&
658 				    mtod(m, struct cmsghdr *)->cmsg_type ==
659 				    SCM_RIGHTS)
660 				   error = (*pr->pr_domain->dom_externalize)(m);
661 				*controlp = m;
662 				so->so_rcv.sb_mb = m->m_next;
663 				m->m_next = 0;
664 				m = so->so_rcv.sb_mb;
665 			} else {
666 				MFREE(m, so->so_rcv.sb_mb);
667 				m = so->so_rcv.sb_mb;
668 			}
669 		}
670 		if (controlp) {
671 			orig_resid = 0;
672 			controlp = &(*controlp)->m_next;
673 		}
674 	}
675 	if (m) {
676 		if ((flags & MSG_PEEK) == 0)
677 			m->m_nextpkt = nextrecord;
678 		type = m->m_type;
679 		if (type == MT_OOBDATA)
680 			flags |= MSG_OOB;
681 	}
682 	moff = 0;
683 	offset = 0;
684 	while (m && uio->uio_resid > 0 && error == 0) {
685 		if (m->m_type == MT_OOBDATA) {
686 			if (type != MT_OOBDATA)
687 				break;
688 		} else if (type == MT_OOBDATA)
689 			break;
690 #ifdef DIAGNOSTIC
691 		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
692 			panic("receive 3");
693 #endif
694 		so->so_state &= ~SS_RCVATMARK;
695 		len = uio->uio_resid;
696 		if (so->so_oobmark && len > so->so_oobmark - offset)
697 			len = so->so_oobmark - offset;
698 		if (len > m->m_len - moff)
699 			len = m->m_len - moff;
700 		/*
701 		 * If mp is set, just pass back the mbufs.
702 		 * Otherwise copy them out via the uio, then free.
703 		 * Sockbuf must be consistent here (points to current mbuf,
704 		 * it points to next record) when we drop priority;
705 		 * we must note any additions to the sockbuf when we
706 		 * block interrupts again.
707 		 */
708 		if (mp == 0) {
709 			splx(s);
710 			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
711 			s = splnet();
712 			if (error)
713 				goto release;
714 		} else
715 			uio->uio_resid -= len;
716 		if (len == m->m_len - moff) {
717 			if (m->m_flags & M_EOR)
718 				flags |= MSG_EOR;
719 			if (flags & MSG_PEEK) {
720 				m = m->m_next;
721 				moff = 0;
722 			} else {
723 				nextrecord = m->m_nextpkt;
724 				sbfree(&so->so_rcv, m);
725 				if (mp) {
726 					*mp = m;
727 					mp = &m->m_next;
728 					so->so_rcv.sb_mb = m = m->m_next;
729 					*mp = (struct mbuf *)0;
730 				} else {
731 					MFREE(m, so->so_rcv.sb_mb);
732 					m = so->so_rcv.sb_mb;
733 				}
734 				if (m)
735 					m->m_nextpkt = nextrecord;
736 			}
737 		} else {
738 			if (flags & MSG_PEEK)
739 				moff += len;
740 			else {
741 				if (mp)
742 					*mp = m_copym(m, 0, len, M_WAIT);
743 				m->m_data += len;
744 				m->m_len -= len;
745 				so->so_rcv.sb_cc -= len;
746 			}
747 		}
748 		if (so->so_oobmark) {
749 			if ((flags & MSG_PEEK) == 0) {
750 				so->so_oobmark -= len;
751 				if (so->so_oobmark == 0) {
752 					so->so_state |= SS_RCVATMARK;
753 					break;
754 				}
755 			} else {
756 				offset += len;
757 				if (offset == so->so_oobmark)
758 					break;
759 			}
760 		}
761 		if (flags & MSG_EOR)
762 			break;
763 		/*
764 		 * If the MSG_WAITALL flag is set (for non-atomic socket),
765 		 * we must not quit until "uio->uio_resid == 0" or an error
766 		 * termination.  If a signal/timeout occurs, return
767 		 * with a short count but without error.
768 		 * Keep sockbuf locked against other readers.
769 		 */
770 		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
771 		    !sosendallatonce(so) && !nextrecord) {
772 			if (so->so_error || so->so_state & SS_CANTRCVMORE)
773 				break;
774 			error = sbwait(&so->so_rcv);
775 			if (error) {
776 				sbunlock(&so->so_rcv);
777 				splx(s);
778 				return (0);
779 			}
780 			m = so->so_rcv.sb_mb;
781 			if (m)
782 				nextrecord = m->m_nextpkt;
783 		}
784 	}
785 
786 	if (m && pr->pr_flags & PR_ATOMIC) {
787 		flags |= MSG_TRUNC;
788 		if ((flags & MSG_PEEK) == 0)
789 			(void) sbdroprecord(&so->so_rcv);
790 	}
791 	if ((flags & MSG_PEEK) == 0) {
792 		if (m == 0)
793 			so->so_rcv.sb_mb = nextrecord;
794 		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
795 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
796 	}
797 	if (orig_resid == uio->uio_resid && orig_resid &&
798 	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
799 		sbunlock(&so->so_rcv);
800 		splx(s);
801 		goto restart;
802 	}
803 
804 	if (flagsp)
805 		*flagsp |= flags;
806 release:
807 	sbunlock(&so->so_rcv);
808 	splx(s);
809 	return (error);
810 }
811 
812 int
813 soshutdown(so, how)
814 	register struct socket *so;
815 	register int how;
816 {
817 	register struct protosw *pr = so->so_proto;
818 
819 	how++;
820 	if (how & FREAD)
821 		sorflush(so);
822 	if (how & FWRITE)
823 		return ((*pr->pr_usrreqs->pru_shutdown)(so));
824 	return (0);
825 }
826 
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	/* Take the lock uninterruptibly; we are tearing the buffer down. */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	/*
	 * Snapshot the sockbuf and zero the original at high spl so
	 * interrupt-level appends see an empty, consistent buffer;
	 * the snapshot is disposed of afterwards at low spl.
	 */
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	/* Let the domain reclaim rights (e.g. passed descriptors) first. */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
848 
int
sosetopt(so, level, optname, m0)
	register struct socket *so;
	int level, optname;
	struct mbuf *m0;
{
	int error = 0;
	register struct mbuf *m = m0;

	/* Non-socket-level options go straight to the protocol. */
	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */

		/* Boolean options stored directly in so_options. */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		/* Integer-valued buffer/watermark options. */
		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) *mtod(m, int *)) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			case SO_SNDLOWAT:
				so->so_snd.sb_lowat = *mtod(m, int *);
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat = *mtod(m, int *);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof (*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			/* sb_timeo is a short tick count; reject overflow. */
			if (tv->tv_sec > SHRT_MAX / hz - hz) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		case SO_PRIVSTATE:
			/* we don't care what the parameter is... */
			so->so_state &= ~SS_PRIV;
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Give the protocol a look at socket-level options too. */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}
971 
int
sogetopt(so, level, optname, mp)
	register struct socket *so;
	int level, optname;
	struct mbuf **mp;
{
	register struct mbuf *m;

	/* Non-socket-level options are answered by the protocol. */
	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_GETOPT, so, level, optname, mp));
		} else
			return (ENOPROTOOPT);
	} else {
		/* Result mbuf; m_len is adjusted per option below. */
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		/* Boolean options: report the so_options bit. */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_PRIVSTATE:
			*mtod(m, int *) = so->so_state & SS_PRIV;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the pending error also clears it. */
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			/* Convert the tick count back to a timeval. */
			int val = (optname == SO_SNDTIMEO ?
			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}
1061 
1062 void
1063 sohasoutofband(so)
1064 	register struct socket *so;
1065 {
1066 	struct proc *p;
1067 
1068 	if (so->so_pgid < 0)
1069 		gsignal(-so->so_pgid, SIGURG);
1070 	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1071 		psignal(p, SIGURG);
1072 	selwakeup(&so->so_rcv.sb_sel);
1073 }
1074