xref: /freebsd/sys/kern/uipc_socket.c (revision 61afd5bb22d787b0641523e7b9b95c964d669bd5)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
34  * $Id: uipc_socket.c,v 1.20 1996/10/07 04:32:26 pst Exp $
35  */
36 
37 #include <sys/param.h>
38 #include <sys/queue.h>
39 #include <sys/systm.h>
40 #include <sys/proc.h>
41 #include <sys/file.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/domain.h>
45 #include <sys/kernel.h>
46 #include <sys/protosw.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/resourcevar.h>
50 #include <sys/signalvar.h>
51 #include <sys/sysctl.h>
52 
/*
 * Upper bound for listen(2) backlogs; run-time tunable through the
 * kern.somaxconn sysctl (read-write).
 */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern, KERN_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, 0, "");
55 
56 /*
57  * Socket operation routines.
58  * These routines are called by the routines in
59  * sys_socket.c or from a system process, and
60  * implement the semantics of socket operations by
61  * switching out to the protocol specific routines.
62  */
/*
 * Create a new socket of the given "type" in domain "dom" and return it
 * through *aso.  If "proto" is nonzero the protocol is looked up by
 * number, otherwise the first protocol matching "type" in the domain is
 * used.  The protocol's attach routine is called to allocate its
 * control block; on attach failure the half-built socket is reclaimed
 * via sofree().  Returns 0 on success or an errno value.
 */
/*ARGSUSED*/
int
socreate(dom, aso, type, proto, p)
	int dom;
	struct socket **aso;
	register int type;
	int proto;
	struct proc *p;
{
	register struct protosw *prp;
	register struct socket *so;
	register int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreqs == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
	bzero((caddr_t)so, sizeof(*so));
	/* Queues of incomplete and completed incoming connections. */
	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	/* Sockets created by the superuser carry the privileged state bit. */
	if (p->p_ucred->cr_uid == 0)
		so->so_state = SS_PRIV;
	so->so_proto = prp;
	error = (*prp->pr_usrreqs->pru_attach)(so, proto);
	if (error) {
		/*
		 * No file descriptor references this socket yet;
		 * setting SS_NOFDREF lets sofree() release it now.
		 */
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return (error);
	}
	*aso = so;
	return (0);
}
101 
102 int
103 sobind(so, nam)
104 	struct socket *so;
105 	struct mbuf *nam;
106 {
107 	int s = splnet();
108 	int error;
109 
110 	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam);
111 	splx(s);
112 	return (error);
113 }
114 
115 int
116 solisten(so, backlog)
117 	register struct socket *so;
118 	int backlog;
119 {
120 	int s = splnet(), error;
121 
122 	error = (*so->so_proto->pr_usrreqs->pru_listen)(so);
123 	if (error) {
124 		splx(s);
125 		return (error);
126 	}
127 	if (so->so_comp.tqh_first == NULL)
128 		so->so_options |= SO_ACCEPTCONN;
129 	if (backlog < 0 || backlog > somaxconn)
130 		backlog = somaxconn;
131 	so->so_qlimit = backlog;
132 	splx(s);
133 	return (0);
134 }
135 
/*
 * Release a dead socket: one with neither a protocol control block nor
 * a file-descriptor reference.  If the socket is still queued on a
 * listening ("head") socket it is unlinked from the incomplete or
 * complete connection queue first; then both socket buffers are torn
 * down and the socket structure itself is freed.  Returns silently if
 * the socket is still in use.
 */
void
sofree(so)
	register struct socket *so;
{
	struct socket *head = so->so_head;

	/* Still attached to a protocol, or an fd still references it. */
	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (head != NULL) {
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			TAILQ_REMOVE(&head->so_comp, so, so_list);
		} else {
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~(SS_INCOMP|SS_COMP);
		so->so_head = NULL;
	}
	/* Drop buffered send data, then flush and release the receive side. */
	sbrelease(&so->so_snd);
	sorflush(so);
	FREE(so, M_SOCKET);
}
161 
/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 * Returns the first error from the disconnect, linger sleep, or
 * protocol detach; the socket is handed to sofree() regardless.
 */
int
soclose(so)
	register struct socket *so;
{
	int s = splnet();		/* conservative */
	int error = 0;

	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp, *sonext;

		/* Abort every connection still queued on this listener. */
		for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			(void) soabort(sp);
		}
		for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			(void) soabort(sp);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking lingering close: don't wait. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/* Sleep (up to so_linger) until fully disconnected. */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		/* Detach the protocol; preserve any earlier error. */
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
220 
221 /*
222  * Must be called at splnet...
223  */
224 int
225 soabort(so)
226 	struct socket *so;
227 {
228 
229 	return (*so->so_proto->pr_usrreqs->pru_abort)(so);
230 }
231 
232 int
233 soaccept(so, nam)
234 	register struct socket *so;
235 	struct mbuf *nam;
236 {
237 	int s = splnet();
238 	int error;
239 
240 	if ((so->so_state & SS_NOFDREF) == 0)
241 		panic("soaccept: !NOFDREF");
242 	so->so_state &= ~SS_NOFDREF;
243 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
244 	splx(s);
245 	return (error);
246 }
247 
248 int
249 soconnect(so, nam)
250 	register struct socket *so;
251 	struct mbuf *nam;
252 {
253 	int s;
254 	int error;
255 
256 	if (so->so_options & SO_ACCEPTCONN)
257 		return (EOPNOTSUPP);
258 	s = splnet();
259 	/*
260 	 * If protocol is connection-based, can only connect once.
261 	 * Otherwise, if connected, try to disconnect first.
262 	 * This allows user to disconnect by connecting to, e.g.,
263 	 * a null address.
264 	 */
265 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
266 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
267 	    (error = sodisconnect(so))))
268 		error = EISCONN;
269 	else
270 		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam);
271 	splx(s);
272 	return (error);
273 }
274 
275 int
276 soconnect2(so1, so2)
277 	register struct socket *so1;
278 	struct socket *so2;
279 {
280 	int s = splnet();
281 	int error;
282 
283 	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
284 	splx(s);
285 	return (error);
286 }
287 
288 int
289 sodisconnect(so)
290 	register struct socket *so;
291 {
292 	int s = splnet();
293 	int error;
294 
295 	if ((so->so_state & SS_ISCONNECTED) == 0) {
296 		error = ENOTCONN;
297 		goto bad;
298 	}
299 	if (so->so_state & SS_ISDISCONNECTING) {
300 		error = EALREADY;
301 		goto bad;
302 	}
303 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
304 bad:
305 	splx(s);
306 	return (error);
307 }
308 
/*
 * Sleep mode for socket-buffer locking: don't block when the caller
 * asked for a non-blocking (MSG_DONTWAIT) operation.
 */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(so, addr, uio, top, control, flags)
	register struct socket *so;
	struct mbuf *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
{
	struct proc *p = curproc;		/* XXX */
	struct mbuf **mp;
	register struct mbuf *m;
	register long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0)
		return (EINVAL);
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
/* Common error exit: record errno, restore spl, free buffers. */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	/* Serialize with other senders on this socket. */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error)
			snderr(so->so_error);
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				   ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		/* Out-of-band data is granted a little extra slack. */
		if (flags & MSG_OOB)
			space += 1024;
		/* A request that can never fit is a hard error. */
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		/* Not enough room now: fail if non-blocking, else sleep. */
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
			/* Copy user data into a freshly built mbuf chain. */
			if (top == 0) {
				MGETHDR(m, M_WAIT, MT_DATA);
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_WAIT, MT_DATA);
				mlen = MLEN;
			}
			if (resid >= MINCLSIZE) {
				MCLGET(m, M_WAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
				len = min(min(mlen, resid), space);
			} else {
nopages:
				len = min(min(mlen, resid), space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			space -= len;
			error = uiomove(mtod(m, caddr_t), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    s = splnet();				/* XXX */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF : 0,
			top, addr, control);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    clen = 0;
		    control = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
495 
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(so, paddr, uio, mp0, controlp, flagsp)
	register struct socket *so;
	struct mbuf **paddr;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	register struct mbuf *m, **mp;
	register int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (paddr)
		*paddr = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	/* Out-of-band data is fetched straight from the protocol. */
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

restart:
	/* Lock the receive buffer against other readers. */
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);
	s = splnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		/* OOB data or a record end means we can deliver now. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		/* Drop the lock and sleep for more data, then retry. */
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
	nextrecord = m->m_nextpkt;
	/* For address-bearing protocols the record starts with the address. */
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
	}
	/* Next come any ancillary-data (control) mbufs. */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
				   error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	moff = 0;
	offset = 0;
	/* Main copy loop over the data mbufs of the record. */
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splnet();
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			m = so->so_rcv.sb_mb;
			if (m)
				nextrecord = m->m_nextpkt;
		}
	}

	/* Atomic protocols: an unread record remainder is dropped. */
	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}
	/* Nothing transferred and still readable: try again. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
810 
811 int
812 soshutdown(so, how)
813 	register struct socket *so;
814 	register int how;
815 {
816 	register struct protosw *pr = so->so_proto;
817 
818 	how++;
819 	if (how & FREAD)
820 		sorflush(so);
821 	if (how & FWRITE)
822 		return ((*pr->pr_usrreqs->pru_shutdown)(so));
823 	return (0);
824 }
825 
/*
 * Flush a socket's receive buffer (used on close and read-side
 * shutdown).  The buffer is locked uninterruptibly, the socket is
 * marked unable to receive more, and then the buffer is snapshotted
 * into a local copy and zeroed in place at splimp so concurrent
 * accessors see an empty buffer.  The snapshot's contents -- possibly
 * including in-transit access rights, which the domain must dispose
 * of -- are released outside the critical section.
 */
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	sb->sb_flags |= SB_NOINTR;	/* lock must not be interruptible */
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	/* Snapshot the buffer, then clear the original atomically. */
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	/* Let the domain reclaim any rights (e.g. descriptors) in transit. */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
847 
/*
 * Set a socket option.  Levels other than SOL_SOCKET are forwarded to
 * the protocol's ctloutput routine.  The option mbuf m0 is consumed:
 * freed here on socket-level paths, or by the protocol otherwise.
 * Returns 0 or an errno value.
 */
int
sosetopt(so, level, optname, m0)
	register struct socket *so;
	int level, optname;
	struct mbuf *m0;
{
	int error = 0;
	register struct mbuf *m = m0;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */

		/* Boolean options: the option bit mirrors the optname. */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) *mtod(m, int *)) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			case SO_SNDLOWAT:
				so->so_snd.sb_lowat = *mtod(m, int *);
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat = *mtod(m, int *);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof (*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			/* Reject timeouts whose tick count overflows a short. */
			if (tv->tv_sec > SHRT_MAX / hz - hz) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		case SO_PRIVSTATE:
			/* we don't care what the parameter is... */
			so->so_state &= ~SS_PRIV;
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Give the protocol a chance to act on the option too. */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}
970 
/*
 * Retrieve a socket option into a freshly allocated mbuf returned via
 * *mp.  Levels other than SOL_SOCKET are forwarded to the protocol's
 * ctloutput routine.  Returns 0 or an errno value.
 */
int
sogetopt(so, level, optname, mp)
	register struct socket *so;
	int level, optname;
	struct mbuf **mp;
{
	register struct mbuf *m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_GETOPT, so, level, optname, mp));
		} else
			return (ENOPROTOOPT);
	} else {
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof (int);	/* most options are ints */

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		/* Boolean options report the corresponding option bit. */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_PRIVSTATE:
			*mtod(m, int *) = so->so_state & SS_PRIV;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the pending error clears it. */
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			/* Convert the stored tick count back to a timeval. */
			int val = (optname == SO_SNDTIMEO ?
			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}
1060 
1061 void
1062 sohasoutofband(so)
1063 	register struct socket *so;
1064 {
1065 	struct proc *p;
1066 
1067 	if (so->so_pgid < 0)
1068 		gsignal(-so->so_pgid, SIGURG);
1069 	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1070 		psignal(p, SIGURG);
1071 	selwakeup(&so->so_rcv.sb_sel);
1072 }
1073