xref: /freebsd/sys/kern/uipc_socket.c (revision 11afcc8f9f96d657b8e6f7547c02c1957331fc96)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
34  *	$Id: uipc_socket.c,v 1.41 1998/07/06 19:27:14 fenner Exp $
35  */
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/proc.h>
40 #include <sys/fcntl.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/domain.h>
44 #include <sys/kernel.h>
45 #include <sys/poll.h>
46 #include <sys/protosw.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/resourcevar.h>
50 #include <sys/signalvar.h>
51 #include <sys/sysctl.h>
52 #include <sys/uio.h>
53 #include <vm/vm_zone.h>
54 
55 #include <machine/limits.h>
56 
57 struct	vm_zone *socket_zone;
58 so_gen_t	so_gencnt;	/* generation count for sockets */
59 
60 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
61 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
62 
63 static int somaxconn = SOMAXCONN;
64 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
65 	   0, "");
66 
67 /*
68  * Socket operation routines.
69  * These routines are called by the routines in
70  * sys_socket.c or from a system process, and
71  * implement the semantics of socket operations by
72  * switching out to the protocol specific routines.
73  */
74 
75 /*
76  * Get a socket structure from our zone, and initialize it.
77  * We don't implement `waitok' yet (see comments in uipc_domain.c).
78  * Note that it would probably be better to allocate socket
79  * and PCB at the same time, but I'm not convinced that all
80  * the protocols can be easily modified to do this.
81  */
82 struct socket *
83 soalloc(waitok)
84 	int waitok;
85 {
86 	struct socket *so;
87 
88 	so = zalloci(socket_zone);
89 	if (so) {
90 		/* XXX race condition for reentrant kernel */
91 		bzero(so, sizeof *so);
92 		so->so_gencnt = ++so_gencnt;
93 		so->so_zone = socket_zone;
94 	}
95 	return so;
96 }
97 
/*
 * Create a new socket of the given domain, type and protocol, and
 * attach the protocol's control block.  On success the socket is
 * returned through *aso; on failure an errno value is returned and
 * no socket is allocated.  `p' may be NULL when called from within
 * the kernel (no credentials recorded in that case).
 */
int
socreate(dom, aso, type, proto, p)
	int dom;
	struct socket **aso;
	register int type;
	int proto;
	struct proc *p;
{
	register struct protosw *prp;
	register struct socket *so;
	register int error;

	/*
	 * Look up the protocol switch entry: by explicit protocol
	 * number if one was supplied, otherwise by socket type alone.
	 */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(p != 0);
	if (so == 0)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	if (p != 0)
		so->so_uid = p->p_ucred->cr_uid;
	so->so_proto = prp;
	/*
	 * Let the protocol attach its PCB.  If that fails, mark the
	 * socket as having no file-descriptor reference so that
	 * sofree() will actually release it.
	 */
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error) {
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return (error);
	}
	*aso = so;
	return (0);
}
137 
138 int
139 sobind(so, nam, p)
140 	struct socket *so;
141 	struct sockaddr *nam;
142 	struct proc *p;
143 {
144 	int s = splnet();
145 	int error;
146 
147 	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
148 	splx(s);
149 	return (error);
150 }
151 
/*
 * Return a socket to its zone.  The generation count is bumped first
 * so that stale references (e.g. via sysctl pcb listings) can detect
 * that the slot has been reused.
 */
void
sodealloc(so)
	struct socket *so;
{
	so->so_gencnt = ++so_gencnt;
	zfreei(so->so_zone, so);
}
159 
/*
 * Mark the socket as willing to accept connections and set its
 * backlog.  SO_ACCEPTCONN is only set when the completed-connection
 * queue is currently empty; a backlog outside [0, somaxconn] is
 * clamped to somaxconn.
 */
int
solisten(so, backlog, p)
	register struct socket *so;
	int backlog;
	struct proc *p;
{
	int s, error;

	s = splnet();
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	if (error) {
		splx(s);
		return (error);
	}
	if (so->so_comp.tqh_first == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	splx(s);
	return (0);
}
182 
/*
 * Release a socket, provided it holds no protocol control block and
 * no file-descriptor reference; otherwise this is a no-op and the
 * last reference holder will free it later.  A socket still sitting
 * on a listening socket's incomplete or completed queue is unlinked
 * first; one on neither queue while having a head is a bug (panic).
 */
void
sofree(so)
	register struct socket *so;
{
	struct socket *head = so->so_head;

	/* Bail out while either the PCB or an fd still references us. */
	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (head != NULL) {
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			TAILQ_REMOVE(&head->so_comp, so, so_list);
		} else {
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~(SS_INCOMP|SS_COMP);
		so->so_head = NULL;
	}
	/* Discard buffered data in both directions, then free. */
	sbrelease(&so->so_snd);
	sorflush(so);
	sodealloc(so);
}
208 
209 /*
210  * Close a socket on last file table reference removal.
211  * Initiate disconnect if connected.
212  * Free socket when disconnect complete.
213  */
int
soclose(so)
	register struct socket *so;
{
	int s = splnet();		/* conservative */
	int error = 0;

	/*
	 * A listening socket first aborts every connection still on
	 * its incomplete and completed queues; soabort() may free the
	 * entry, so the next pointer is sampled before each call.
	 */
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp, *sonext;

		for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			(void) soabort(sp);
		}
		for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) {
			sonext = sp->so_list.tqe_next;
			(void) soabort(sp);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		/*
		 * With SO_LINGER set, wait (up to so_linger ticks, or
		 * until a signal) for the disconnect to complete; a
		 * non-blocking socket gives up immediately.
		 */
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		/* Detach the PCB; keep the first error seen. */
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
267 
268 /*
269  * Must be called at splnet...
270  */
/*
 * Abruptly tear down a (usually embryonic) connection via the
 * protocol's pru_abort entry.  Caller must already be at splnet.
 */
int
soabort(so)
	struct socket *so;
{

	return (*so->so_proto->pr_usrreqs->pru_abort)(so);
}
278 
279 int
280 soaccept(so, nam)
281 	register struct socket *so;
282 	struct sockaddr **nam;
283 {
284 	int s = splnet();
285 	int error;
286 
287 	if ((so->so_state & SS_NOFDREF) == 0)
288 		panic("soaccept: !NOFDREF");
289 	so->so_state &= ~SS_NOFDREF;
290 	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
291 	splx(s);
292 	return (error);
293 }
294 
/*
 * Initiate a connection to `nam'.  Listening sockets cannot connect.
 */
int
soconnect(so, nam, p)
	register struct socket *so;
	struct sockaddr *nam;
	struct proc *p;
{
	int s;
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	/*
	 * Note the embedded assignment: for connectionless protocols
	 * the sodisconnect() attempt runs inside the condition, and a
	 * failure there also yields EISCONN.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
	splx(s);
	return (error);
}
322 
323 int
324 soconnect2(so1, so2)
325 	register struct socket *so1;
326 	struct socket *so2;
327 {
328 	int s = splnet();
329 	int error;
330 
331 	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
332 	splx(s);
333 	return (error);
334 }
335 
336 int
337 sodisconnect(so)
338 	register struct socket *so;
339 {
340 	int s = splnet();
341 	int error;
342 
343 	if ((so->so_state & SS_ISCONNECTED) == 0) {
344 		error = ENOTCONN;
345 		goto bad;
346 	}
347 	if (so->so_state & SS_ISDISCONNECTING) {
348 		error = EALREADY;
349 		goto bad;
350 	}
351 	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
352 bad:
353 	splx(s);
354 	return (error);
355 }
356 
357 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
358 /*
359  * Send on a socket.
360  * If send must go all at once and message is larger than
361  * send buffering, then hard error.
362  * Lock against other senders.
363  * If must go all at once and not enough room now, then
364  * inform user that this would block and do nothing.
365  * Otherwise, if nonblocking, send as much as possible.
366  * The data to be sent is described by "uio" if nonzero,
367  * otherwise by the mbuf chain "top" (which must be null
368  * if uio is not).  Data provided in mbuf chain must be small
369  * enough to send all at once.
370  *
371  * Returns nonzero on error, timeout or signal; callers
372  * must check for short counts if EINTR/ERESTART are returned.
373  * Data and control buffers are freed on return.
374  */
int
sosend(so, addr, uio, top, control, flags, p)
	register struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct proc *p;
{
	struct mbuf **mp;
	register struct mbuf *m;
	register long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	/* "atomic": the record must be passed to the protocol whole. */
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (p)
		p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
/* Error exit while at splnet and holding the send-buffer lock. */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				   ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		/* OOB data gets a little extra headroom in the buffer. */
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		/*
		 * Not enough room: either fail (non-blocking) or drop
		 * the buffer lock and sleep until space appears, then
		 * re-run all the checks from the top.
		 */
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
			/*
			 * Copy user data into a fresh mbuf chain; the
			 * first mbuf of a record is a packet header.
			 */
			if (top == 0) {
				MGETHDR(m, M_WAIT, MT_DATA);
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_WAIT, MT_DATA);
				mlen = MLEN;
			}
			if (resid >= MINCLSIZE) {
				MCLGET(m, M_WAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
				len = min(min(mlen, resid), space);
			} else {
nopages:
				len = min(min(mlen, resid), space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			space -= len;
			error = uiomove(mtod(m, caddr_t), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    s = splnet();				/* XXX */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF : 0,
			top, addr, control, p);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    /* Ownership of top/control passed to the protocol. */
		    clen = 0;
		    control = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	/* Free whatever the protocol did not consume. */
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
554 
555 /*
556  * Implement receive operations on a socket.
557  * We depend on the way that records are added to the sockbuf
558  * by sbappend*.  In particular, each record (mbufs linked through m_next)
559  * must begin with an address if the protocol so specifies,
560  * followed by an optional mbuf or mbufs containing ancillary data,
561  * and then zero or more mbufs of data.
562  * In order to avoid blocking network interrupts for the entire time here,
563  * we splx() while doing the actual copy to user space.
564  * Although the sockbuf is locked, new data may still be appended,
565  * and thus we must maintain consistency of the sockbuf during that time.
566  *
567  * The caller may receive the data as a single mbuf chain by supplying
568  * an mbuf **mp0 for use in returning the chain.  The uio is then used
569  * only for the count in uio_resid.
570  */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	register struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	register struct mbuf *m, **mp;
	register int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	/* Used to detect "nothing consumed" and retry (see bottom). */
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa)
		*psa = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	/*
	 * Out-of-band data is fetched directly from the protocol into
	 * a scratch mbuf and copied out; it never touches so_rcv.
	 */
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

restart:
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);
	s = splnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		/* OOB data or a record end makes what we have usable now. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		/* Drop the buffer lock, sleep for data, and start over. */
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
	nextrecord = m->m_nextpkt;
	/*
	 * A record from an address-bearing protocol starts with an
	 * MT_SONAME mbuf; hand the address to the caller (copy on
	 * peek, consume otherwise).
	 */
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (psa)
			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
					    mp0 == 0);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
		}
	}
	/*
	 * Next come any ancillary-data (MT_CONTROL) mbufs; SCM_RIGHTS
	 * messages are externalized (fd passing) before being handed
	 * to the caller.
	 */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
				   error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	moff = 0;
	offset = 0;
	/* Main copy loop: moff tracks our position within mbuf `m'. */
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		/* Never read across the out-of-band mark. */
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splnet();
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* Consumed the whole mbuf; advance or unlink it. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			/* Partial mbuf: trim the consumed prefix in place. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			error = sbwait(&so->so_rcv);
			if (error) {
				/*
				 * Deliberate: per the comment above, a
				 * short count is reported, not the error.
				 */
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			m = so->so_rcv.sb_mb;
			if (m)
				nextrecord = m->m_nextpkt;
		}
	}

	/* Atomic protocols: any unread remainder of the record is dropped. */
	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}
	/*
	 * If nothing was transferred and there is still a chance of
	 * data arriving, go back and wait again rather than return
	 * an empty result.
	 */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
863 
/*
 * Shut down one or both halves of a connection.  `how' is 0 (no more
 * receives), 1 (no more sends) or 2 (both); incrementing it converts
 * it into an FREAD/FWRITE bitmask (0->FREAD, 1->FWRITE, 2->both).
 */
int
soshutdown(so, how)
	register struct socket *so;
	register int how;
{
	register struct protosw *pr = so->so_proto;

	how++;
	if (how & FREAD)
		sorflush(so);
	if (how & FWRITE)
		return ((*pr->pr_usrreqs->pru_shutdown)(so));
	return (0);
}
878 
/*
 * Flush the receive buffer: mark the socket unable to receive more,
 * snapshot the sockbuf, zero the live copy (so new arrivals start
 * from a clean buffer), and only then dispose of the snapshot's
 * contents — including any in-flight passed file descriptors.
 */
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	/* Make the lock acquisition uninterruptible by signals. */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	/* Snapshot, then clear, the sockbuf while at splimp. */
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
900 
/*
 * Set a socket option.  Options at a level other than SOL_SOCKET are
 * passed straight to the protocol's ctloutput routine.  The option
 * value arrives in mbuf `m0'; it is freed here unless a protocol
 * ctloutput call takes ownership of it.
 */
int
sosetopt(so, level, optname, m0, p)
	register struct socket *so;
	int level, optname;
	struct mbuf *m0;
	struct proc *p;
{
	int error = 0;
	register struct mbuf *m = m0;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0, p));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */

		/* Boolean options: set or clear the bit in so_options. */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			int optval;

			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			optval = *mtod(m, int *);
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) optval) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof (*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			/* Reject timeouts that would overflow the short. */
			if (tv->tv_sec > SHRT_MAX / hz - hz) {
				error = EDOM;
				goto bad;
			}
			/* Convert to clock ticks for sb_timeo. */
			val = tv->tv_sec * hz + tv->tv_usec / tick;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Give the protocol a chance to see the option too. */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0, p));
			m = NULL;	/* freed by protocol */
		}
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}
1042 
/*
 * Get a socket option.  Non-SOL_SOCKET levels are delegated to the
 * protocol's ctloutput routine.  On success an mbuf holding the
 * option value is allocated and returned through *mp; the caller
 * owns (and must free) it.
 */
int
sogetopt(so, level, optname, mp, p)
	register struct socket *so;
	int level, optname;
	struct mbuf **mp;
	struct proc *p;
{
	register struct mbuf *m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_GETOPT, so, level, optname, mp, p));
		} else
			return (ENOPROTOOPT);
	} else {
		m = m_get(M_WAIT, MT_SOOPTS);
		/* Most options are a single int; overridden below. */
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		/* Boolean options: report the bit from so_options. */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the pending error clears it. */
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			/* Convert sb_timeo back from ticks to a timeval. */
			int val = (optname == SO_SNDTIMEO ?
			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}
1129 
/*
 * Notify interested parties that out-of-band data has arrived:
 * deliver SIGURG to the owning process group (negative so_pgid) or
 * process (positive so_pgid), and wake any select()ers on the
 * receive buffer.
 */
void
sohasoutofband(so)
	register struct socket *so;
{
	struct proc *p;

	if (so->so_pgid < 0)
		gsignal(-so->so_pgid, SIGURG);
	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
		psignal(p, SIGURG);
	selwakeup(&so->so_rcv.sb_sel);
}
1142 
1143 int
1144 sopoll(struct socket *so, int events, struct ucred *cred, struct proc *p)
1145 {
1146 	int revents = 0;
1147 	int s = splnet();
1148 
1149 	if (events & (POLLIN | POLLRDNORM))
1150 		if (soreadable(so))
1151 			revents |= events & (POLLIN | POLLRDNORM);
1152 
1153 	if (events & (POLLOUT | POLLWRNORM))
1154 		if (sowriteable(so))
1155 			revents |= events & (POLLOUT | POLLWRNORM);
1156 
1157 	if (events & (POLLPRI | POLLRDBAND))
1158 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
1159 			revents |= events & (POLLPRI | POLLRDBAND);
1160 
1161 	if (revents == 0) {
1162 		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
1163 			selrecord(p, &so->so_rcv.sb_sel);
1164 			so->so_rcv.sb_flags |= SB_SEL;
1165 		}
1166 
1167 		if (events & (POLLOUT | POLLWRNORM)) {
1168 			selrecord(p, &so->so_snd.sb_sel);
1169 			so->so_snd.sb_flags |= SB_SEL;
1170 		}
1171 	}
1172 
1173 	splx(s);
1174 	return (revents);
1175 }
1176